1 ## Copyright (C) 2005, 2006 William Poetra Yoga Hadisoeseno
2 ## Copyright (C) 2011 Nir Krakauer
4 ## This program is free software; you can redistribute it and/or modify it under
5 ## the terms of the GNU General Public License as published by the Free Software
6 ## Foundation; either version 3 of the License, or (at your option) any later
9 ## This program is distributed in the hope that it will be useful, but WITHOUT
10 ## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 ## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14 ## You should have received a copy of the GNU General Public License along with
15 ## this program; if not, see <http://www.gnu.org/licenses/>.
18 ## @deftypefn {Function File} {[@var{b}, @var{bint}, @var{r}, @var{rint}, @var{stats}] =} regress (@var{y}, @var{X}, [@var{alpha}])
19 ## Multiple Linear Regression using Least Squares Fit of @var{y} on @var{X}
20 ## with the model @code{y = X * beta + e}.
26 ## @code{y} is a column vector of observed values
28 ## @code{X} is a matrix of regressors, with the first column filled with
29 ## the constant value 1
31 ## @code{beta} is a column vector of regression parameters
33 ## @code{e} is a column vector of random errors
40 ## @var{y} is the @code{y} in the model
42 ## @var{X} is the @code{X} in the model
44 ## @var{alpha} is the significance level used to calculate the confidence
45 ## intervals @var{bint} and @var{rint} (see `Return values' below). If not
46 ## specified, ALPHA defaults to 0.05
53 ## @var{b} is the @code{beta} in the model
55 ## @var{bint} is the confidence interval for @var{b}
57 ## @var{r} is a column vector of residuals
59 ## @var{rint} is the confidence interval for @var{r}
61 ## @var{stats} is a row vector containing:
64 ## @item The R^2 statistic
65 ## @item The F statistic
66 ## @item The p value for the full model
67 ## @item The estimated error variance
71 ## @var{r} and @var{rint} can be passed to @code{rcoplot} to visualize
72 ## the residual intervals and identify outliers.
74 ## NaN values in @var{y} and @var{X} are removed before calculation begins.
79 ## - Matlab 7.0 documentation (pdf)
80 ## - 《大学数学实验》 姜启源 等 ("Mathematical Experiments", Jiang Qiyuan et al., textbook)
81 ## - http://www.netnam.vn/unescocourse/statistics/12_5.htm
82 ## - wsolve.m in octave-forge
83 ## - http://www.stanford.edu/class/ee263/ls_ln_matlab.pdf
85 function [b, bint, r, rint, stats] = regress (y, X, alpha)
## NOTE(review): this SOURCE is a sampled extract of the original file --
## the embedded line numbers jump (85, 87, 92, ...), so the argument
## checks' "if" conditions, print_usage, and the lines defining b, v,
## dof, dof1, SSE, Xr, pinv_X and p are not visible here.  Code is left
## byte-identical; only comments are added.
87 if (nargin < 2 || nargin > 3)
## Input validation.  The conditions guarding the next three errors are
## on missing lines; only the error() calls survived the extraction.
92 error ("regress: y must be a numeric matrix");
95 error ("regress: X must be a numeric matrix");
99 error ("regress: y must be a column vector");
102 if (rows (y) != rows (X))
103 error ("regress: y and X must contain the same number of rows");
108 elseif (! isscalar (alpha))
109 error ("regress: alpha must be a scalar value")
## Row mask selecting observations with no NaN in either y or X; per the
## docstring, NaN rows are removed before the fit.
112 notnans = ! logical (sum (isnan ([y X]), 2));
## Two-sided t critical value at significance alpha with dof residual
## degrees of freedom (dof is defined on a missing line).  Since
## alpha/2 < 0.5 this is negative, which explains the [x + d, x - d]
## interval ordering used below.
126 t_alpha_2 = tinv (alpha / 2, dof);
128 r = y - X * b; # added -- Nir
132 # c = diag(inv (X' * X)) using (economy) QR decomposition
133 # which means that we only have to use Xr
134 c = diag (inv (Xr' * Xr));
## Half-width of each coefficient's confidence interval:
## t * sqrt(error variance v * diag((X'X)^-1)).
136 db = t_alpha_2 * sqrt (v * c);
138 bint = [b + db, b - db];
## Leverage values h = diag(X * pinv(X)) computed row-wise without
## forming the full hat matrix (see inline note).
145 h = sum(X.*pinv_X', 2); #added -- Nir (same as diag(X*pinv_X), without doing the matrix multiply)
147 # From Matlab's documentation on Multiple Linear Regression,
148 # sigmaihat2 = norm (r) ^ 2 / dof1 - r .^ 2 / (dof1 * (1 - h));
149 # dr = -tinv (1 - alpha / 2, dof) * sqrt (sigmaihat2 .* (1 - h));
151 # norm (r) ^ 2 == sum (r .^ 2) == SSE
152 # -tinv (1 - alpha / 2, dof) == tinv (alpha / 2, dof) == t_alpha_2
154 # sigmaihat2 = (SSE - r .^ 2 / (1 - h)) / dof1;
155 # dr = t_alpha_2 * sqrt (sigmaihat2 .* (1 - h));
157 # dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1);
## Residual confidence-interval half-widths, per the derivation in the
## comments above (SSE and dof1 are defined on missing lines).
159 dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1);
161 rint = [r + dr, r - dr];
## Summary statistics: R^2, the F statistic (algebraically equal to the
## commented textbook form on the next line), its p-value under
## F(p - 1, dof), and the estimated error variance v.
167 R2 = 1 - SSE / sum ((y - mean (y)) .^ 2);
168 # F = (R2 / (p - 1)) / ((1 - R2) / dof);
169 F = dof / (p - 1) / (1 / R2 - 1);
170 pval = 1 - fcdf (F, p - 1, dof);
172 stats = [R2 F pval v];
## Self-test fixture: Longley macroeconomic data from the NIST
## Statistical Reference Datasets (StRD), whose regression results were
## certified by NIST using 500-digit arithmetic.
## NOTE(review): this extract is missing interior lines -- the embedded
## numbering jumps 172 -> 180 and 208 -> 210, so the "%!test" opener and
## the line assigning `alpha` (used in the regress call below) are not
## visible here.  Lines are kept byte-identical.
180 %! % Longley data from the NIST Statistical Reference Dataset
181 %! Z = [ 60323 83.0 234289 2356 1590 107608 1947
182 %! 61122 88.5 259426 2325 1456 108632 1948
183 %! 60171 88.2 258054 3682 1616 109773 1949
184 %! 61187 89.5 284599 3351 1650 110929 1950
185 %! 63221 96.2 328975 2099 3099 112075 1951
186 %! 63639 98.1 346999 1932 3594 113270 1952
187 %! 64989 99.0 365385 1870 3547 115094 1953
188 %! 63761 100.0 363112 3578 3350 116219 1954
189 %! 66019 101.2 397469 2904 3048 117388 1955
190 %! 67857 104.6 419180 2822 2857 118734 1956
191 %! 68169 108.4 442769 2936 2798 120445 1957
192 %! 66513 110.8 444546 4681 2637 121950 1958
193 %! 68655 112.6 482704 3813 2552 123366 1959
194 %! 69564 114.2 502601 3931 2514 125368 1960
195 %! 69331 115.7 518173 4806 2572 127852 1961
196 %! 70551 116.9 554894 4007 2827 130081 1962 ];
197 %! % Results certified by NIST using 500 digit arithmetic
198 %! % b and standard error in b
199 %! V = [ -3482258.63459582 890420.383607373
200 %! 15.0618722713733 84.9149257747669
201 %! -0.358191792925910E-01 0.334910077722432E-01
202 %! -2.02022980381683 0.488399681651699
203 %! -1.03322686717359 0.214274163161675
204 %! -0.511041056535807E-01 0.226073200069370
205 %! 1829.15146461355 455.478499142212 ];
206 %! Rsq = 0.995479004577296;
207 %! F = 330.285339234588;
## Response y is column 1; X is the six predictors with an intercept
## column of ones prepended, matching the model in the docstring.
208 %! y = Z(:,1); X = [ones(rows(Z),1), Z(:,2:end)];
## Fit and compare against the certified values; the last assert checks
## the half-width of bint scaled back to a standard error (dof = 9).
210 %! [b, bint, r, rint, stats] = regress (y, X, alpha);
211 %! assert(b,V(:,1),3e-6);
212 %! assert(stats(1),Rsq,1e-12);
213 %! assert(stats(2),F,3e-8);
214 %! assert(((bint(:,1)-bint(:,2))/2)/tinv(alpha/2,9),V(:,2),-1.e-5);