octave_packages/statistics-1.1.3/regress.m

   1 ## Copyright (C) 2005, 2006 William Poetra Yoga Hadisoeseno
   2 ## Copyright (C) 2011 Nir Krakauer
   3 ##
   4 ## This program is free software; you can redistribute it and/or modify it under
   5 ## the terms of the GNU General Public License as published by the Free Software
   6 ## Foundation; either version 3 of the License, or (at your option) any later
   7 ## version.
   8 ##
   9 ## This program is distributed in the hope that it will be useful, but WITHOUT
  10 ## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 ## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  12 ## details.
  13 ##
  14 ## You should have received a copy of the GNU General Public License along with
  15 ## this program; if not, see <http://www.gnu.org/licenses/>.
  16
  17 ## -*- texinfo -*-
  18 ## @deftypefn {Function File} {[@var{b}, @var{bint}, @var{r}, @var{rint}, @var{stats}] =} regress (@var{y}, @var{X}, [@var{alpha}])
  19 ## Multiple Linear Regression using Least Squares Fit of @var{y} on @var{X}
  20 ## with the model @code{y = X * beta + e}.
  21 ##
  22 ## Here,
  23 ##
  24 ## @itemize
  25 ## @item
  26 ## @code{y} is a column vector of observed values
  27 ## @item
  28 ## @code{X} is a matrix of regressors, with the first column filled with
  29 ## the constant value 1
  30 ## @item
  31 ## @code{beta} is a column vector of regression parameters
  32 ## @item
  33 ## @code{e} is a column vector of random errors
  34 ## @end itemize
  35 ##
  36 ## Arguments are
  37 ##
  38 ## @itemize
  39 ## @item
  40 ## @var{y} is the @code{y} in the model
  41 ## @item
  42 ## @var{X} is the @code{X} in the model
  43 ## @item
  44 ## @var{alpha} is the significance level used to calculate the confidence
  45 ## intervals @var{bint} and @var{rint} (see `Return values' below). If not
  46 ## specified, ALPHA defaults to 0.05
  47 ## @end itemize
  48 ##
  49 ## Return values are
  50 ##
  51 ## @itemize
  52 ## @item
  53 ## @var{b} is the @code{beta} in the model
  54 ## @item
  55 ## @var{bint} is the confidence interval for @var{b}
  56 ## @item
  57 ## @var{r} is a column vector of residuals
  58 ## @item
  59 ## @var{rint} is the confidence interval for @var{r}
  60 ## @item
  61 ## @var{stats} is a row vector containing:
  62 ##
  63 ##   @itemize
  64 ##   @item The R^2 statistic
  65 ##   @item The F statistic
  66 ##   @item The p value for the full model
  67 ##   @item The estimated error variance
  68 ##   @end itemize
  69 ## @end itemize
  70 ##
  71 ## @var{r} and @var{rint} can be passed to @code{rcoplot} to visualize
  72 ## the residual intervals and identify outliers.
  73 ##
  74 ## NaN values in @var{y} and @var{X} are removed before calculation begins.
  75 ##
  76 ## @end deftypefn
  77
  78 ## References:
  79 ## - Matlab 7.0 documentation (pdf)
  80 ## - ¡¶´óÑ§ÊýÑ§ÊµÑé¡· ½ªÆôÔ´ µÈ (textbook)
  81 ## - http://www.netnam.vn/unescocourse/statistics/12_5.htm
  82 ## - wsolve.m in octave-forge
  83 ## - http://www.stanford.edu/class/ee263/ls_ln_matlab.pdf
  84
  85 function [b, bint, r, rint, stats] = regress (y, X, alpha)
  86
  87   if (nargin < 2 || nargin > 3)
  88     print_usage;
  89   endif
  90
  91   if (! ismatrix (y))
  92     error ("regress: y must be a numeric matrix");
  93   endif
  94   if (! ismatrix (X))
  95     error ("regress: X must be a numeric matrix");
  96   endif
  97
  98   if (columns (y) != 1)
  99     error ("regress: y must be a column vector");
 100   endif
 101
 102   if (rows (y) != rows (X))
 103     error ("regress: y and X must contain the same number of rows");
 104   endif
 105
 106   if (nargin < 3)
 107     alpha = 0.05;
 108   elseif (! isscalar (alpha))
 109     error ("regress: alpha must be a scalar value")
 110   endif
 111
 112   notnans = ! logical (sum (isnan ([y X]), 2));
 113   y = y(notnans);
 114   X = X(notnans,:);
 115
 116   [Xq Xr] = qr (X, 0);
 117   pinv_X = Xr \ Xq';
 118
 119   b = pinv_X * y;
 120
 121   if (nargout > 1)
 122
 123     n = rows (X);
 124     p = columns (X);
 125     dof = n - p;
 126     t_alpha_2 = tinv (alpha / 2, dof);
 127
 128     r = y - X * b; # added -- Nir
 129     SSE = sum (r .^ 2);
 130     v = SSE / dof;
 131
 132     # c = diag(inv (X' * X)) using (economy) QR decomposition
 133     # which means that we only have to use Xr
 134     c = diag (inv (Xr' * Xr));
 135
 136     db = t_alpha_2 * sqrt (v * c);
 137
 138     bint = [b + db, b - db];
 139
 140   endif
 141
 142   if (nargout > 3)
 143
 144     dof1 = n - p - 1;
 145     h = sum(X.*pinv_X', 2); #added -- Nir (same as diag(X*pinv_X), without doing the matrix multiply)
 146
 147     # From Matlab's documentation on Multiple Linear Regression,
 148     #   sigmaihat2 = norm (r) ^ 2 / dof1 - r .^ 2 / (dof1 * (1 - h));
 149     #   dr = -tinv (1 - alpha / 2, dof) * sqrt (sigmaihat2 .* (1 - h));
 150     # Substitute
 151     #   norm (r) ^ 2 == sum (r .^ 2) == SSE
 152     #   -tinv (1 - alpha / 2, dof) == tinv (alpha / 2, dof) == t_alpha_2
 153     # We get
 154     #   sigmaihat2 = (SSE - r .^ 2 / (1 - h)) / dof1;
 155     #   dr = t_alpha_2 * sqrt (sigmaihat2 .* (1 - h));
 156     # Combine, we get
 157     #   dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1);
 158
 159     dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1);
 160
 161     rint = [r + dr, r - dr];
 162
 163   endif
 164
 165   if (nargout > 4)
 166
 167     R2 = 1 - SSE / sum ((y - mean (y)) .^ 2);
 168 #    F = (R2 / (p - 1)) / ((1 - R2) / dof);
 169     F = dof / (p - 1) / (1 / R2 - 1);
 170     pval = 1 - fcdf (F, p - 1, dof);
 171
 172     stats = [R2 F pval v];
 173
 174   endif
 175
 176 endfunction
 177
 178
 179 %!test
 180 %! % Longley data from the NIST Statistical Reference Dataset
 181 %! Z = [  60323    83.0   234289   2356     1590    107608  1947
 182 %!        61122    88.5   259426   2325     1456    108632  1948
 183 %!        60171    88.2   258054   3682     1616    109773  1949
 184 %!        61187    89.5   284599   3351     1650    110929  1950
 185 %!        63221    96.2   328975   2099     3099    112075  1951
 186 %!        63639    98.1   346999   1932     3594    113270  1952
 187 %!        64989    99.0   365385   1870     3547    115094  1953
 188 %!        63761   100.0   363112   3578     3350    116219  1954
 189 %!        66019   101.2   397469   2904     3048    117388  1955
 190 %!        67857   104.6   419180   2822     2857    118734  1956
 191 %!        68169   108.4   442769   2936     2798    120445  1957
 192 %!        66513   110.8   444546   4681     2637    121950  1958
 193 %!        68655   112.6   482704   3813     2552    123366  1959
 194 %!        69564   114.2   502601   3931     2514    125368  1960
 195 %!        69331   115.7   518173   4806     2572    127852  1961
 196 %!        70551   116.9   554894   4007     2827    130081  1962 ];
 197 %! % Results certified by NIST using 500 digit arithmetic
 198 %! % b and standard error in b
 199 %! V = [  -3482258.63459582         890420.383607373
 200 %!         15.0618722713733         84.9149257747669
 201 %!        -0.358191792925910E-01    0.334910077722432E-01
 202 %!        -2.02022980381683         0.488399681651699
 203 %!        -1.03322686717359         0.214274163161675
 204 %!        -0.511041056535807E-01    0.226073200069370
 205 %!         1829.15146461355         455.478499142212 ];
 206 %! Rsq = 0.995479004577296;
 207 %! F = 330.285339234588;
 208 %! y = Z(:,1); X = [ones(rows(Z),1), Z(:,2:end)];
 209 %! alpha = 0.05;
 210 %! [b, bint, r, rint, stats] = regress (y, X, alpha);
 211 %! assert(b,V(:,1),3e-6);
 212 %! assert(stats(1),Rsq,1e-12);
 213 %! assert(stats(2),F,3e-8);
 214 %! assert(((bint(:,1)-bint(:,2))/2)/tinv(alpha/2,9),V(:,2),-1.e-5);