X-Git-Url: https://git.creatis.insa-lyon.fr/pubgit/?p=CreaPhase.git;a=blobdiff_plain;f=octave_packages%2Fstatistics-1.1.3%2Fregress.m;fp=octave_packages%2Fstatistics-1.1.3%2Fregress.m;h=bff793e650f561a9307268f5ec3412cc01e6515b;hp=0000000000000000000000000000000000000000;hb=c880e8788dfc484bf23ce13fa2787f2c6bca4863;hpb=1705066eceaaea976f010f669ce8e972f3734b05 diff --git a/octave_packages/statistics-1.1.3/regress.m b/octave_packages/statistics-1.1.3/regress.m new file mode 100644 index 0000000..bff793e --- /dev/null +++ b/octave_packages/statistics-1.1.3/regress.m @@ -0,0 +1,214 @@ +## Copyright (C) 2005, 2006 William Poetra Yoga Hadisoeseno +## Copyright (C) 2011 Nir Krakauer +## +## This program is free software; you can redistribute it and/or modify it under +## the terms of the GNU General Public License as published by the Free Software +## Foundation; either version 3 of the License, or (at your option) any later +## version. +## +## This program is distributed in the hope that it will be useful, but WITHOUT +## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +## details. +## +## You should have received a copy of the GNU General Public License along with +## this program; if not, see . + +## -*- texinfo -*- +## @deftypefn {Function File} {[@var{b}, @var{bint}, @var{r}, @var{rint}, @var{stats}] =} regress (@var{y}, @var{X}, [@var{alpha}]) +## Multiple Linear Regression using Least Squares Fit of @var{y} on @var{X} +## with the model @code{y = X * beta + e}. +## +## Here, +## +## @itemize +## @item +## @code{y} is a column vector of observed values +## @item +## @code{X} is a matrix of regressors, with the first column filled with +## the constant value 1 +## @item +## @code{beta} is a column vector of regression parameters +## @item +## @code{e} is a column vector of random errors +## @end itemize +## +## Arguments are +## +## @itemize +## @item +## @var{y} is the @code{y} in the model +## @item +## @var{X} is the @code{X} in the model +## @item +## @var{alpha} is the significance level used to calculate the confidence +## intervals @var{bint} and @var{rint} (see `Return values' below). If not +## specified, ALPHA defaults to 0.05 +## @end itemize +## +## Return values are +## +## @itemize +## @item +## @var{b} is the @code{beta} in the model +## @item +## @var{bint} is the confidence interval for @var{b} +## @item +## @var{r} is a column vector of residuals +## @item +## @var{rint} is the confidence interval for @var{r} +## @item +## @var{stats} is a row vector containing: +## +## @itemize +## @item The R^2 statistic +## @item The F statistic +## @item The p value for the full model +## @item The estimated error variance +## @end itemize +## @end itemize +## +## @var{r} and @var{rint} can be passed to @code{rcoplot} to visualize +## the residual intervals and identify outliers. +## +## NaN values in @var{y} and @var{X} are removed before calculation begins. +## +## @end deftypefn + +## References: +## - Matlab 7.0 documentation (pdf) +## - ¡¶´óѧÊýѧʵÑé¡· ½ªÆôÔ´ µÈ (textbook) +## - http://www.netnam.vn/unescocourse/statistics/12_5.htm +## - wsolve.m in octave-forge +## - http://www.stanford.edu/class/ee263/ls_ln_matlab.pdf + +function [b, bint, r, rint, stats] = regress (y, X, alpha) + + if (nargin < 2 || nargin > 3) + print_usage; + endif + + if (! ismatrix (y)) + error ("regress: y must be a numeric matrix"); + endif + if (! ismatrix (X)) + error ("regress: X must be a numeric matrix"); + endif + + if (columns (y) != 1) + error ("regress: y must be a column vector"); + endif + + if (rows (y) != rows (X)) + error ("regress: y and X must contain the same number of rows"); + endif + + if (nargin < 3) + alpha = 0.05; + elseif (! isscalar (alpha)) + error ("regress: alpha must be a scalar value") + endif + + notnans = ! logical (sum (isnan ([y X]), 2)); + y = y(notnans); + X = X(notnans,:); + + [Xq Xr] = qr (X, 0); + pinv_X = Xr \ Xq'; + + b = pinv_X * y; + + if (nargout > 1) + + n = rows (X); + p = columns (X); + dof = n - p; + t_alpha_2 = tinv (alpha / 2, dof); + + r = y - X * b; # added -- Nir + SSE = sum (r .^ 2); + v = SSE / dof; + + # c = diag(inv (X' * X)) using (economy) QR decomposition + # which means that we only have to use Xr + c = diag (inv (Xr' * Xr)); + + db = t_alpha_2 * sqrt (v * c); + + bint = [b + db, b - db]; + + endif + + if (nargout > 3) + + dof1 = n - p - 1; + h = sum(X.*pinv_X', 2); #added -- Nir (same as diag(X*pinv_X), without doing the matrix multiply) + + # From Matlab's documentation on Multiple Linear Regression, + # sigmaihat2 = norm (r) ^ 2 / dof1 - r .^ 2 / (dof1 * (1 - h)); + # dr = -tinv (1 - alpha / 2, dof) * sqrt (sigmaihat2 .* (1 - h)); + # Substitute + # norm (r) ^ 2 == sum (r .^ 2) == SSE + # -tinv (1 - alpha / 2, dof) == tinv (alpha / 2, dof) == t_alpha_2 + # We get + # sigmaihat2 = (SSE - r .^ 2 / (1 - h)) / dof1; + # dr = t_alpha_2 * sqrt (sigmaihat2 .* (1 - h)); + # Combine, we get + # dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1); + + dr = t_alpha_2 * sqrt ((SSE * (1 - h) - (r .^ 2)) / dof1); + + rint = [r + dr, r - dr]; + + endif + + if (nargout > 4) + + R2 = 1 - SSE / sum ((y - mean (y)) .^ 2); +# F = (R2 / (p - 1)) / ((1 - R2) / dof); + F = dof / (p - 1) / (1 / R2 - 1); + pval = 1 - fcdf (F, p - 1, dof); + + stats = [R2 F pval v]; + + endif + +endfunction + + +%!test +%! % Longley data from the NIST Statistical Reference Dataset +%! Z = [ 60323 83.0 234289 2356 1590 107608 1947 +%! 61122 88.5 259426 2325 1456 108632 1948 +%! 60171 88.2 258054 3682 1616 109773 1949 +%! 61187 89.5 284599 3351 1650 110929 1950 +%! 63221 96.2 328975 2099 3099 112075 1951 +%! 63639 98.1 346999 1932 3594 113270 1952 +%! 64989 99.0 365385 1870 3547 115094 1953 +%! 63761 100.0 363112 3578 3350 116219 1954 +%! 66019 101.2 397469 2904 3048 117388 1955 +%! 67857 104.6 419180 2822 2857 118734 1956 +%! 68169 108.4 442769 2936 2798 120445 1957 +%! 66513 110.8 444546 4681 2637 121950 1958 +%! 68655 112.6 482704 3813 2552 123366 1959 +%! 69564 114.2 502601 3931 2514 125368 1960 +%! 69331 115.7 518173 4806 2572 127852 1961 +%! 70551 116.9 554894 4007 2827 130081 1962 ]; +%! % Results certified by NIST using 500 digit arithmetic +%! % b and standard error in b +%! V = [ -3482258.63459582 890420.383607373 +%! 15.0618722713733 84.9149257747669 +%! -0.358191792925910E-01 0.334910077722432E-01 +%! -2.02022980381683 0.488399681651699 +%! -1.03322686717359 0.214274163161675 +%! -0.511041056535807E-01 0.226073200069370 +%! 1829.15146461355 455.478499142212 ]; +%! Rsq = 0.995479004577296; +%! F = 330.285339234588; +%! y = Z(:,1); X = [ones(rows(Z),1), Z(:,2:end)]; +%! alpha = 0.05; +%! [b, bint, r, rint, stats] = regress (y, X, alpha); +%! assert(b,V(:,1),3e-6); +%! assert(stats(1),Rsq,1e-12); +%! assert(stats(2),F,3e-8); +%! assert(((bint(:,1)-bint(:,2))/2)/tinv(alpha/2,9),V(:,2),-1.e-5);