octave_packages/nan-2.5.5/corrcoef.m

   1 function [R,sig,ci1,ci2,nan_sig] = corrcoef(X,Y,varargin)
   2 % CORRCOEF calculates the correlation matrix from pairwise correlations.
   3 %   The input data can contain missing values encoded with NaN.
   4 %   Missing data (NaN's) are handled by pairwise deletion [15].
   5 %   In order to avoid possible pitfalls, use case-wise deletion or
   6 %   or check the correlation of NaN's with your data (see below).
   7 %   A significance test for testing the Hypothesis
   8 %   'correlation coefficient R is significantly different to zero'
   9 %   is included.
  10 %
  11 % [...] = CORRCOEF(X);
  12 %      calculates the (auto-)correlation matrix of X
  13 % [...] = CORRCOEF(X,Y);
  14 %      calculates the crosscorrelation between X and Y
  15 %
  16 % [...] = CORRCOEF(..., Mode);
  17 %       Mode='Pearson' or 'parametric' [default]
  18 %               gives the correlation coefficient
  19 %               also known as the 'product-moment coefficient of correlation'
  20 %               or 'Pearson''s correlation' [1]
  21 %       Mode='Spearman'         gives 'Spearman''s Rank Correlation Coefficient'
  22 %               This replaces SPEARMAN.M
  23 %       Mode='Rank'             gives a nonparametric Rank Correlation Coefficient
  24 %               This is the "Spearman rank correlation with proper handling of ties"
  25 %               This replaces RANKCORR.M
  26 %
  27 % [...] = CORRCOEF(..., param1, value1, param2, value2, ... );
  28 %       param           value
  29 %       'Mode'          type of correlation
  30 %               'Pearson','parametric'
  31 %               'Spearman'
  32 %               'rank'
  33 %       'rows'          how do deal with missing values encoded as NaN's.
  34 %               'complete': remove all rows with at least one NaN
  35 %               'pairwise': [default]
  36 %       'alpha'         0.01    : significance level to compute confidence interval
  37 %
  38 % [R,p,ci1,ci2,nansig] = CORRCOEF(...);
  39 %       R is the correlation matrix
  40 %       R(i,j) is the correlation coefficient r between X(:,i) and Y(:,j)
  41 %  p    gives the significance of R
  42 %       It tests the null hypothesis that the product moment correlation coefficient is zero
  43 %       using Student's t-test on the statistic t = r*sqrt(N-2)/sqrt(1-r^2)
  44 %       where N is the number of samples (Statistics, M. Spiegel, Schaum series).
  45 %  p > alpha: do not reject the Null hypothesis: 'R is zero'.
  46 %  p < alpha: The alternative hypothesis 'R is larger than zero' is true with probability (1-alpha).
  47 %  ci1  lower (1-alpha) confidence interval
  48 %  ci2  upper (1-alpha) confidence interval
  49 %       If no alpha is provided, the default alpha is 0.01. This can be changed with function flag_implicit_significance.
  50 %  nan_sig      p-value whether H0: 'NaN''s are not correlated' could be correct
  51 %       if nan_sig < alpha, H1 ('NaNs are correlated') is very likely.
  52 %
  53 % The result is only valid if the occurence of NaN's is uncorrelated. In
  54 % order to avoid this pitfall, the correlation of NaN's should be checked
  55 % or case-wise deletion should be applied.
  56 %   Case-Wise deletion can be implemented
  57 %    ix = ~any(isnan([X,Y]),2);
  58 %    [...] = CORRCOEF(X(ix,:),Y(ix,:),...);
  59 %
  60 %  Correlation (non-random distribution) of NaN's can be checked with
  61 %       [nan_R,nan_sig]=corrcoef(X,isnan(X))
  62 %   or  [nan_R,nan_sig]=corrcoef([X,Y],isnan([X,Y]))
  63 %   or  [R,p,ci1,ci2] = CORRCOEF(...);
  64 %
  65 % Further recommandation related to the correlation coefficient:
  66 % + LOOK AT THE SCATTERPLOTS to make sure that the relationship is linear
  67 % + Correlation is not causation because
  68 %       it is not clear which parameter is 'cause' and which is 'effect' and
  69 %       the observed correlation between two variables might be due to the action of other, unobserved variables.
  70 %
  71 % see also: SUMSKIPNAN, COVM, COV, COR, SPEARMAN, RANKCORR, RANKS,
  72 %       PARTCORRCOEF, flag_implicit_significance
  73 %
  74 % REFERENCES:
  75 % on the correlation coefficient
  76 % [ 1] http://mathworld.wolfram.com/CorrelationCoefficient.html
  77 % [ 2] http://www.geography.btinternet.co.uk/spearman.htm
  78 % [ 3] Hogg, R. V. and Craig, A. T. Introduction to Mathematical Statistics, 5th ed.  New York: Macmillan, pp. 338 and 400, 1995.
  79 % [ 4] Lehmann, E. L. and D'Abrera, H. J. M. Nonparametrics: Statistical Methods Based on Ranks, rev. ed. Englewood Cliffs, NJ: Prentice-Hall, pp. 292, 300, and 323, 1998.
  80 % [ 5] Press, W. H.; Flannery, B. P.; Teukolsky, S. A.; and Vetterling, W. T. Numerical Recipes in FORTRAN: The Art of Scientific Computing, 2nd ed. Cambridge, England: Cambridge University Press, pp. 634-637, 1992
  81 % [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
  82 % on the significance test of the correlation coefficient
  83 % [11] http://www.met.rdg.ac.uk/cag/STATS/corr.html
  84 % [12] http://www.janda.org/c10/Lectures/topic06/L24-significanceR.htm
  85 % [13] http://faculty.vassar.edu/lowry/ch4apx.html
  86 % [14] http://davidmlane.com/hyperstat/B134689.html
  87 % [15] http://www.statsoft.com/textbook/stbasic.html%Correlations
  88 % others
  89 % [20] http://www.tufts.edu/~gdallal/corr.htm
  90 % [21] Fisher transformation http://en.wikipedia.org/wiki/Fisher_transformation
  91
  92 %       $Id: corrcoef.m 9387 2011-12-15 10:42:14Z schloegl $
  93 %       Copyright (C) 2000-2004,2008,2009,2011 by Alois Schloegl <alois.schloegl@gmail.com>
  94 %       This function is part of the NaN-toolbox
  95 %       http://pub.ist.ac.at/~schloegl/matlab/NaN/
  96
  97 %    This program is free software: you can redistribute it and/or modify
  98 %    it under the terms of the GNU General Public License as published by
  99 %    the Free Software Foundation, either version 3 of the License, or
 100 %    (at your option) any later version.
 101 %
 102 %    This program is distributed in the hope that it will be useful,
 103 %    but WITHOUT ANY WARRANTY; without even the implied warranty of
 104 %    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 105 %    GNU General Public License for more details.
 106 %
 107 %    You should have received a copy of the GNU General Public License
 108 %    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 109
 110 % Features:
 111 % + handles missing values (encoded as NaN's)
 112 %       + pairwise deletion of missing data
 113 %       + checks independence of missing values (NaNs)
 114 % + parametric and non-parametric (rank) correlation
 115 %       + Pearson's correlation
 116 %       + Spearman's rank correlation
 117 %       + Rank correlation (non-parametric, Spearman rank correlation with proper handling of ties)
 118 % + is fast, using an efficient algorithm O(n.log(n)) for calculating the ranks
 119 % + significance test for null-hypthesis: r=0
 120 % + confidence interval included
 121 % - rank correlation works for cell arrays, too (no check for missing values).
 122 % + compatible with Octave and Matlab
 123
 124 global FLAG_NANS_OCCURED;
 125
 126 NARG = nargout; % needed because nargout is not reentrant in Octave, and corrcoef is recursive
 127 mode = [];
 128
 129 if nargin==1
 130         Y = [];
 131         Mode='Pearson';
 132 elseif nargin==0
 133         fprintf(2,'Error CORRCOEF: Missing argument(s)\n');
 134 elseif nargin>1
 135         if ischar(Y)
 136                 varg = [Y,varargin];
 137                 Y=[];
 138         else
 139                 varg = varargin;
 140         end;
 141
 142         if length(varg)<1,
 143                 Mode = 'Pearson';
 144         elseif length(varg)==1,
 145                 Mode = varg{1};
 146         else
 147                 for k = 2:2:length(varg),
 148                         mode = setfield(mode,lower(varg{k-1}),varg{k});
 149                 end;
 150                 if isfield(mode,'mode')
 151                         Mode = mode.mode;
 152                 end;
 153         end;
 154 end;
 155 if isempty(Mode) Mode='pearson'; end;
 156 Mode=[Mode,'        '];
 157
 158
 159
 160 FLAG_WARNING = warning;         % save warning status
 161 warning('off');
 162
 163 [r1,c1]=size(X);
 164 if ~isempty(Y)
 165         [r2,c2]=size(Y);
 166         if r1~=r2,
 167                 fprintf(2,'Error CORRCOEF: X and Y must have the same number of observations (rows).\n');
 168                 return;
 169         end;
 170         NN = real(~isnan(X)')*real(~isnan(Y));
 171 else
 172         [r2,c2]=size(X);
 173         NN = real(~isnan(X)')*real(~isnan(X));
 174 end;
 175
 176 %%%%% generate combinations using indices for pairwise calculation of the correlation
 177 YESNAN = any(isnan(X(:))) | any(isnan(Y(:)));
 178 if YESNAN,
 179         FLAG_NANS_OCCURED=(1==1);
 180         if isfield(mode,'rows')
 181                 if strcmp(mode.rows,'complete')
 182                         ix = ~any([X,Y],2);
 183                         X = X(ix,:);
 184                         if ~isempty(Y)
 185                                 Y = Y(ix,:);
 186                         end;
 187                         YESNAN = 0;
 188                         NN = size(X,1);
 189                 elseif strcmp(mode.rows,'all')
 190                         fprintf(1,'Warning: data contains NaNs, rows=pairwise is used.');
 191                         %%NN(NN < size(X,1)) = NaN;
 192                 elseif strcmp(mode.rows,'pairwise')
 193                         %%% default
 194                 end;
 195         end;
 196 end;
 197 if isempty(Y),
 198         IX = ones(c1)-diag(ones(c1,1));
 199         [jx, jy ] = find(IX);
 200         [jxo,jyo] = find(IX);
 201         R = eye(c1);
 202 else
 203         IX = sparse([],[],[],c1+c2,c1+c2,c1*c2);
 204         IX(1:c1,c1+(1:c2)) = 1;
 205         [jx,jy] = find(IX);
 206
 207         IX = ones(c1,c2);
 208         [jxo,jyo] = find(IX);
 209         R = zeros(c1,c2);
 210 end;
 211
 212 if strcmp(lower(Mode(1:7)),'pearson');
 213         % see http://mathworld.wolfram.com/CorrelationCoefficient.html
 214         if ~YESNAN,
 215                 [S,N,SSQ] = sumskipnan(X,1);
 216                 if ~isempty(Y),
 217                         [S2,N2,SSQ2] = sumskipnan(Y,1);
 218                         CC = X'*Y;
 219                         M1 = S./N;
 220                         M2 = S2./N2;
 221                         cc = CC./NN - M1'*M2;
 222                         R  = cc./sqrt((SSQ./N-M1.*M1)'*(SSQ2./N2-M2.*M2));
 223                 else
 224                         CC = X'*X;
 225                         M  = S./N;
 226                         cc = CC./NN - M'*M;
 227                         v  = SSQ./N - M.*M; %max(N-1,0);
 228                         R  = cc./sqrt(v'*v);
 229                 end;
 230         else
 231                 if ~isempty(Y),
 232                         X  = [X,Y];
 233                 end;
 234                 for k = 1:length(jx),
 235                         %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
 236                         ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k)));
 237                         [s,n,s2] = sumskipnan(X(ik,[jx(k),jy(k)]),1);
 238                         v  = (s2-s.*s./n)./n;
 239                         cc = X(ik,jx(k))'*X(ik,jy(k));
 240                         cc = cc/n(1) - prod(s./n);
 241                         %r(k) = cc./sqrt(prod(v));
 242                         R(jxo(k),jyo(k)) = cc./sqrt(prod(v));
 243                 end;
 244         end
 245
 246 elseif strcmp(lower(Mode(1:4)),'rank');
 247         % see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
 248         if ~YESNAN,
 249                 if isempty(Y)
 250                         R = corrcoef(ranks(X));
 251                 else
 252                         R = corrcoef(ranks(X),ranks(Y));
 253                 end;
 254         else
 255                 if ~isempty(Y),
 256                         X = [X,Y];
 257                 end;
 258                 for k = 1:length(jx),
 259                         %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
 260                         ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k)));
 261                         il = ranks(X(ik,[jx(k),jy(k)]));
 262                         R(jxo(k),jyo(k)) = corrcoef(il(:,1),il(:,2));
 263                 end;
 264                 X = ranks(X);
 265         end;
 266
 267 elseif strcmp(lower(Mode(1:8)),'spearman');
 268         % see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
 269         if ~isempty(Y),
 270                 X = [X,Y];
 271         end;
 272
 273         n = repmat(nan,c1,c2);
 274
 275         if ~YESNAN,
 276                 iy = ranks(X);  %  calculates ranks;
 277
 278                 for k = 1:length(jx),
 279                         [R(jxo(k),jyo(k)),n(jxo(k),jyo(k))] = sumskipnan((iy(:,jx(k)) - iy(:,jy(k))).^2);       % NN is the number of non-missing values
 280                 end;
 281         else
 282                 for k = 1:length(jx),
 283                         %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
 284                         ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k)));
 285                         il = ranks(X(ik,[jx(k),jy(k)]));
 286                         % NN is the number of non-missing values
 287                         [R(jxo(k),jyo(k)),n(jxo(k),jyo(k))] = sumskipnan((il(:,1) - il(:,2)).^2);
 288                 end;
 289                 X = ranks(X);
 290         end;
 291         R = 1 - 6 * R ./ (n.*(n.*n-1));
 292
 293 elseif strcmp(lower(Mode(1:7)),'partial');
 294         fprintf(2,'Error CORRCOEF: use PARTCORRCOEF \n',Mode);
 295
 296         return;
 297
 298 elseif strcmp(lower(Mode(1:7)),'kendall');
 299         fprintf(2,'Error CORRCOEF: mode ''%s'' not implemented yet.\n',Mode);
 300
 301         return;
 302 else
 303         fprintf(2,'Error CORRCOEF: unknown mode ''%s''\n',Mode);
 304 end;
 305
 306 if (NARG<2),
 307         warning(FLAG_WARNING);  % restore warning status
 308         return;
 309 end;
 310
 311
 312 % CONFIDENCE INTERVAL
 313 if isfield(mode,'alpha')
 314         alpha = mode.alpha;
 315 elseif exist('flag_implicit_significance','file'),
 316         alpha = flag_implicit_significance;
 317 else
 318         alpha = 0.01;
 319 end;
 320 % fprintf(1,'CORRCOEF: confidence interval is based on alpha=%f\n',alpha);
 321
 322
 323 % SIGNIFICANCE TEST
 324 R(isnan(R))=0;
 325 tmp = 1 - R.*R;
 326 tmp(tmp<0) = 0;         % prevent tmp<0 i.e. imag(t)~=0
 327 t   = R.*sqrt(max(NN-2,0)./tmp);
 328
 329 if exist('t_cdf','file');
 330         sig = t_cdf(t,NN-2);
 331 elseif exist('tcdf','file')>1;
 332         sig = tcdf(t,NN-2);
 333 else
 334         fprintf('CORRCOEF: significance test not completed because of missing TCDF-function\n')
 335         sig = repmat(nan,size(R));
 336 end;
 337 sig  = 2 * min(sig,1 - sig);
 338
 339
 340 if NARG<3,
 341         warning(FLAG_WARNING);  % restore warning status
 342         return;
 343 end;
 344
 345
 346 tmp = R;
 347 %tmp(ix1 | ix2) = nan;          % avoid division-by-zero warning
 348 z   = log((1+tmp)./(1-tmp))/2;  % Fisher transformation [21]
 349 %sz = 1./sqrt(NN-3);            % standard error of z
 350 sz  = sqrt(2)*erfinv(1-alpha)./sqrt(NN-3);      % confidence interval for alpha of z
 351
 352 ci1 = tanh(z-sz);
 353 ci2 = tanh(z+sz);
 354
 355 %ci1(isnan(ci1))=R(isnan(ci1)); % in case of isnan(ci), the interval limits are exactly the R value
 356 %ci2(isnan(ci2))=R(isnan(ci2));
 357
 358 if (NARG<5) || ~YESNAN,
 359         nan_sig = repmat(NaN,size(R));
 360         warning(FLAG_WARNING);  % restore warning status
 361         return;
 362 end;
 363
 364 %%%%% ----- check independence of NaNs (missing values) -----
 365 [nan_R, nan_sig] = corrcoef(X,double(isnan(X)));
 366
 367 % remove diagonal elements, because these have not any meaning %
 368 nan_sig(isnan(nan_R)) = nan;
 369 % remove diagonal elements, because these have not any meaning %
 370 nan_R(isnan(nan_R)) = 0;
 371
 372 if 0, any(nan_sig(:) < alpha),
 373         tmp = nan_sig(:);                       % Hack to skip NaN's in MIN(X)
 374         min_sig = min(tmp(~isnan(tmp)));        % Necessary, because Octave returns NaN rather than min(X) for min(NaN,X)
 375         fprintf(1,'CORRCOFF Warning: Missing Values (i.e. NaNs) are not independent of data (p-value=%f)\n', min_sig);
 376         fprintf(1,'   Its recommended to remove all samples (i.e. rows) with any missing value (NaN).\n');
 377         fprintf(1,'   The null-hypotheses (NaNs are uncorrelated) is rejected for the following parameter pair(s).\n');
 378         [ix,iy] = find(nan_sig < alpha);
 379         disp([ix,iy])
 380 end;
 381
 382 %%%%% ----- end of independence check ------
 383
 384 warning(FLAG_WARNING);  % restore warning status
 385 return;
 386