1 function [R,sig,ci1,ci2,nan_sig] = corrcoef(X,Y,varargin)
2 % CORRCOEF calculates the correlation matrix from pairwise correlations.
3 % The input data can contain missing values encoded with NaN.
4 % Missing data (NaN's) are handled by pairwise deletion [15].
5 % In order to avoid possible pitfalls, use case-wise deletion or
6 % check the correlation of NaN's with your data (see below).
7 % A significance test for testing the Hypothesis
8 % 'correlation coefficient R is significantly different to zero'
11 % [...] = CORRCOEF(X);
12 % calculates the (auto-)correlation matrix of X
13 % [...] = CORRCOEF(X,Y);
14 % calculates the crosscorrelation between X and Y
16 % [...] = CORRCOEF(..., Mode);
17 % Mode='Pearson' or 'parametric' [default]
18 % gives the correlation coefficient
19 % also known as the 'product-moment coefficient of correlation'
20 % or 'Pearson''s correlation' [1]
21 % Mode='Spearman' gives 'Spearman''s Rank Correlation Coefficient'
22 % This replaces SPEARMAN.M
23 % Mode='Rank' gives a nonparametric Rank Correlation Coefficient
24 % This is the "Spearman rank correlation with proper handling of ties"
25 % This replaces RANKCORR.M
27 % [...] = CORRCOEF(..., param1, value1, param2, value2, ... );
29 % 'Mode' type of correlation
30 % 'Pearson','parametric'
33 % 'rows' how to deal with missing values encoded as NaN's.
34 % 'complete': remove all rows with at least one NaN
35 % 'pairwise': [default] each R(i,j) uses all rows where both columns are non-NaN
36 % 'alpha' 0.01 : significance level to compute confidence interval
38 % [R,p,ci1,ci2,nansig] = CORRCOEF(...);
39 % R is the correlation matrix
40 % R(i,j) is the correlation coefficient r between X(:,i) and Y(:,j)
41 % p gives the significance of R
42 % It tests the null hypothesis that the product moment correlation coefficient is zero
43 % using Student's t-test on the statistic t = r*sqrt(N-2)/sqrt(1-r^2)
44 % where N is the number of samples (Statistics, M. Spiegel, Schaum series).
45 % p > alpha: do not reject the Null hypothesis: 'R is zero'.
46 % p < alpha: reject the Null hypothesis; the two-sided alternative is 'R is different from zero'.
47 % ci1 lower limit of the (1-alpha) confidence interval
48 % ci2 upper limit of the (1-alpha) confidence interval
49 % If no alpha is provided, the default alpha is 0.01. This can be changed with function flag_implicit_significance.
50 % nan_sig p-value whether H0: 'NaN''s are not correlated' could be correct
51 % if nan_sig < alpha, H1 ('NaNs are correlated') is very likely.
53 % The result is only valid if the occurrence of NaN's is uncorrelated. In
54 % order to avoid this pitfall, the correlation of NaN's should be checked
55 % or case-wise deletion should be applied.
56 % Case-Wise deletion can be implemented
57 % ix = ~any(isnan([X,Y]),2);
58 % [...] = CORRCOEF(X(ix,:),Y(ix,:),...);
60 % Correlation (non-random distribution) of NaN's can be checked with
61 % [nan_R,nan_sig]=corrcoef(X,isnan(X))
62 % or [nan_R,nan_sig]=corrcoef([X,Y],isnan([X,Y]))
63 % or [R,p,ci1,ci2,nan_sig] = CORRCOEF(...);
65 % Further recommendations related to the correlation coefficient:
66 % + LOOK AT THE SCATTERPLOTS to make sure that the relationship is linear
67 % + Correlation is not causation because
68 % it is not clear which parameter is 'cause' and which is 'effect' and
69 % the observed correlation between two variables might be due to the action of other, unobserved variables.
71 % see also: SUMSKIPNAN, COVM, COV, COR, SPEARMAN, RANKCORR, RANKS,
72 % PARTCORRCOEF, flag_implicit_significance
75 % on the correlation coefficient
76 % [ 1] http://mathworld.wolfram.com/CorrelationCoefficient.html
77 % [ 2] http://www.geography.btinternet.co.uk/spearman.htm
78 % [ 3] Hogg, R. V. and Craig, A. T. Introduction to Mathematical Statistics, 5th ed. New York: Macmillan, pp. 338 and 400, 1995.
79 % [ 4] Lehmann, E. L. and D'Abrera, H. J. M. Nonparametrics: Statistical Methods Based on Ranks, rev. ed. Englewood Cliffs, NJ: Prentice-Hall, pp. 292, 300, and 323, 1998.
80 % [ 5] Press, W. H.; Flannery, B. P.; Teukolsky, S. A.; and Vetterling, W. T. Numerical Recipes in FORTRAN: The Art of Scientific Computing, 2nd ed. Cambridge, England: Cambridge University Press, pp. 634-637, 1992
81 % [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
82 % on the significance test of the correlation coefficient
83 % [11] http://www.met.rdg.ac.uk/cag/STATS/corr.html
84 % [12] http://www.janda.org/c10/Lectures/topic06/L24-significanceR.htm
85 % [13] http://faculty.vassar.edu/lowry/ch4apx.html
86 % [14] http://davidmlane.com/hyperstat/B134689.html
87 % [15] http://www.statsoft.com/textbook/stbasic.html%Correlations
89 % [20] http://www.tufts.edu/~gdallal/corr.htm
90 % [21] Fisher transformation http://en.wikipedia.org/wiki/Fisher_transformation
92 % $Id: corrcoef.m 9387 2011-12-15 10:42:14Z schloegl $
93 % Copyright (C) 2000-2004,2008,2009,2011 by Alois Schloegl <alois.schloegl@gmail.com>
94 % This function is part of the NaN-toolbox
95 % http://pub.ist.ac.at/~schloegl/matlab/NaN/
97 % This program is free software: you can redistribute it and/or modify
98 % it under the terms of the GNU General Public License as published by
99 % the Free Software Foundation, either version 3 of the License, or
100 % (at your option) any later version.
102 % This program is distributed in the hope that it will be useful,
103 % but WITHOUT ANY WARRANTY; without even the implied warranty of
104 % MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
105 % GNU General Public License for more details.
107 % You should have received a copy of the GNU General Public License
108 % along with this program. If not, see <http://www.gnu.org/licenses/>.
111 % + handles missing values (encoded as NaN's)
112 % + pairwise deletion of missing data
113 % + checks independence of missing values (NaNs)
114 % + parametric and non-parametric (rank) correlation
115 % + Pearson's correlation
116 % + Spearman's rank correlation
117 % + Rank correlation (non-parametric, Spearman rank correlation with proper handling of ties)
118 % + is fast, using an efficient algorithm O(n.log(n)) for calculating the ranks
119 % + significance test for null hypothesis: r=0
120 % + confidence interval included
121 % - rank correlation works for cell arrays, too (no check for missing values).
122 % + compatible with Octave and Matlab
124 global FLAG_NANS_OCCURED; % global flag; assigned logical true further down in this function
126 NARG = nargout; % needed because nargout is not reentrant in Octave, and corrcoef is recursive
133 fprintf(2,'Error CORRCOEF: Missing argument(s)\n'); % file id 2: the message goes to stderr
144 elseif length(varg)==1, % exactly one extra argument (presumably the Mode string - TODO confirm)
147 for k = 2:2:length(varg), % walk the param/value pairs: name at k-1, value at k
148 mode = setfield(mode,lower(varg{k-1}),varg{k}); % option names are lower-cased before being stored as fields of mode
150 if isfield(mode,'mode')
155 if isempty(Mode) Mode='pearson'; end; % default correlation type
160 FLAG_WARNING = warning; % save warning status
167 fprintf(2,'Error CORRCOEF: X and Y must have the same number of observations (rows).\n');
170 NN = real(~isnan(X)')*real(~isnan(Y)); % NN(i,j): number of rows where X(:,i) and Y(:,j) are both non-NaN
173 NN = real(~isnan(X)')*real(~isnan(X)); % same count for pairs of columns within X
176 %%%%% generate combinations using indices for pairwise calculation of the correlation
177 YESNAN = any(isnan(X(:))) | any(isnan(Y(:))); % true if X or Y contains at least one NaN
179 FLAG_NANS_OCCURED=(1==1); % (1==1) evaluates to logical true
180 if isfield(mode,'rows')
181 if strcmp(mode.rows,'complete')
189 elseif strcmp(mode.rows,'all')
190 fprintf(1,'Warning: data contains NaNs, rows=pairwise is used.'); % NOTE(review): this message has no trailing newline
191 %%NN(NN < size(X,1)) = NaN;
192 elseif strcmp(mode.rows,'pairwise')
198 IX = ones(c1)-diag(ones(c1,1)); % c1-by-c1 mask: 1 off the diagonal, 0 on it
199 [jx, jy ] = find(IX); % row/column subscripts of all off-diagonal pairs
200 [jxo,jyo] = find(IX); % same subscripts; used below to index into R
203 IX = sparse([],[],[],c1+c2,c1+c2,c1*c2); % empty square sparse matrix of size c1+c2 with room for c1*c2 nonzeros
204 IX(1:c1,c1+(1:c2)) = 1; % mark the upper-right c1-by-c2 block
208 [jxo,jyo] = find(IX); % subscripts of the marked entries
212 if strcmp(lower(Mode(1:7)),'pearson');
213 % see http://mathworld.wolfram.com/CorrelationCoefficient.html
215 [S,N,SSQ] = sumskipnan(X,1); % presumably per-column sum, non-NaN count and sum of squares - see SUMSKIPNAN
217 [S2,N2,SSQ2] = sumskipnan(Y,1); % same quantities for Y
221 cc = CC./NN - M1'*M2; % covariance form E[xy]-E[x]E[y]; assumes M1,M2 are column means and CC the cross products - TODO confirm
222 R = cc./sqrt((SSQ./N-M1.*M1)'*(SSQ2./N2-M2.*M2)); % divide by the square root of the product of the two variance terms
227 v = SSQ./N - M.*M; %max(N-1,0);
234 for k = 1:length(jx), % one iteration per column pair
235 %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
236 ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k))); % rows where both columns of the pair are non-NaN
237 [s,n,s2] = sumskipnan(X(ik,[jx(k),jy(k)]),1);
239 cc = X(ik,jx(k))'*X(ik,jy(k)); % sum of products over the selected rows
240 cc = cc/n(1) - prod(s./n); % mean of products minus product of means
241 %r(k) = cc./sqrt(prod(v));
242 R(jxo(k),jyo(k)) = cc./sqrt(prod(v));
246 elseif strcmp(lower(Mode(1:4)),'rank');
247 % see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
250 R = corrcoef(ranks(X)); % recursive call without a Mode argument, applied to the ranks of X
252 R = corrcoef(ranks(X),ranks(Y)); % recursive call on the ranks of X and Y
258 for k = 1:length(jx), % one iteration per column pair
259 %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
260 ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k))); % rows where both columns of the pair are non-NaN
261 il = ranks(X(ik,[jx(k),jy(k)])); % rank only the rows selected by ik
262 R(jxo(k),jyo(k)) = corrcoef(il(:,1),il(:,2));
267 elseif strcmp(lower(Mode(1:8)),'spearman');
268 % see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html
273 n = repmat(nan,c1,c2); % per-pair sample counts, NaN until filled in
276 iy = ranks(X); % calculates ranks;
278 for k = 1:length(jx),
279 [R(jxo(k),jyo(k)),n(jxo(k),jyo(k))] = sumskipnan((iy(:,jx(k)) - iy(:,jy(k))).^2); % R receives the sum of squared rank differences, n the number of non-missing values
282 for k = 1:length(jx),
283 %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2);
284 ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k)));
285 il = ranks(X(ik,[jx(k),jy(k)]));
286 % n(...) receives the number of non-missing values
287 [R(jxo(k),jyo(k)),n(jxo(k),jyo(k))] = sumskipnan((il(:,1) - il(:,2)).^2);
291 R = 1 - 6 * R ./ (n.*(n.*n-1)); % Spearman formula 1 - 6*sum(d^2)/(n*(n^2-1))
293 elseif strcmp(lower(Mode(1:7)),'partial');
294 fprintf(2,'Error CORRCOEF: use PARTCORRCOEF \n',Mode); % NOTE(review): Mode is passed but the format has no conversion for it
298 elseif strcmp(lower(Mode(1:7)),'kendall');
299 fprintf(2,'Error CORRCOEF: mode ''%s'' not implemented yet.\n',Mode);
303 fprintf(2,'Error CORRCOEF: unknown mode ''%s''\n',Mode);
307 warning(FLAG_WARNING); % restore warning status
312 % CONFIDENCE INTERVAL
313 if isfield(mode,'alpha')
315 elseif exist('flag_implicit_significance','file'),
316 alpha = flag_implicit_significance; % no alpha option given: use the value returned by flag_implicit_significance
320 % fprintf(1,'CORRCOEF: confidence interval is based on alpha=%f\n',alpha);
326 tmp(tmp<0) = 0; % prevent tmp<0 i.e. imag(t)~=0
327 t = R.*sqrt(max(NN-2,0)./tmp); % t statistic (see help text); tmp is presumably 1-R.^2 - TODO confirm
329 if exist('t_cdf','file'); % t_cdf is tried first
331 elseif exist('tcdf','file')>1; % otherwise tcdf
334 fprintf('CORRCOEF: significance test not completed because of missing TCDF-function\n')
335 sig = repmat(nan,size(R)); % no p-values available
337 sig = 2 * min(sig,1 - sig); % two-sided p-value; assumes sig holds the t CDF value at this point - TODO confirm
341 warning(FLAG_WARNING); % restore warning status
347 %tmp(ix1 | ix2) = nan; % avoid division-by-zero warning
348 z = log((1+tmp)./(1-tmp))/2; % Fisher transformation [21]
349 %sz = 1./sqrt(NN-3); % standard error of z
350 sz = sqrt(2)*erfinv(1-alpha)./sqrt(NN-3); % half-width of the (1-alpha) interval for z: sqrt(2)*erfinv(1-alpha) is the two-sided normal quantile
355 %ci1(isnan(ci1))=R(isnan(ci1)); % in case of isnan(ci), the interval limits are exactly the R value
356 %ci2(isnan(ci2))=R(isnan(ci2));
358 if (NARG<5) || ~YESNAN, % nan_sig was not requested, or the data contain no NaNs
359 nan_sig = repmat(NaN,size(R));
360 warning(FLAG_WARNING); % restore warning status
364 %%%%% ----- check independence of NaNs (missing values) -----
365 [nan_R, nan_sig] = corrcoef(X,double(isnan(X))); % recursive call: correlate X with the NaN indicator of X
367 % where nan_R is NaN the p-value is set to NaN (original note: diagonal elements have no meaning)
368 nan_sig(isnan(nan_R)) = nan;
369 % then the NaN entries of nan_R themselves are replaced by 0
370 nan_R(isnan(nan_R)) = 0;
372 if 0, any(nan_sig(:) < alpha), % disabled: the condition is the constant 0; any(...) is only the first statement of the dead body
373 tmp = nan_sig(:); % Hack to skip NaN's in MIN(X)
374 min_sig = min(tmp(~isnan(tmp))); % Necessary, because Octave returns NaN rather than min(X) for min(NaN,X)
375 fprintf(1,'CORRCOFF Warning: Missing Values (i.e. NaNs) are not independent of data (p-value=%f)\n', min_sig);
376 fprintf(1,' Its recommended to remove all samples (i.e. rows) with any missing value (NaN).\n');
377 fprintf(1,' The null-hypotheses (NaNs are uncorrelated) is rejected for the following parameter pair(s).\n');
378 [ix,iy] = find(nan_sig < alpha);
382 %%%%% ----- end of independence check ------
384 warning(FLAG_WARNING); % restore warning status