X-Git-Url: https://git.creatis.insa-lyon.fr/pubgit/?p=CreaPhase.git;a=blobdiff_plain;f=octave_packages%2Fnan-2.5.5%2Fcorrcoef.m;fp=octave_packages%2Fnan-2.5.5%2Fcorrcoef.m;h=f14b7548555fc216b22db6b5d27c9fed103bd2c8;hp=0000000000000000000000000000000000000000;hb=c880e8788dfc484bf23ce13fa2787f2c6bca4863;hpb=1705066eceaaea976f010f669ce8e972f3734b05 diff --git a/octave_packages/nan-2.5.5/corrcoef.m b/octave_packages/nan-2.5.5/corrcoef.m new file mode 100644 index 0000000..f14b754 --- /dev/null +++ b/octave_packages/nan-2.5.5/corrcoef.m @@ -0,0 +1,386 @@ +function [R,sig,ci1,ci2,nan_sig] = corrcoef(X,Y,varargin) +% CORRCOEF calculates the correlation matrix from pairwise correlations. +% The input data can contain missing values encoded with NaN. +% Missing data (NaN's) are handled by pairwise deletion [15]. +% In order to avoid possible pitfalls, use case-wise deletion or +% or check the correlation of NaN's with your data (see below). +% A significance test for testing the Hypothesis +% 'correlation coefficient R is significantly different to zero' +% is included. +% +% [...] = CORRCOEF(X); +% calculates the (auto-)correlation matrix of X +% [...] = CORRCOEF(X,Y); +% calculates the crosscorrelation between X and Y +% +% [...] = CORRCOEF(..., Mode); +% Mode='Pearson' or 'parametric' [default] +% gives the correlation coefficient +% also known as the 'product-moment coefficient of correlation' +% or 'Pearson''s correlation' [1] +% Mode='Spearman' gives 'Spearman''s Rank Correlation Coefficient' +% This replaces SPEARMAN.M +% Mode='Rank' gives a nonparametric Rank Correlation Coefficient +% This is the "Spearman rank correlation with proper handling of ties" +% This replaces RANKCORR.M +% +% [...] = CORRCOEF(..., param1, value1, param2, value2, ... ); +% param value +% 'Mode' type of correlation +% 'Pearson','parametric' +% 'Spearman' +% 'rank' +% 'rows' how do deal with missing values encoded as NaN's. +% 'complete': remove all rows with at least one NaN +% 'pairwise': [default] +% 'alpha' 0.01 : significance level to compute confidence interval +% +% [R,p,ci1,ci2,nansig] = CORRCOEF(...); +% R is the correlation matrix +% R(i,j) is the correlation coefficient r between X(:,i) and Y(:,j) +% p gives the significance of R +% It tests the null hypothesis that the product moment correlation coefficient is zero +% using Student's t-test on the statistic t = r*sqrt(N-2)/sqrt(1-r^2) +% where N is the number of samples (Statistics, M. Spiegel, Schaum series). +% p > alpha: do not reject the Null hypothesis: 'R is zero'. +% p < alpha: The alternative hypothesis 'R is larger than zero' is true with probability (1-alpha). +% ci1 lower (1-alpha) confidence interval +% ci2 upper (1-alpha) confidence interval +% If no alpha is provided, the default alpha is 0.01. This can be changed with function flag_implicit_significance. +% nan_sig p-value whether H0: 'NaN''s are not correlated' could be correct +% if nan_sig < alpha, H1 ('NaNs are correlated') is very likely. +% +% The result is only valid if the occurence of NaN's is uncorrelated. In +% order to avoid this pitfall, the correlation of NaN's should be checked +% or case-wise deletion should be applied. +% Case-Wise deletion can be implemented +% ix = ~any(isnan([X,Y]),2); +% [...] = CORRCOEF(X(ix,:),Y(ix,:),...); +% +% Correlation (non-random distribution) of NaN's can be checked with +% [nan_R,nan_sig]=corrcoef(X,isnan(X)) +% or [nan_R,nan_sig]=corrcoef([X,Y],isnan([X,Y])) +% or [R,p,ci1,ci2] = CORRCOEF(...); +% +% Further recommandation related to the correlation coefficient: +% + LOOK AT THE SCATTERPLOTS to make sure that the relationship is linear +% + Correlation is not causation because +% it is not clear which parameter is 'cause' and which is 'effect' and +% the observed correlation between two variables might be due to the action of other, unobserved variables. +% +% see also: SUMSKIPNAN, COVM, COV, COR, SPEARMAN, RANKCORR, RANKS, +% PARTCORRCOEF, flag_implicit_significance +% +% REFERENCES: +% on the correlation coefficient +% [ 1] http://mathworld.wolfram.com/CorrelationCoefficient.html +% [ 2] http://www.geography.btinternet.co.uk/spearman.htm +% [ 3] Hogg, R. V. and Craig, A. T. Introduction to Mathematical Statistics, 5th ed. New York: Macmillan, pp. 338 and 400, 1995. +% [ 4] Lehmann, E. L. and D'Abrera, H. J. M. Nonparametrics: Statistical Methods Based on Ranks, rev. ed. Englewood Cliffs, NJ: Prentice-Hall, pp. 292, 300, and 323, 1998. +% [ 5] Press, W. H.; Flannery, B. P.; Teukolsky, S. A.; and Vetterling, W. T. Numerical Recipes in FORTRAN: The Art of Scientific Computing, 2nd ed. Cambridge, England: Cambridge University Press, pp. 634-637, 1992 +% [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html +% on the significance test of the correlation coefficient +% [11] http://www.met.rdg.ac.uk/cag/STATS/corr.html +% [12] http://www.janda.org/c10/Lectures/topic06/L24-significanceR.htm +% [13] http://faculty.vassar.edu/lowry/ch4apx.html +% [14] http://davidmlane.com/hyperstat/B134689.html +% [15] http://www.statsoft.com/textbook/stbasic.html%Correlations +% others +% [20] http://www.tufts.edu/~gdallal/corr.htm +% [21] Fisher transformation http://en.wikipedia.org/wiki/Fisher_transformation + +% $Id: corrcoef.m 9387 2011-12-15 10:42:14Z schloegl $ +% Copyright (C) 2000-2004,2008,2009,2011 by Alois Schloegl +% This function is part of the NaN-toolbox +% http://pub.ist.ac.at/~schloegl/matlab/NaN/ + +% This program is free software: you can redistribute it and/or modify +% it under the terms of the GNU General Public License as published by +% the Free Software Foundation, either version 3 of the License, or +% (at your option) any later version. +% +% This program is distributed in the hope that it will be useful, +% but WITHOUT ANY WARRANTY; without even the implied warranty of +% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +% GNU General Public License for more details. +% +% You should have received a copy of the GNU General Public License +% along with this program. If not, see . + +% Features: +% + handles missing values (encoded as NaN's) +% + pairwise deletion of missing data +% + checks independence of missing values (NaNs) +% + parametric and non-parametric (rank) correlation +% + Pearson's correlation +% + Spearman's rank correlation +% + Rank correlation (non-parametric, Spearman rank correlation with proper handling of ties) +% + is fast, using an efficient algorithm O(n.log(n)) for calculating the ranks +% + significance test for null-hypthesis: r=0 +% + confidence interval included +% - rank correlation works for cell arrays, too (no check for missing values). +% + compatible with Octave and Matlab + +global FLAG_NANS_OCCURED; + +NARG = nargout; % needed because nargout is not reentrant in Octave, and corrcoef is recursive +mode = []; + +if nargin==1 + Y = []; + Mode='Pearson'; +elseif nargin==0 + fprintf(2,'Error CORRCOEF: Missing argument(s)\n'); +elseif nargin>1 + if ischar(Y) + varg = [Y,varargin]; + Y=[]; + else + varg = varargin; + end; + + if length(varg)<1, + Mode = 'Pearson'; + elseif length(varg)==1, + Mode = varg{1}; + else + for k = 2:2:length(varg), + mode = setfield(mode,lower(varg{k-1}),varg{k}); + end; + if isfield(mode,'mode') + Mode = mode.mode; + end; + end; +end; +if isempty(Mode) Mode='pearson'; end; +Mode=[Mode,' ']; + + + +FLAG_WARNING = warning; % save warning status +warning('off'); + +[r1,c1]=size(X); +if ~isempty(Y) + [r2,c2]=size(Y); + if r1~=r2, + fprintf(2,'Error CORRCOEF: X and Y must have the same number of observations (rows).\n'); + return; + end; + NN = real(~isnan(X)')*real(~isnan(Y)); +else + [r2,c2]=size(X); + NN = real(~isnan(X)')*real(~isnan(X)); +end; + +%%%%% generate combinations using indices for pairwise calculation of the correlation +YESNAN = any(isnan(X(:))) | any(isnan(Y(:))); +if YESNAN, + FLAG_NANS_OCCURED=(1==1); + if isfield(mode,'rows') + if strcmp(mode.rows,'complete') + ix = ~any([X,Y],2); + X = X(ix,:); + if ~isempty(Y) + Y = Y(ix,:); + end; + YESNAN = 0; + NN = size(X,1); + elseif strcmp(mode.rows,'all') + fprintf(1,'Warning: data contains NaNs, rows=pairwise is used.'); + %%NN(NN < size(X,1)) = NaN; + elseif strcmp(mode.rows,'pairwise') + %%% default + end; + end; +end; +if isempty(Y), + IX = ones(c1)-diag(ones(c1,1)); + [jx, jy ] = find(IX); + [jxo,jyo] = find(IX); + R = eye(c1); +else + IX = sparse([],[],[],c1+c2,c1+c2,c1*c2); + IX(1:c1,c1+(1:c2)) = 1; + [jx,jy] = find(IX); + + IX = ones(c1,c2); + [jxo,jyo] = find(IX); + R = zeros(c1,c2); +end; + +if strcmp(lower(Mode(1:7)),'pearson'); + % see http://mathworld.wolfram.com/CorrelationCoefficient.html + if ~YESNAN, + [S,N,SSQ] = sumskipnan(X,1); + if ~isempty(Y), + [S2,N2,SSQ2] = sumskipnan(Y,1); + CC = X'*Y; + M1 = S./N; + M2 = S2./N2; + cc = CC./NN - M1'*M2; + R = cc./sqrt((SSQ./N-M1.*M1)'*(SSQ2./N2-M2.*M2)); + else + CC = X'*X; + M = S./N; + cc = CC./NN - M'*M; + v = SSQ./N - M.*M; %max(N-1,0); + R = cc./sqrt(v'*v); + end; + else + if ~isempty(Y), + X = [X,Y]; + end; + for k = 1:length(jx), + %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2); + ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k))); + [s,n,s2] = sumskipnan(X(ik,[jx(k),jy(k)]),1); + v = (s2-s.*s./n)./n; + cc = X(ik,jx(k))'*X(ik,jy(k)); + cc = cc/n(1) - prod(s./n); + %r(k) = cc./sqrt(prod(v)); + R(jxo(k),jyo(k)) = cc./sqrt(prod(v)); + end; + end + +elseif strcmp(lower(Mode(1:4)),'rank'); + % see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html + if ~YESNAN, + if isempty(Y) + R = corrcoef(ranks(X)); + else + R = corrcoef(ranks(X),ranks(Y)); + end; + else + if ~isempty(Y), + X = [X,Y]; + end; + for k = 1:length(jx), + %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2); + ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k))); + il = ranks(X(ik,[jx(k),jy(k)])); + R(jxo(k),jyo(k)) = corrcoef(il(:,1),il(:,2)); + end; + X = ranks(X); + end; + +elseif strcmp(lower(Mode(1:8)),'spearman'); + % see [ 6] http://mathworld.wolfram.com/SpearmanRankCorrelationCoefficient.html + if ~isempty(Y), + X = [X,Y]; + end; + + n = repmat(nan,c1,c2); + + if ~YESNAN, + iy = ranks(X); % calculates ranks; + + for k = 1:length(jx), + [R(jxo(k),jyo(k)),n(jxo(k),jyo(k))] = sumskipnan((iy(:,jx(k)) - iy(:,jy(k))).^2); % NN is the number of non-missing values + end; + else + for k = 1:length(jx), + %ik = ~any(isnan(X(:,[jx(k),jy(k)])),2); + ik = ~isnan(X(:,jx(k))) & ~isnan(X(:,jy(k))); + il = ranks(X(ik,[jx(k),jy(k)])); + % NN is the number of non-missing values + [R(jxo(k),jyo(k)),n(jxo(k),jyo(k))] = sumskipnan((il(:,1) - il(:,2)).^2); + end; + X = ranks(X); + end; + R = 1 - 6 * R ./ (n.*(n.*n-1)); + +elseif strcmp(lower(Mode(1:7)),'partial'); + fprintf(2,'Error CORRCOEF: use PARTCORRCOEF \n',Mode); + + return; + +elseif strcmp(lower(Mode(1:7)),'kendall'); + fprintf(2,'Error CORRCOEF: mode ''%s'' not implemented yet.\n',Mode); + + return; +else + fprintf(2,'Error CORRCOEF: unknown mode ''%s''\n',Mode); +end; + +if (NARG<2), + warning(FLAG_WARNING); % restore warning status + return; +end; + + +% CONFIDENCE INTERVAL +if isfield(mode,'alpha') + alpha = mode.alpha; +elseif exist('flag_implicit_significance','file'), + alpha = flag_implicit_significance; +else + alpha = 0.01; +end; +% fprintf(1,'CORRCOEF: confidence interval is based on alpha=%f\n',alpha); + + +% SIGNIFICANCE TEST +R(isnan(R))=0; +tmp = 1 - R.*R; +tmp(tmp<0) = 0; % prevent tmp<0 i.e. imag(t)~=0 +t = R.*sqrt(max(NN-2,0)./tmp); + +if exist('t_cdf','file'); + sig = t_cdf(t,NN-2); +elseif exist('tcdf','file')>1; + sig = tcdf(t,NN-2); +else + fprintf('CORRCOEF: significance test not completed because of missing TCDF-function\n') + sig = repmat(nan,size(R)); +end; +sig = 2 * min(sig,1 - sig); + + +if NARG<3, + warning(FLAG_WARNING); % restore warning status + return; +end; + + +tmp = R; +%tmp(ix1 | ix2) = nan; % avoid division-by-zero warning +z = log((1+tmp)./(1-tmp))/2; % Fisher transformation [21] +%sz = 1./sqrt(NN-3); % standard error of z +sz = sqrt(2)*erfinv(1-alpha)./sqrt(NN-3); % confidence interval for alpha of z + +ci1 = tanh(z-sz); +ci2 = tanh(z+sz); + +%ci1(isnan(ci1))=R(isnan(ci1)); % in case of isnan(ci), the interval limits are exactly the R value +%ci2(isnan(ci2))=R(isnan(ci2)); + +if (NARG<5) || ~YESNAN, + nan_sig = repmat(NaN,size(R)); + warning(FLAG_WARNING); % restore warning status + return; +end; + +%%%%% ----- check independence of NaNs (missing values) ----- +[nan_R, nan_sig] = corrcoef(X,double(isnan(X))); + +% remove diagonal elements, because these have not any meaning % +nan_sig(isnan(nan_R)) = nan; +% remove diagonal elements, because these have not any meaning % +nan_R(isnan(nan_R)) = 0; + +if 0, any(nan_sig(:) < alpha), + tmp = nan_sig(:); % Hack to skip NaN's in MIN(X) + min_sig = min(tmp(~isnan(tmp))); % Necessary, because Octave returns NaN rather than min(X) for min(NaN,X) + fprintf(1,'CORRCOFF Warning: Missing Values (i.e. NaNs) are not independent of data (p-value=%f)\n', min_sig); + fprintf(1,' Its recommended to remove all samples (i.e. rows) with any missing value (NaN).\n'); + fprintf(1,' The null-hypotheses (NaNs are uncorrelated) is rejected for the following parameter pair(s).\n'); + [ix,iy] = find(nan_sig < alpha); + disp([ix,iy]) +end; + +%%%%% ----- end of independence check ------ + +warning(FLAG_WARNING); % restore warning status +return; +