octave_packages/statistics-1.1.3/normalise_distribution.m

   1 ## Copyright (C) 2011 Alexander Klein <alexander.klein@math.uni-giessen.de>
   2 ##
   3 ## This program is free software; you can redistribute it and/or modify it under
   4 ## the terms of the GNU General Public License as published by the Free Software
   5 ## Foundation; either version 3 of the License, or (at your option) any later
   6 ## version.
   7 ##
   8 ## This program is distributed in the hope that it will be useful, but WITHOUT
   9 ## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 ## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  11 ## details.
  12 ##
  13 ## You should have received a copy of the GNU General Public License along with
  14 ## this program; if not, see <http://www.gnu.org/licenses/>.
  15
  16 ## -*- texinfo -*-
  17 ## @deftypefn{Function File} {@var{NORMALISED} =} normalise_distribution (@var{DATA})
  18 ## @deftypefnx{Function File} {@var{NORMALISED} =} normalise_distribution (@var{DATA}, @var{DISTRIBUTION})
  19 ## @deftypefnx{Function File} {@var{NORMALISED} =} normalise_distribution (@var{DATA}, @var{DISTRIBUTION}, @var{DIMENSION})
  20 ##
  21 ## Transform a set of data so as to be N(0,1) distributed according to an idea
  22 ## by van Albada and Robinson.
  23 ## This is achieved by first passing it through its own cumulative distribution
  24 ## function (CDF) in order to get a uniform distribution, and then mapping
  25 ## the uniform to a normal distribution.
  26 ## The data must be passed as a vector or matrix in @var{DATA}.
  27 ## If the CDF is unknown, then [] can be passed in @var{DISTRIBUTION}, and in
  28 ## this case the empirical CDF will be used.
  29 ## Otherwise, if the CDFs for all data are known, they can be passed in
  30 ## @var{DISTRIBUTION},
  31 ## either in the form of a single function name as a string,
  32 ## or a single function handle,
  33 ## or a cell array consisting of either all function names as strings,
  34 ## or all function handles.
  35 ## In the latter case, the number of CDFs passed must match the number
  36 ## of rows, or columns respectively, to normalise.
  37 ## If the data are passed as a matrix, then the transformation will
  38 ## operate either along the first non-singleton dimension,
  39 ## or along @var{DIMENSION} if present.
  40 ##
  41 ## Notes:
  42 ## The empirical CDF will map any two sets of data
  43 ## having the same size and their ties in the same places after sorting
  44 ## to some permutation of the same normalised data:
  45 ## @example
  46 ## @code{normalise_distribution([1 2 2 3 4])}
  47 ## @result{} -1.28  0.00  0.00  0.52  1.28
  48 ##
  49 ## @code{normalise_distribution([1 10 100 10 1000])}
  50 ## @result{} -1.28  0.00  0.52  0.00  1.28
  51 ## @end example
  52 ##
  53 ## Original source:
  54 ## S.J. van Albada, P.A. Robinson
  55 ## "Transformation of arbitrary distributions to the
  56 ## normal distribution with application to EEG
  57 ## test-retest reliability"
  58 ## Journal of Neuroscience Methods, Volume 161, Issue 2,
  59 ## 15 April 2007, Pages 205-211
  60 ## ISSN 0165-0270, 10.1016/j.jneumeth.2006.11.004.
  61 ## (http://www.sciencedirect.com/science/article/pii/S0165027006005668)
  62 ## @end deftypefn
  63
  64 function [ normalised ] = normalise_distribution ( data, distribution, dimension )
  65
  66   if ( nargin < 1 || nargin > 3 )
  67     print_usage;
  68   elseif ( !ismatrix ( data ) || length ( size ( data ) ) > 2 )
  69     error ( "First argument must be a vector or matrix" );
  70   end
  71
  72
  73   if ( nargin >= 2 )
  74
  75     if ( !isempty ( distribution ) )
  76
  77       #Wrap a single handle in a cell array.
  78       if ( strcmp ( typeinfo ( distribution ), typeinfo ( @(x)(x) ) ) )
  79
  80         distribution = { distribution };
  81
  82       #Do we have a string argument instead?
  83       elseif ( ischar ( distribution ) )
  84
  85         ##Is it a single string?
  86         if ( rows ( distribution ) == 1 )
  87
  88           distribution = { str2func( distribution ) };
  89         else
  90           error ( ["Second argument cannot contain more than one string" ...
  91              " unless in a cell array"] );
  92         end
  93
  94
  95       ##Do we have a cell array of distributions instead?
  96       elseif ( iscell ( distribution ) )
  97
  98         ##Does it consist of strings only?
  99         if ( all ( cellfun ( @ischar, distribution ) ) )
 100
 101           distribution = cellfun ( @str2func, distribution, "UniformOutput", false );
 102         end
 103
 104         ##Does it eventually consist of function handles only
 105         if ( !all ( cellfun ( @ ( h ) ( strcmp ( typeinfo ( h ), typeinfo ( @(x)(x) ) ) ), distribution ) ) )
 106
 107           error ( ["Second argument must contain either" ...
 108              " a single function name or handle or " ...
 109              " a cell array of either all function names or handles!"] );
 110         end
 111
 112       else
 113         error ( "Illegal second argument: ", typeinfo ( distribution ) );
 114       end
 115
 116     end
 117
 118   else
 119
 120     distribution = [];
 121   end
 122
 123
 124   if ( nargin == 3 )
 125
 126     if ( !isscalar ( dimension ) || ( dimension != 1 && dimension != 2 ) )
 127       error ( "Third argument must be either 1 or 2" );
 128     end
 129
 130   else
 131     if ( isvector ( data ) && rows ( data ) == 1 )
 132
 133       dimension = 2;
 134
 135     else
 136
 137       dimension = 1;
 138     end
 139   end
 140
 141   trp = ( dimension == 2 );
 142
 143   if ( trp )
 144     data = data';
 145   end
 146
 147   r = rows ( data );
 148   c = columns ( data );
 149   normalised = NA ( r, c );
 150
 151   ##Do we know the distribution of the sample?
 152   if ( isempty ( distribution ) )
 153
 154     precomputed_normalisation = [];
 155
 156
 157     for k = 1 : columns ( data )
 158
 159       ##Note that this line is in accordance with equation (16) in the
 160       ##original text. The author's original program, however, produces
 161       ##different values in the presence of ties, namely those you'd
 162       ##get replacing "last" by "first".
 163       [ uniq, indices ] = unique ( sort ( data ( :, k ) ), "last" );
 164
 165
 166       ##Does the sample have ties?
 167       if ( rows ( uniq ) != r )
 168
 169         ##Transform to uniform, then normal distribution.
 170         uniform = ( indices - 1/2 ) / r;
 171         normal = norminv ( uniform );
 172
 173       else
 174         ## Without ties everything is pretty much straightforward as
 175         ## stated in the text.
 176         if ( isempty ( precomputed_normalisation ) )
 177
 178           precomputed_normalisation = norminv ( 1 / (2*r) : 1/r : 1 - 1 / (2*r) );
 179         end
 180
 181         normal = precomputed_normalisation;
 182       end
 183
 184       #Find the original indices in the unsorted sample.
 185       #This somewhat quirky way of doing it is still faster than
 186       #using a for-loop.
 187       [ ignore, ignore, target_indices ] = unique ( data (:, k ) );
 188
 189       #Put normalised values in the places where they belong.
 190
 191       ## A regression in the 3.4 series made this no longer work so we behave
 192       ## differently depending on octave version. This applies the fix for all
 193       ## 3.4 releases but it may have appeared on 3.2.4 (can someone check?)
 194       ## See https://savannah.gnu.org/bugs/index.php?34765
 195       ## FIXME Once package dependency increases beyond an octave version that
 196       ## has this fixed, remove this
 197       if (compare_versions (OCTAVE_VERSION, "3.4", "<") || compare_versions (OCTAVE_VERSION, "3.6.2", ">="))
 198         ## this is how it should work
 199         f_remap = @( k ) ( normal ( k ) );
 200         normalised ( :, k ) = arrayfun ( f_remap, target_indices );
 201       else
 202         ## this is the workaround because of bug in 3.4.??
 203         for index = 1:numel(target_indices)
 204           normalised ( index, k ) = normal(target_indices(index));
 205         endfor
 206       endif
 207
 208     end
 209
 210   else
 211     ##With known distributions, everything boils down to a few lines of code
 212
 213     ##The same distribution for all data?
 214     if ( all ( size ( distribution ) == 1 ) )
 215
 216       normalised = norminv ( distribution {1,1} ( data ) );
 217
 218     elseif ( length ( vec ( distribution ) ) == c )
 219
 220       for k = 1 : c
 221
 222         normalised ( :, k ) = norminv ( distribution { k } ( data ) ( :, k ) );
 223       end
 224
 225     else
 226       error ( "Number of distributions does not match data size! ")
 227
 228     end
 229   end
 230
 231   if ( trp )
 232
 233     normalised = normalised';
 234   end
 235
 236 endfunction
 237
 238 %!test
 239 %! v = normalise_distribution ( [ 1 2 3 ], [], 1 );
 240 %! assert ( v, [ 0 0 0 ] )
 241
 242 %!test
 243 %! v = normalise_distribution ( [ 1 2 3 ], [], 2 );
 244 %! assert ( v, norminv ( [ 1 3 5 ] / 6 ),  3 * eps )
 245
 246 %!test
 247 %! v = normalise_distribution ( [ 1 2 3 ]', [], 2 );
 248 %! assert ( v, [ 0 0 0 ]' )
 249
 250 %!test
 251 %! v = normalise_distribution ( [ 1 2 3 ]' , [], 1 );
 252 %! assert ( v, norminv ( [ 1 3 5 ]' / 6 ), 3 * eps )
 253
 254 %!test
 255 %! v = normalise_distribution ( [ 1 1 2 2 3 3 ], [], 2 );
 256 %! assert ( v, norminv ( [ 3 3 7 7 11 11 ] / 12 ),  3 * eps )
 257
 258 %!test
 259 %! v = normalise_distribution ( [ 1 1 2 2 3 3 ]', [], 1 );
 260 %! assert ( v, norminv ( [ 3 3 7 7 11 11 ]' / 12 ),  3 * eps )
 261
 262 %!test
 263 %! A = randn ( 10 );
 264 %! N = normalise_distribution ( A, @normcdf );
 265 %! assert ( A, N, 1000 * eps )
 266
 267 %!xtest
 268 %! A = exprnd ( 1, 100 );
 269 %! N = normalise_distribution ( A, @ ( x ) ( expcdf ( x, 1 ) ) );
 270 %! assert ( mean ( vec ( N ) ), 0, 0.1 )
 271 %! assert ( std ( vec ( N ) ), 1, 0.1 )
 272
 273 %!xtest
 274 %! A = rand (1000,1);
 275 %! N = normalise_distribution  ( A, "unifcdf" );
 276 %! assert ( mean ( vec ( N ) ), 0, 0.1 )
 277 %! assert ( std ( vec ( N ) ), 1, 0.1 )
 278
 279 %!xtest
 280 %! A = [rand(1000,1), randn( 1000, 1)];
 281 %! N = normalise_distribution  ( A, { "unifcdf", "normcdf" } );
 282 %! assert ( mean ( N ), [ 0, 0 ], 0.1 )
 283 %! assert ( std ( N ), [ 1, 1 ], 0.1 )
 284
 285 %!xtest
 286 %! A = [rand(1000,1), randn( 1000, 1), exprnd( 1, 1000, 1 )]';
 287 %! N = normalise_distribution  ( A, { @unifcdf; @normcdf;  @( x )( expcdf ( x, 1 ) ) }, 2 );
 288 %! assert ( mean ( N, 2 ), [ 0, 0, 0 ]', 0.1 )
 289 %! assert ( std ( N, [], 2 ), [ 1, 1, 1 ]', 0.1 )
 290
 291 %!xtest
 292 %! A = exprnd ( 1, 1000, 9 ); A ( 300 : 500, 4:6 ) = 17;
 293 %! N = normalise_distribution ( A );
 294 %! assert ( mean ( N ), [ 0 0 0 0.38 0.38 0.38 0 0 0 ], 0.1 );
 295 %! assert ( var ( N ), [ 1 1 1 2.59 2.59 2.59 1 1 1 ], 0.1 );
 296
 297 %!test
 298 %! fail ("normalise_distribution( zeros ( 3, 4 ), { @unifcdf; @normcdf; @( x )( expcdf ( x, 1 ) ) } )", ...
 299 %! "Number of distributions does not match data size!");