octave_packages/m/statistics/base/quantile.m

   1 ## Copyright (C) 2008-2012 Ben Abbott and Jaroslav Hajek
   2 ##
   3 ## This file is part of Octave.
   4 ##
   5 ## Octave is free software; you can redistribute it and/or modify it
   6 ## under the terms of the GNU General Public License as published by
   7 ## the Free Software Foundation; either version 3 of the License, or (at
   8 ## your option) any later version.
   9 ##
  10 ## Octave is distributed in the hope that it will be useful, but
  11 ## WITHOUT ANY WARRANTY; without even the implied warranty of
  12 ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13 ## General Public License for more details.
  14 ##
  15 ## You should have received a copy of the GNU General Public License
  16 ## along with Octave; see the file COPYING.  If not, see
  17 ## <http://www.gnu.org/licenses/>.
  18
  19 ## -*- texinfo -*-
  20 ## @deftypefn  {Function File} {@var{q} =} quantile (@var{x}, @var{p})
  21 ## @deftypefnx {Function File} {@var{q} =} quantile (@var{x}, @var{p}, @var{dim})
  22 ## @deftypefnx {Function File} {@var{q} =} quantile (@var{x}, @var{p}, @var{dim}, @var{method})
  23 ## For a sample, @var{x}, calculate the quantiles, @var{q}, corresponding to
  24 ## the cumulative probability values in @var{p}.  All non-numeric values (NaNs)
  25 ## of @var{x} are ignored.
  26 ##
  27 ## If @var{x} is a matrix, compute the quantiles for each column and
  28 ## return them in a matrix, such that the i-th row of @var{q} contains
  29 ## the @var{p}(i)th quantiles of each column of @var{x}.
  30 ##
  31 ## The optional argument @var{dim} determines the dimension along which
  32 ## the quantiles are calculated.  If @var{dim} is omitted, and @var{x} is
  33 ## a vector or matrix, it defaults to 1 (column-wise quantiles).  If
  34 ## @var{x} is an N-D array, @var{dim} defaults to the first non-singleton
  35 ## dimension.
  36 ##
  37 ## The methods available to calculate sample quantiles are the nine methods
  38 ## used by R (http://www.r-project.org/).  The default value is METHOD = 5.
  39 ##
  40 ## Discontinuous sample quantile methods 1, 2, and 3
  41 ##
  42 ## @enumerate 1
  43 ## @item Method 1: Inverse of empirical distribution function.
  44 ##
  45 ## @item Method 2: Similar to method 1 but with averaging at discontinuities.
  46 ##
  47 ## @item Method 3: SAS definition: nearest even order statistic.
  48 ## @end enumerate
  49 ##
  50 ## Continuous sample quantile methods 4 through 9, where p(k) is the linear
  51 ## interpolation function respecting each method's representative cdf.
  52 ##
  53 ## @enumerate 4
  54 ## @item Method 4: p(k) = k / n. That is, linear interpolation of the
  55 ## empirical cdf.
  56 ##
  57 ## @item Method 5: p(k) = (k - 0.5) / n. That is a piecewise linear function
  58 ## where the knots are the values midway through the steps of the empirical
  59 ## cdf.
  60 ##
  61 ## @item Method 6: p(k) = k / (n + 1).
  62 ##
  63 ## @item Method 7: p(k) = (k - 1) / (n - 1).
  64 ##
  65 ## @item Method 8: p(k) = (k - 1/3) / (n + 1/3).  The resulting quantile
  66 ## estimates are approximately median-unbiased regardless of the distribution
  67 ## of @var{x}.
  68 ##
  69 ## @item Method 9: p(k) = (k - 3/8) / (n + 1/4).  The resulting quantile
  70 ## estimates are approximately unbiased for the expected order statistics if
  71 ## @var{x} is normally distributed.
  72 ## @end enumerate
  73 ##
  74 ## Hyndman and Fan (1996) recommend method 8.  Maxima, S, and R
  75 ## (versions prior to 2.0.0) use 7 as their default.  Minitab and SPSS
  76 ## use method 6.  @sc{matlab} uses method 5.
  77 ##
  78 ## References:
  79 ##
  80 ## @itemize @bullet
  81 ## @item Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New
  82 ## S Language.  Wadsworth & Brooks/Cole.
  83 ##
  84 ## @item Hyndman, R. J. and Fan, Y. (1996) Sample quantiles in
  85 ## statistical packages, American Statistician, 50, 361--365.
  86 ##
  87 ## @item R: A Language and Environment for Statistical Computing;
  88 ## @url{http://cran.r-project.org/doc/manuals/fullrefman.pdf}.
  89 ## @end itemize
  90 ##
  91 ## Examples:
  92 ## @c Set example in small font to prevent overfull line
  93 ##
  94 ## @smallexample
  95 ## @group
  96 ## x = randi (1000, [10, 1]);  # Create empirical data in range 1-1000
  97 ## q = quantile (x, [0, 1]);   # Return minimum, maximum of distribution
  98 ## q = quantile (x, [0.25 0.5 0.75]); # Return quartiles of distribution
  99 ## @end group
 100 ## @end smallexample
 101 ## @seealso{prctile}
 102 ## @end deftypefn
 103
 104 ## Author: Ben Abbott <bpabbott@mac.com>
 105 ## Description: Matlab style quantile function of a discrete/continuous distribution
 106
 107 function q = quantile (x, p = [], dim = 1, method = 5)
 108
 109   if (nargin < 1 || nargin > 4)
 110     print_usage ();
 111   endif
 112
 113   if (! (isnumeric (x) || islogical (x)))
 114     error ("quantile: X must be a numeric vector or matrix");
 115   endif
 116
 117   if (isempty (p))
 118     p = [0.00 0.25, 0.50, 0.75, 1.00];
 119   endif
 120
 121   if (! (isnumeric (p) && isvector (p)))
 122     error ("quantile: P must be a numeric vector");
 123   endif
 124
 125   if (!(isscalar (dim) && dim == fix (dim))
 126       || !(1 <= dim && dim <= ndims (x)))
 127     error ("quantile: DIM must be an integer and a valid dimension");
 128   endif
 129
 130   ## Set the permutation vector.
 131   perm = 1:ndims(x);
 132   perm(1) = dim;
 133   perm(dim) = 1;
 134
 135   ## Permute dim to the 1st index.
 136   x = permute (x, perm);
 137
 138   ## Save the size of the permuted x N-d array.
 139   sx = size (x);
 140
 141   ## Reshape to a 2-d array.
 142   x = reshape (x, [sx(1), prod(sx(2:end))]);
 143
 144   ## Calculate the quantiles.
 145   q = __quantile__ (x, p, method);
 146
 147   ## Return the shape to the original N-d array.
 148   q = reshape (q, [numel(p), sx(2:end)]);
 149
 150   ## Permute the 1st index back to dim.
 151   q = ipermute (q, perm);
 152
 153 endfunction
 154
 155
## The median (P = 0.5) of 11 sorted samples is the 6th one; checked
## column-wise (default DIM) and row-wise (DIM = 2) on the transpose.
%!test
%! p = 0.5;
%! x = sort (rand (11));
%! q = quantile (x, p);
%! assert (q, x(6,:))
%! x = x.';
%! q = quantile (x, p, 2);
%! assert (q, x(:,6));

## In each of the following six tests, row m of the table "a" holds the
## expected quantiles of X at the probabilities in P for METHOD = m, and
## all nine methods are checked to a tolerance of 1e-4.

## Four evenly spaced samples.
%!test
%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
%! x = [1; 2; 3; 4];
%! a = [1.0000   1.0000   2.0000   3.0000   4.0000
%!      1.0000   1.5000   2.5000   3.5000   4.0000
%!      1.0000   1.0000   2.0000   3.0000   4.0000
%!      1.0000   1.0000   2.0000   3.0000   4.0000
%!      1.0000   1.5000   2.5000   3.5000   4.0000
%!      1.0000   1.2500   2.5000   3.7500   4.0000
%!      1.0000   1.7500   2.5000   3.2500   4.0000
%!      1.0000   1.4167   2.5000   3.5833   4.0000
%!      1.0000   1.4375   2.5000   3.5625   4.0000];
%! for m = (1:9)
%!   q = quantile (x, p, 1, m).';
%!   assert (q, a(m,:), 0.0001)
%! endfor

## Five evenly spaced samples.
%!test
%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
%! x = [1; 2; 3; 4; 5];
%! a = [1.0000   2.0000   3.0000   4.0000   5.0000
%!      1.0000   2.0000   3.0000   4.0000   5.0000
%!      1.0000   1.0000   2.0000   4.0000   5.0000
%!      1.0000   1.2500   2.5000   3.7500   5.0000
%!      1.0000   1.7500   3.0000   4.2500   5.0000
%!      1.0000   1.5000   3.0000   4.5000   5.0000
%!      1.0000   2.0000   3.0000   4.0000   5.0000
%!      1.0000   1.6667   3.0000   4.3333   5.0000
%!      1.0000   1.6875   3.0000   4.3125   5.0000];
%! for m = (1:9)
%!   q = quantile (x, p, 1, m).';
%!   assert (q, a(m,:), 0.0001)
%! endfor

## Four unevenly spaced samples.
%!test
%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
%! x = [1; 2; 5; 9];
%! a = [1.0000   1.0000   2.0000   5.0000   9.0000
%!      1.0000   1.5000   3.5000   7.0000   9.0000
%!      1.0000   1.0000   2.0000   5.0000   9.0000
%!      1.0000   1.0000   2.0000   5.0000   9.0000
%!      1.0000   1.5000   3.5000   7.0000   9.0000
%!      1.0000   1.2500   3.5000   8.0000   9.0000
%!      1.0000   1.7500   3.5000   6.0000   9.0000
%!      1.0000   1.4167   3.5000   7.3333   9.0000
%!      1.0000   1.4375   3.5000   7.2500   9.0000];
%! for m = (1:9)
%!   q = quantile (x, p, 1, m).';
%!   assert (q, a(m,:), 0.0001)
%! endfor

## Five unevenly spaced samples.
%!test
%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
%! x = [1; 2; 5; 9; 11];
%! a = [1.0000    2.0000    5.0000    9.0000   11.0000
%!      1.0000    2.0000    5.0000    9.0000   11.0000
%!      1.0000    1.0000    2.0000    9.0000   11.0000
%!      1.0000    1.2500    3.5000    8.0000   11.0000
%!      1.0000    1.7500    5.0000    9.5000   11.0000
%!      1.0000    1.5000    5.0000   10.0000   11.0000
%!      1.0000    2.0000    5.0000    9.0000   11.0000
%!      1.0000    1.6667    5.0000    9.6667   11.0000
%!      1.0000    1.6875    5.0000    9.6250   11.0000];
%! for m = (1:9)
%!   q = quantile (x, p, 1, m).';
%!   assert (q, a(m,:), 0.0001)
%! endfor

## Ten unsorted integer-valued samples containing repeated values.
%!test
%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
%! x = [16; 11; 15; 12; 15;  8; 11; 12;  6; 10];
%! a = [6.0000   10.0000   11.0000   15.0000   16.0000
%!      6.0000   10.0000   11.5000   15.0000   16.0000
%!      6.0000    8.0000   11.0000   15.0000   16.0000
%!      6.0000    9.0000   11.0000   13.5000   16.0000
%!      6.0000   10.0000   11.5000   15.0000   16.0000
%!      6.0000    9.5000   11.5000   15.0000   16.0000
%!      6.0000   10.2500   11.5000   14.2500   16.0000
%!      6.0000    9.8333   11.5000   15.0000   16.0000
%!      6.0000    9.8750   11.5000   15.0000   16.0000];
%! for m = (1:9)
%!   q = quantile (x, p, 1, m).';
%!   assert (q, a(m,:), 0.0001)
%! endfor

## Ten unsorted real-valued samples of mixed sign.  The inputs are written
## with fewer digits than the expected values (e.g. -2.551500 versus
## -2.551474); the differences stay inside the 1e-4 tolerance.
%!test
%! p = [0.00, 0.25, 0.50, 0.75, 1.00];
%! x = [-0.58851;  0.40048;  0.49527; -2.551500; -0.52057; ...
%!      -0.17841; 0.057322; -0.62523;  0.042906;  0.12337];
%! a = [-2.551474  -0.588505  -0.178409   0.123366   0.495271
%!      -2.551474  -0.588505  -0.067751   0.123366   0.495271
%!      -2.551474  -0.625231  -0.178409   0.123366   0.495271
%!      -2.551474  -0.606868  -0.178409   0.090344   0.495271
%!      -2.551474  -0.588505  -0.067751   0.123366   0.495271
%!      -2.551474  -0.597687  -0.067751   0.192645   0.495271
%!      -2.551474  -0.571522  -0.067751   0.106855   0.495271
%!      -2.551474  -0.591566  -0.067751   0.146459   0.495271
%!      -2.551474  -0.590801  -0.067751   0.140686   0.495271];
%! for m = (1:9)
%!   q = quantile (x, p, 1, m).';
%!   assert (q, a(m,:), 0.0001)
%! endfor

## NaN entries are ignored: as NaNs are inserted one at a time, the median
## of each affected column is taken over its remaining samples only.
%!test
%! p = 0.5;
%! x = [0.112600, 0.114800, 0.052100, 0.236400, 0.139300
%!      0.171800, 0.727300, 0.204100, 0.453100, 0.158500
%!      0.279500, 0.797800, 0.329600, 0.556700, 0.730700
%!      0.428800, 0.875300, 0.647700, 0.628700, 0.816500
%!      0.933100, 0.931200, 0.963500, 0.779600, 0.846100];
%! tol = 0.00001;
%! x(5,5) = NaN;
%! assert (quantile(x, p, 1), [0.27950, 0.79780, 0.32960, 0.55670, 0.44460], tol);
%! x(1,1) = NaN;
%! assert (quantile(x, p, 1), [0.35415, 0.79780, 0.32960, 0.55670, 0.44460], tol);
%! x(3,3) = NaN;
%! assert (quantile(x, p, 1), [0.35415, 0.79780, 0.42590, 0.55670, 0.44460], tol);

## N-D input: the 0.5 quantile along DIM = 2 of a 2x3x4 array must equal
## median along the same dimension.
%!test
%! sx = [2, 3, 4];
%! x = rand (sx);
%! dim = 2;
%! p = 0.5;
%! yobs = quantile (x, p, dim);
%! yexp = median (x, dim);
%! assert (yobs, yexp);

%% Test input validation
## In order: no arguments; too many arguments; character X; logical P;
## matrix P; non-integer DIM; DIM below 1; DIM above ndims (X); METHOD
## below 1; METHOD above 9.
%!error quantile ()
%!error quantile (1, 2, 3, 4, 5)
%!error quantile (['A'; 'B'], 10)
%!error quantile (1:10, [true, false])
%!error quantile (1:10, ones (2,2))
%!error quantile (1, 1, 1.5)
%!error quantile (1, 1, 0)
%!error quantile (1, 1, 3)
%!error quantile ((1:5)', 0.5, 1, 0)
%!error quantile ((1:5)', 0.5, 1, 10)
 303
 304 ## For the cumulative probability values in @var{p}, compute the
 305 ## quantiles, @var{q} (the inverse of the cdf), for the sample, @var{x}.
 306 ##
 307 ## The optional input, @var{method}, refers to nine methods available in R
 308 ## (http://www.r-project.org/). The default is @var{method} = 7. For more
 309 ## detail, see `help quantile'.
 310 ## @seealso{prctile, quantile, statistics}
 311
 312 ## Author: Ben Abbott <bpabbott@mac.com>
 313 ## Vectorized version: Jaroslav Hajek <highegg@gmail.com>
 314 ## Description: Quantile function of empirical samples
 315
 316 function inv = __quantile__ (x, p, method = 5)
 317
 318   if (nargin < 2 || nargin > 3)
 319     print_usage ();
 320   endif
 321
 322   if (isinteger (x) || islogical (x))
 323     x = double (x);
 324   endif
 325
 326   ## set shape of quantiles to column vector.
 327   p = p(:);
 328
 329   ## Save length and set shape of samples.
 330   ## FIXME: does sort guarantee that NaN's come at the end?
 331   x = sort (x);
 332   m = sum (! isnan (x));
 333   [xr, xc] = size (x);
 334
 335   ## Initialize output values.
 336   inv = Inf (class (x)) * (-(p < 0) + (p > 1));
 337   inv = repmat (inv, 1, xc);
 338
 339   ## Do the work.
 340   if (any (k = find ((p >= 0) & (p <= 1))))
 341     n = length (k);
 342     p = p(k);
 343     ## Special case of 1 row.
 344     if (xr == 1)
 345       inv(k,:) = repmat (x, n, 1);
 346       return;
 347     endif
 348
 349     ## The column-distribution indices.
 350     pcd = kron (ones (n, 1), xr*(0:xc-1));
 351     mm = kron (ones (n, 1), m);
 352     switch (method)
 353       case {1, 2, 3}
 354         switch (method)
 355           case 1
 356             p = max (ceil (kron (p, m)), 1);
 357             inv(k,:) = x(p + pcd);
 358
 359           case 2
 360             p = kron (p, m);
 361             p_lr = max (ceil (p), 1);
 362             p_rl = min (floor (p + 1), mm);
 363             inv(k,:) = (x(p_lr + pcd) + x(p_rl + pcd))/2;
 364
 365           case 3
 366            ## Used by SAS, method PCTLDEF=2.
 367            ## http://support.sas.com/onlinedoc/913/getDoc/en/statug.hlp/stdize_sect14.htm
 368             t = max (kron (p, m), 1);
 369             t = roundb (t);
 370             inv(k,:) = x(t + pcd);
 371         endswitch
 372
 373       otherwise
 374         switch (method)
 375           case 4
 376             p = kron (p, m);
 377
 378           case 5
 379             ## Used by Matlab.
 380             p = kron (p, m) + 0.5;
 381
 382           case 6
 383             ## Used by Minitab and SPSS.
 384             p = kron (p, m+1);
 385
 386           case 7
 387             ## Used by S and R.
 388             p = kron (p, m-1) + 1;
 389
 390           case 8
 391             ## Median unbiased.
 392             p = kron (p, m+1/3) + 1/3;
 393
 394           case 9
 395             ## Approximately unbiased respecting order statistics.
 396             p = kron (p, m+0.25) + 0.375;
 397
 398           otherwise
 399             error ("quantile: Unknown METHOD, '%d'", method);
 400         endswitch
 401
 402         ## Duplicate single values.
 403         imm1 = (mm == 1);
 404         x(2,imm1) = x(1,imm1);
 405
 406         ## Interval indices.
 407         pi = max (min (floor (p), mm-1), 1);
 408         pr = max (min (p - pi, 1), 0);
 409         pi += pcd;
 410         inv(k,:) = (1-pr) .* x(pi) + pr .* x(pi+1);
 411     endswitch
 412   endif
 413
 414 endfunction
 415