octave_packages/statistics-1.1.3/anderson_darling_test.m

   1 ## Author: Paul Kienzle <pkienzle@users.sf.net>
   2 ## This program is granted to the public domain.
   3
   4 ## -*- texinfo -*-
   5 ## @deftypefn {Function File} {[@var{q}, @var{Asq}, @var{info}] = } = @
   6 ## anderson_darling_test (@var{x}, @var{distribution})
   7 ##
   8 ## Test the hypothesis that @var{x} is selected from the given distribution
   9 ## using the Anderson-Darling test.  If the returned @var{q} is small, reject
  10 ## the hypothesis at the @var{q}*100% level.
  11 ##
  12 ## The Anderson-Darling @math{@var{A}^2} statistic is calculated as follows:
  13 ##
  14 ## @example
  15 ## @iftex
  16 ## A^2_n = -n - \sum_{i=1}^n (2i-1)/n log(z_i (1-z_{n-i+1}))
  17 ## @end iftex
  18 ## @ifnottex
  19 ##               n
  20 ## A^2_n = -n - SUM (2i-1)/n log(@math{z_i} (1 - @math{z_@{n-i+1@}}))
  21 ##              i=1
  22 ## @end ifnottex
  23 ## @end example
  24 ##
  25 ## where @math{z_i} is the ordered position of the @var{x}'s in the CDF of the
  26 ## distribution.  Unlike the Kolmogorov-Smirnov statistic, the
  27 ## Anderson-Darling statistic is sensitive to the tails of the
  28 ## distribution.
  29 ##
  30 ## The @var{distribution} argument must be a either @t{"uniform"}, @t{"normal"},
  31 ## or @t{"exponential"}.
  32 ##
  33 ## For @t{"normal"}' and @t{"exponential"} distributions, estimate the
  34 ## distribution parameters from the data, convert the values
  35 ## to CDF values, and compare the result to tabluated critical
  36 ## values.  This includes an correction for small @var{n} which
  37 ## works well enough for @var{n} >= 8, but less so from smaller @var{n}.  The
  38 ## returned @code{info.Asq_corrected} contains the adjusted statistic.
  39 ##
  40 ## For @t{"uniform"}, assume the values are uniformly distributed
  41 ## in (0,1), compute @math{@var{A}^2} and return the corresponding @math{p}-value from
  42 ## @code{1-anderson_darling_cdf(A^2,n)}.
  43 ##
  44 ## If you are selecting from a known distribution, convert your
  45 ## values into CDF values for the distribution and use @t{"uniform"}.
  46 ## Do not use @t{"uniform"} if the distribution parameters are estimated
  47 ## from the data itself, as this sharply biases the @math{A^2} statistic
  48 ## toward smaller values.
  49 ##
  50 ## [1] Stephens, MA; (1986), "Tests based on EDF statistics", in
  51 ## D'Agostino, RB; Stephens, MA; (eds.) Goodness-of-fit Techinques.
  52 ## New York: Dekker.
  53 ##
  54 ## @seealso{anderson_darling_cdf}
  55 ## @end deftypefn
  56
  57 function [q,Asq,info] = anderson_darling_test(x,dist)
  58
  59   if size(x,1) == 1, x=x(:); end
  60   x = sort(x);
  61   n = size(x,1);
  62   use_cdf = 0;
  63
  64   # Compute adjustment and critical values to use for stats.
  65   switch dist
  66     case 'normal',
  67       # This expression for adj is used in R.
  68       # Note that the values from NIST dataplot don't work nearly as well.
  69       adj = 1 + (.75 + 2.25/n)/n;
  70       qvals = [ 0.1, 0.05, 0.025, 0.01 ];
  71       Acrit = [ 0.631, 0.752, 0.873, 1.035];
  72       x = stdnormal_cdf(zscore(x));
  73
  74     case 'uniform',
  75       ## Put invalid data at the limits of the distribution
  76       ## This will drive the statistic to infinity.
  77       x(x<0) = 0;
  78       x(x>1) = 1;
  79       adj = 1.;
  80       qvals = [ 0.1, 0.05, 0.025, 0.01 ];
  81       Acrit = [ 1.933, 2.492, 3.070, 3.857 ];
  82       use_cdf = 1;
  83
  84     case 'XXXweibull',
  85       adj = 1 + 0.2/sqrt(n);
  86       qvals = [ 0.1, 0.05, 0.025, 0.01 ];
  87       Acrit = [ 0.637, 0.757, 0.877, 1.038];
  88       ## XXX FIXME XXX how to fit alpha and sigma?
  89       x = wblcdf (x, ones(n,1)*sigma, ones(n,1)*alpha);
  90
  91     case 'exponential',
  92       adj = 1 + 0.6/n;
  93       qvals = [ 0.1, 0.05, 0.025, 0.01 ];
  94       # Critical values depend on n.  Choose the appropriate critical set.
  95       # These values come from NIST dataplot/src/dp8.f.
  96       Acritn = [
  97                   0, 1.022, 1.265, 1.515, 1.888
  98                  11, 1.045, 1.300, 1.556, 1.927;
  99                  21, 1.062, 1.323, 1.582, 1.945;
 100                  51, 1.070, 1.330, 1.595, 1.951;
 101                 101, 1.078, 1.341, 1.606, 1.957;
 102                 ];
 103       # FIXME: consider interpolating in the critical value table.
 104       Acrit = Acritn(lookup(Acritn(:,1),n),2:5);
 105
 106       lambda = 1./mean(x);  # exponential parameter estimation
 107       x = expcdf(x, 1./(ones(n,1)*lambda));
 108
 109     otherwise
 110       # FIXME consider implementing more of distributions; a number
 111       # of them are defined in NIST dataplot/src/dp8.f.
 112       error("Anderson-Darling test for %s not implemented", dist);
 113   endswitch
 114
 115   if any(x<0 | x>1)
 116     error('Anderson-Darling test requires data in CDF form');
 117   endif
 118
 119   i = [1:n]'*ones(1,size(x,2));
 120   Asq = -n - sum( (2*i-1) .* (log(x) + log(1-x(n:-1:1,:))) )/n;
 121
 122   # Lookup adjusted critical value in the cdf (if uniform) or in the
 123   # the critical table.
 124   if use_cdf
 125     q = 1-anderson_darling_cdf(Asq*adj, n);
 126   else
 127     idx = lookup([-Inf,Acrit],Asq*adj);
 128     q = [1,qvals](idx);
 129   endif
 130
 131   if nargout > 2,
 132     info.Asq = Asq;
 133     info.Asq_corrected = Asq*adj;
 134     info.Asq_critical = [100*(1-qvals); Acrit]';
 135     info.p = 1-q;
 136     info.p_is_precise = use_cdf;
 137   endif
 138 endfunction
 139
 140 %!demo
 141 %! c = anderson_darling_test(10*rande(12,10000),'exponential');
 142 %! tabulate(100*c,100*[unique(c),1]);
 143 %! % The Fc column should report 100, 250, 500, 1000, 10000 more or less.
 144
 145 %!demo
 146 %! c = anderson_darling_test(randn(12,10000),'normal');
 147 %! tabulate(100*c,100*[unique(c),1]);
 148 %! % The Fc column should report 100, 250, 500, 1000, 10000 more or less.
 149
 150 %!demo
 151 %! c = anderson_darling_test(rand(12,10000),'uniform');
 152 %! hist(100*c,1:2:99);
 153 %! % The histogram should be flat more or less.