octave_packages/statistics-1.1.3/pdist.m

   1 ## Copyright (C) 2008 Francesco Potortì <pot@gnu.org>
   2 ##
   3 ## This program is free software; you can redistribute it and/or modify it under
   4 ## the terms of the GNU General Public License as published by the Free Software
   5 ## Foundation; either version 3 of the License, or (at your option) any later
   6 ## version.
   7 ##
   8 ## This program is distributed in the hope that it will be useful, but WITHOUT
   9 ## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 ## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  11 ## details.
  12 ##
  13 ## You should have received a copy of the GNU General Public License along with
  14 ## this program; if not, see <http://www.gnu.org/licenses/>.
  15
  16 ## -*- texinfo -*-
  17 ## @deftypefn {Function File} {@var{y} =} pdist (@var{x})
  18 ## @deftypefnx {Function File} {@var{y} =} pdist (@var{x}, @var{metric})
  19 ## @deftypefnx {Function File} {@var{y} =} pdist (@var{x}, @var{metric}, @var{metricarg}, @dots{})
  20 ##
  21 ## Return the distance between any two rows in @var{x}.
  22 ##
  23 ## @var{x} is the @var{n}x@var{d} matrix representing @var{q} row
  24 ## vectors of size @var{d}.
  25 ##
  26 ## The output is a dissimilarity matrix formatted as a row vector
  27 ## @var{y}, @math{(n-1)*n/2} long, where the distances are in
  28 ## the order [(1, 2) (1, 3) @dots{} (2, 3) @dots{} (n-1, n)].  You can
  29 ## use the @code{squareform} function to display the distances between
  30 ## the vectors arranged into an @var{n}x@var{n} matrix.
  31 ##
  32 ## @code{metric} is an optional argument specifying how the distance is
  33 ## computed. It can be any of the following ones, defaulting to
  34 ## "euclidean", or a user defined function that takes two arguments
  35 ## @var{x} and @var{y} plus any number of optional arguments,
  36 ## where @var{x} is a row vector and and @var{y} is a matrix having the
  37 ## same number of columns as @var{x}.  @code{metric} returns a column
  38 ## vector where row @var{i} is the distance between @var{x} and row
  39 ## @var{i} of @var{y}. Any additional arguments after the @code{metric}
  40 ## are passed as metric (@var{x}, @var{y}, @var{metricarg1},
  41 ## @var{metricarg2} @dots{}).
  42 ##
  43 ## Predefined distance functions are:
  44 ##
  45 ## @table @samp
  46 ## @item "euclidean"
  47 ## Euclidean distance (default).
  48 ##
  49 ## @item "seuclidean"
  50 ## Standardized Euclidean distance. Each coordinate in the sum of
  51 ## squares is inverse weighted by the sample variance of that
  52 ## coordinate.
  53 ##
  54 ## @item "mahalanobis"
  55 ## Mahalanobis distance: see the function mahalanobis.
  56 ##
  57 ## @item "cityblock"
  58 ## City Block metric, aka Manhattan distance.
  59 ##
  60 ## @item "minkowski"
  61 ## Minkowski metric.  Accepts a numeric parameter @var{p}: for @var{p}=1
  62 ## this is the same as the cityblock metric, with @var{p}=2 (default) it
  63 ## is equal to the euclidean metric.
  64 ##
  65 ## @item "cosine"
  66 ## One minus the cosine of the included angle between rows, seen as
  67 ## vectors.
  68 ##
  69 ## @item "correlation"
  70 ## One minus the sample correlation between points (treated as
  71 ## sequences of values).
  72 ##
  73 ## @item "spearman"
  74 ## One minus the sample Spearman's rank correlation between
  75 ## observations, treated as sequences of values.
  76 ##
  77 ## @item "hamming"
  78 ## Hamming distance: the quote of the number of coordinates that differ.
  79 ##
  80 ## @item "jaccard"
  81 ## One minus the Jaccard coefficient, the quote of nonzero
  82 ## coordinates that differ.
  83 ##
  84 ## @item "chebychev"
  85 ## Chebychev distance: the maximum coordinate difference.
  86 ## @end table
  87 ## @seealso{linkage, mahalanobis, squareform}
  88 ## @end deftypefn
  89
  90 ## Author: Francesco Potortì  <pot@gnu.org>
  91
  92 function y = pdist (x, metric, varargin)
  93
  94   if (nargin < 1)
  95     print_usage ();
  96   elseif ((nargin > 1)
  97           && ! ischar (metric)
  98           && ! isa (metric, "function_handle"))
  99     error (["pdist: the distance function must be either a string or a "
 100             "function handle."]);
 101   endif
 102
 103   if (nargin < 2)
 104     metric = "euclidean";
 105   endif
 106
 107   if (! ismatrix (x) || isempty (x))
 108     error ("pdist: x must be a nonempty matrix");
 109   elseif (length (size (x)) > 2)
 110     error ("pdist: x must be 1 or 2 dimensional");
 111   endif
 112
 113   y = [];
 114   if (rows(x) == 1)
 115     return;
 116   endif
 117
 118   if (ischar (metric))
 119     order = nchoosek(1:rows(x),2);
 120     Xi = order(:,1);
 121     Yi = order(:,2);
 122     X = x';
 123     metric = lower (metric);
 124     switch (metric)
 125       case "euclidean"
 126         d = X(:,Xi) - X(:,Yi);
 127         if (str2num(version()(1:3)) > 3.1)
 128           y = norm (d, "cols");
 129         else
 130           y = sqrt (sumsq (d, 1));
 131         endif
 132
 133       case "seuclidean"
 134         d = X(:,Xi) - X(:,Yi);
 135         weights = inv (diag (var (x, 0, 1)));
 136         y = sqrt (sum ((weights * d) .* d, 1));
 137
 138       case "mahalanobis"
 139         d = X(:,Xi) - X(:,Yi);
 140         weights = inv (cov (x));
 141         y = sqrt (sum ((weights * d) .* d, 1));
 142
 143       case "cityblock"
 144         d = X(:,Xi) - X(:,Yi);
 145         if (str2num(version()(1:3)) > 3.1)
 146           y = norm (d, 1, "cols");
 147         else
 148           y = sum (abs (d), 1);
 149         endif
 150
 151       case "minkowski"
 152         d = X(:,Xi) - X(:,Yi);
 153         p = 2;                  # default
 154         if (nargin > 2)
 155           p = varargin{1};      # explicitly assigned
 156         endif;
 157         if (str2num(version()(1:3)) > 3.1)
 158           y = norm (d, p, "cols");
 159         else
 160           y = (sum ((abs (d)).^p, 1)).^(1/p);
 161         endif
 162
 163       case "cosine"
 164         prod = X(:,Xi) .* X(:,Yi);
 165         weights = sumsq (X(:,Xi), 1) .* sumsq (X(:,Yi), 1);
 166         y = 1 - sum (prod, 1) ./ sqrt (weights);
 167
 168       case "correlation"
 169         if (rows(X) == 1)
 170           error ("pdist: correlation distance between scalars not defined")
 171         endif
 172         corr = cor (X);
 173         y = 1 - corr (sub2ind (size (corr), Xi, Yi))';
 174
 175       case "spearman"
 176         if (rows(X) == 1)
 177           error ("pdist: spearman distance between scalars not defined")
 178         endif
 179         corr = spearman (X);
 180         y = 1 - corr (sub2ind (size (corr), Xi, Yi))';
 181
 182       case "hamming"
 183         d = logical (X(:,Xi) - X(:,Yi));
 184         y = sum (d, 1) / rows (X);
 185
 186       case "jaccard"
 187         d = logical (X(:,Xi) - X(:,Yi));
 188         weights = X(:,Xi) | X(:,Yi);
 189         y = sum (d & weights, 1) ./ sum (weights, 1);
 190
 191       case "chebychev"
 192         d = X(:,Xi) - X(:,Yi);
 193         if (str2num(version()(1:3)) > 3.1)
 194           y = norm (d, Inf, "cols");
 195         else
 196           y = max (abs (d), [], 1);
 197         endif
 198
 199     endswitch
 200   endif
 201
 202   if (isempty (y))
 203     ## Metric is a function handle or the name of an external function
 204     l = rows (x);
 205     y = zeros (1, nchoosek (l, 2));
 206     idx = 1;
 207     for ii = 1:l-1
 208       for jj = ii+1:l
 209         y(idx++) = feval (metric, x(ii,:), x, varargin{:})(jj);
 210       endfor
 211     endfor
 212   endif
 213
 214 endfunction
 215
 216 %!shared xy, t, eucl
 217 %! xy = [0 1; 0 2; 7 6; 5 6];
 218 %! t = 1e-3;
 219 %! eucl = @(v,m) sqrt(sumsq(repmat(v,rows(m),1)-m,2));
 220 %!assert(pdist(xy),              [1.000 8.602 7.071 8.062 6.403 2.000],t);
 221 %!assert(pdist(xy,eucl),         [1.000 8.602 7.071 8.062 6.403 2.000],t);
 222 %!assert(pdist(xy,"euclidean"),  [1.000 8.602 7.071 8.062 6.403 2.000],t);
 223 %!assert(pdist(xy,"seuclidean"), [0.380 2.735 2.363 2.486 2.070 0.561],t);
 224 %!assert(pdist(xy,"mahalanobis"),[1.384 1.967 2.446 2.384 1.535 2.045],t);
 225 %!assert(pdist(xy,"cityblock"),  [1.000 12.00 10.00 11.00 9.000 2.000],t);
 226 %!assert(pdist(xy,"minkowski"),  [1.000 8.602 7.071 8.062 6.403 2.000],t);
 227 %!assert(pdist(xy,"minkowski",3),[1.000 7.763 6.299 7.410 5.738 2.000],t);
 228 %!assert(pdist(xy,"cosine"),     [0.000 0.349 0.231 0.349 0.231 0.013],t);
 229 %!assert(pdist(xy,"correlation"),[0.000 2.000 0.000 2.000 0.000 2.000],t);
 230 %!assert(pdist(xy,"spearman"),   [0.000 2.000 0.000 2.000 0.000 2.000],t);
 231 %!assert(pdist(xy,"hamming"),    [0.500 1.000 1.000 1.000 1.000 0.500],t);
 232 %!assert(pdist(xy,"jaccard"),    [1.000 1.000 1.000 1.000 1.000 0.500],t);
 233 %!assert(pdist(xy,"chebychev"),  [1.000 7.000 5.000 7.000 5.000 2.000],t);