1 ## Copyright (C) 2008 Francesco Potortì <pot@gnu.org>
3 ## This program is free software; you can redistribute it and/or modify it under
4 ## the terms of the GNU General Public License as published by the Free Software
5 ## Foundation; either version 3 of the License, or (at your option) any later
8 ## This program is distributed in the hope that it will be useful, but WITHOUT
9 ## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 ## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13 ## You should have received a copy of the GNU General Public License along with
14 ## this program; if not, see <http://www.gnu.org/licenses/>.
17 ## @deftypefn {Function File} {@var{y} =} pdist (@var{x})
18 ## @deftypefnx {Function File} {@var{y} =} pdist (@var{x}, @var{metric})
19 ## @deftypefnx {Function File} {@var{y} =} pdist (@var{x}, @var{metric}, @var{metricarg}, @dots{})
21 ## Return the distance between any two rows in @var{x}.
23 ## @var{x} is the @var{n}x@var{d} matrix representing @var{q} row
24 ## vectors of size @var{d}.
26 ## The output is a dissimilarity matrix formatted as a row vector
27 ## @var{y}, @math{(n-1)*n/2} long, where the distances are in
28 ## the order [(1, 2) (1, 3) @dots{} (2, 3) @dots{} (n-1, n)]. You can
29 ## use the @code{squareform} function to display the distances between
30 ## the vectors arranged into an @var{n}x@var{n} matrix.
32 ## @code{metric} is an optional argument specifying how the distance is
33 ## computed. It can be any of the following ones, defaulting to
34 ## "euclidean", or a user defined function that takes two arguments
35 ## @var{x} and @var{y} plus any number of optional arguments,
36 ## where @var{x} is a row vector and and @var{y} is a matrix having the
37 ## same number of columns as @var{x}. @code{metric} returns a column
38 ## vector where row @var{i} is the distance between @var{x} and row
39 ## @var{i} of @var{y}. Any additional arguments after the @code{metric}
40 ## are passed as metric (@var{x}, @var{y}, @var{metricarg1},
41 ## @var{metricarg2} @dots{}).
43 ## Predefined distance functions are:
47 ## Euclidean distance (default).
50 ## Standardized Euclidean distance. Each coordinate in the sum of
51 ## squares is inverse weighted by the sample variance of that
54 ## @item "mahalanobis"
55 ## Mahalanobis distance: see the function mahalanobis.
58 ## City Block metric, aka Manhattan distance.
61 ## Minkowski metric. Accepts a numeric parameter @var{p}: for @var{p}=1
62 ## this is the same as the cityblock metric, with @var{p}=2 (default) it
63 ## is equal to the euclidean metric.
66 ## One minus the cosine of the included angle between rows, seen as
69 ## @item "correlation"
70 ## One minus the sample correlation between points (treated as
71 ## sequences of values).
74 ## One minus the sample Spearman's rank correlation between
75 ## observations, treated as sequences of values.
78 ## Hamming distance: the quote of the number of coordinates that differ.
81 ## One minus the Jaccard coefficient, the quote of nonzero
82 ## coordinates that differ.
85 ## Chebychev distance: the maximum coordinate difference.
87 ## @seealso{linkage, mahalanobis, squareform}
90 ## Author: Francesco Potortì <pot@gnu.org>
92 function y = pdist (x, metric, varargin)
98 && ! isa (metric, "function_handle"))
99 error (["pdist: the distance function must be either a string or a "
100 "function handle."]);
104 metric = "euclidean";
107 if (! ismatrix (x) || isempty (x))
108 error ("pdist: x must be a nonempty matrix");
109 elseif (length (size (x)) > 2)
110 error ("pdist: x must be 1 or 2 dimensional");
119 order = nchoosek(1:rows(x),2);
123 metric = lower (metric);
126 d = X(:,Xi) - X(:,Yi);
127 if (str2num(version()(1:3)) > 3.1)
128 y = norm (d, "cols");
130 y = sqrt (sumsq (d, 1));
134 d = X(:,Xi) - X(:,Yi);
135 weights = inv (diag (var (x, 0, 1)));
136 y = sqrt (sum ((weights * d) .* d, 1));
139 d = X(:,Xi) - X(:,Yi);
140 weights = inv (cov (x));
141 y = sqrt (sum ((weights * d) .* d, 1));
144 d = X(:,Xi) - X(:,Yi);
145 if (str2num(version()(1:3)) > 3.1)
146 y = norm (d, 1, "cols");
148 y = sum (abs (d), 1);
152 d = X(:,Xi) - X(:,Yi);
155 p = varargin{1}; # explicitly assigned
157 if (str2num(version()(1:3)) > 3.1)
158 y = norm (d, p, "cols");
160 y = (sum ((abs (d)).^p, 1)).^(1/p);
164 prod = X(:,Xi) .* X(:,Yi);
165 weights = sumsq (X(:,Xi), 1) .* sumsq (X(:,Yi), 1);
166 y = 1 - sum (prod, 1) ./ sqrt (weights);
170 error ("pdist: correlation distance between scalars not defined")
173 y = 1 - corr (sub2ind (size (corr), Xi, Yi))';
177 error ("pdist: spearman distance between scalars not defined")
180 y = 1 - corr (sub2ind (size (corr), Xi, Yi))';
183 d = logical (X(:,Xi) - X(:,Yi));
184 y = sum (d, 1) / rows (X);
187 d = logical (X(:,Xi) - X(:,Yi));
188 weights = X(:,Xi) | X(:,Yi);
189 y = sum (d & weights, 1) ./ sum (weights, 1);
192 d = X(:,Xi) - X(:,Yi);
193 if (str2num(version()(1:3)) > 3.1)
194 y = norm (d, Inf, "cols");
196 y = max (abs (d), [], 1);
203 ## Metric is a function handle or the name of an external function
205 y = zeros (1, nchoosek (l, 2));
209 y(idx++) = feval (metric, x(ii,:), x, varargin{:})(jj);
217 %! xy = [0 1; 0 2; 7 6; 5 6];
219 %! eucl = @(v,m) sqrt(sumsq(repmat(v,rows(m),1)-m,2));
220 %!assert(pdist(xy), [1.000 8.602 7.071 8.062 6.403 2.000],t);
221 %!assert(pdist(xy,eucl), [1.000 8.602 7.071 8.062 6.403 2.000],t);
222 %!assert(pdist(xy,"euclidean"), [1.000 8.602 7.071 8.062 6.403 2.000],t);
223 %!assert(pdist(xy,"seuclidean"), [0.380 2.735 2.363 2.486 2.070 0.561],t);
224 %!assert(pdist(xy,"mahalanobis"),[1.384 1.967 2.446 2.384 1.535 2.045],t);
225 %!assert(pdist(xy,"cityblock"), [1.000 12.00 10.00 11.00 9.000 2.000],t);
226 %!assert(pdist(xy,"minkowski"), [1.000 8.602 7.071 8.062 6.403 2.000],t);
227 %!assert(pdist(xy,"minkowski",3),[1.000 7.763 6.299 7.410 5.738 2.000],t);
228 %!assert(pdist(xy,"cosine"), [0.000 0.349 0.231 0.349 0.231 0.013],t);
229 %!assert(pdist(xy,"correlation"),[0.000 2.000 0.000 2.000 0.000 2.000],t);
230 %!assert(pdist(xy,"spearman"), [0.000 2.000 0.000 2.000 0.000 2.000],t);
231 %!assert(pdist(xy,"hamming"), [0.500 1.000 1.000 1.000 1.000 0.500],t);
232 %!assert(pdist(xy,"jaccard"), [1.000 1.000 1.000 1.000 1.000 0.500],t);
233 %!assert(pdist(xy,"chebychev"), [1.000 7.000 5.000 7.000 5.000 2.000],t);