X-Git-Url: https://git.creatis.insa-lyon.fr/pubgit/?p=CreaPhase.git;a=blobdiff_plain;f=octave_packages%2Fstatistics-1.1.3%2Flinkage.m;fp=octave_packages%2Fstatistics-1.1.3%2Flinkage.m;h=84dadb60fad346c797fa827bc861f6abc64007f2;hp=0000000000000000000000000000000000000000;hb=c880e8788dfc484bf23ce13fa2787f2c6bca4863;hpb=1705066eceaaea976f010f669ce8e972f3734b05

diff --git a/octave_packages/statistics-1.1.3/linkage.m b/octave_packages/statistics-1.1.3/linkage.m
new file mode 100644
index 0000000..84dadb6
--- /dev/null
+++ b/octave_packages/statistics-1.1.3/linkage.m
@@ -0,0 +1,231 @@
+## Copyright (C) 2008 Francesco Potortì
+##
+## This program is free software; you can redistribute it and/or modify it under
+## the terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 3 of the License, or (at your option) any later
+## version.
+##
+## This program is distributed in the hope that it will be useful, but WITHOUT
+## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+## FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+## details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, see <http://www.gnu.org/licenses/>.
+
+## -*- texinfo -*-
+## @deftypefn {Function File} {@var{y} =} linkage (@var{d})
+## @deftypefnx {Function File} {@var{y} =} linkage (@var{d}, @var{method})
+## @deftypefnx {Function File} @
+## {@var{y} =} linkage (@var{x}, @var{method}, @var{metric})
+## @deftypefnx {Function File} @
+## {@var{y} =} linkage (@var{x}, @var{method}, @var{arglist})
+##
+## Produce a hierarchical clustering dendrogram.
+##
+## @var{d} is the dissimilarity matrix relative to @var{n} observations,
+## formatted as a @math{(n-1)*n/2}x1 vector as produced by @code{pdist}.
+## Alternatively, @var{x} contains data formatted for input to
+## @code{pdist}, @var{metric} is a metric for @code{pdist} and
+## @var{arglist} is a cell array containing arguments that are passed to
+## @code{pdist}.
+##
+## @code{linkage} starts by putting each observation into a singleton
+## cluster and numbering those from 1 to @var{n}.  Then it merges two
+## clusters, chosen according to @var{method}, to create a new cluster
+## numbered @var{n+1}, and so on until all observations are grouped into
+## a single cluster numbered @var{2*n-1}.  Row @var{m} of the
+## @math{(n-1)}x3 output matrix relates to cluster @math{n+m}: the first
+## two columns are the numbers of the two component clusters and column
+## 3 contains their distance.
+##
+## @var{method} defines the way the distance between two clusters is
+## computed and how it is recomputed when two clusters are merged:
+##
+## @table @samp
+## @item "single" (default)
+## Distance between two clusters is the minimum distance between two
+## elements, one from each cluster.  Produces a cluster tree known as a
+## minimum spanning tree.
+##
+## @item "complete"
+## Furthest distance between two elements, one from each cluster.
+##
+## @item "average"
+## Unweighted pair group method with averaging (UPGMA).
+## The mean distance over all pairs of elements, one from each cluster.
+##
+## @item "weighted"
+## Weighted pair group method with averaging (WPGMA).
+## When two clusters A and B are joined together, the new distance to a
+## cluster C is the mean of the distances A-C and B-C.
+##
+## @item "centroid"
+## Unweighted Pair-Group Method using Centroids (UPGMC).
+## Assumes Euclidean metric.  The distance between cluster centroids,
+## each centroid being the center of mass of a cluster.
+##
+## @item "median"
+## Weighted pair-group method using centroids (WPGMC).
+## Assumes Euclidean metric.  Distance between cluster centroids.  When
+## two clusters are joined together, the new centroid is the midpoint
+## between the joined centroids.
+##
+## @item "ward"
+## Ward's sum of squared deviations about the group mean (ESS).
+## Also known as minimum variance or inner squared distance.
+## Assumes Euclidean metric.  How much the moment of inertia of the
+## merged cluster exceeds the sum of those of the individual clusters.
+## @end table
+##
+## @strong{Reference}
+## Ward, J. H. Hierarchical Grouping to Optimize an Objective Function,
+## J. Am. Statist. Assoc. 1963, 58, 236-244,
+## @url{http://iv.slis.indiana.edu/sw/data/ward.pdf}.
+## @end deftypefn
+##
+## @seealso{pdist,squareform}
+
+## Author: Francesco Potortì
+
+function dgram = linkage (d, method = "single", distarg)
+
+  ## check the input
+  if (nargin < 1) || (nargin > 3)
+    print_usage ();
+  endif
+
+  if (isempty (d))
+    error ("linkage: d cannot be empty");
+  elseif (nargin < 3 && ~ isvector (d))
+    error ("linkage: d must be a vector");
+  endif
+
+  methods = struct ...
+    ("name", { "single"; "complete"; "average"; "weighted";
+               "centroid"; "median"; "ward" },
+     "distfunc", {(@(x) min(x))                               # single
+                  (@(x) max(x))                               # complete
+                  (@(x,i,j,w) sum(diag(q=w([i,j]))*x)/sum(q)) # average
+                  (@(x) mean(x))                              # weighted
+                  (@massdist)                                 # centroid
+                  (@(x,i) massdist(x,i))                      # median
+                  (@inertialdist)                             # ward
+                 });
+  mask = strcmp (lower (method), {methods.name});
+  if (! any (mask))
+    error ("linkage: %s: unknown method", method);
+  endif
+  dist = {methods.distfunc}{mask};
+
+  if (nargin == 3)
+    if (ischar (distarg))
+      d = pdist (d, distarg);
+    elseif (iscell (distarg))
+      d = pdist (d, distarg{:});
+    else
+      print_usage ();
+    endif
+  endif
+
+  d = squareform (d, "tomatrix");       # dissimilarity NxN matrix
+  n = rows (d);                         # the number of observations
+  diagidx = sub2ind ([n,n], 1:n, 1:n);  # indices of diagonal elements
+  d(diagidx) = Inf;                     # consider a cluster as far from itself
+
+  ## For equal-distance nodes, the order in which clusters are merged
+  ## is arbitrary.  Rotating the initial matrix produces an ordering
+  ## similar to Matlab's.
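+  ## After the 180-degree rotation below, row and column k of d refer to
+  ## the original observation cname(k) = n+1-k.  As clusters merge, cname
+  ## is kept in step with d, so d can always be indexed by position while
+  ## dgram records the original cluster numbers.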
+  cname = n:-1:1;               # cluster names in d
+  d = rot90 (d, 2);             # exchange low and high cluster numbers
+  weight = ones (1, n);         # cluster weights
+  dgram = zeros (n-1, 3);       # clusters from n+1 to 2*n-1
+  for cluster = n+1:2*n-1
+    ## Find the two nearest clusters
+    [m midx] = min (d(:));
+    [r, c] = ind2sub (size (d), midx);
+    ## Here is the new cluster
+    dgram(cluster-n, :) = [cname(r) cname(c) d(r, c)];
+    ## Put it in place of the first one and remove the second
+    cname(r) = cluster;
+    cname(c) = [];
+    ## Compute the new distances
+    newd = dist (d([r c], :), r, c, weight);
+    newd(r) = Inf;              # take care of the diagonal element
+    ## Put distances in place of the first ones, remove the second ones
+    d(r,:) = newd;
+    d(:,r) = newd';
+    d(c,:) = [];
+    d(:,c) = [];
+    ## The new weight is the sum of the components' weights
+    weight(r) += weight(c);
+    weight(c) = [];
+  endfor
+  ## Sort the cluster numbers, as Matlab does
+  dgram(:,1:2) = sort (dgram(:,1:2), 2);
+
+  ## Check that distances are monotonically increasing
+  if (any (diff (dgram(:,3)) < 0))
+    warning ("clustering",
+             "linkage: cluster distances do not monotonically increase\n\
+ you should probably use a method different from \"%s\"", method);
+  endif
+
+endfunction
+
+## Take two row vectors, which are the Euclidean distances of clusters I
+## and J from the others.  Column I of second row contains the distance
+## between clusters I and J.  The centre of gravity of the new cluster
+## is on the segment joining the old ones.  W are the weights of all
+## clusters.  Use the law of cosines to find the distances of the new
+## cluster from all the others.
+function y = massdist (x, i, j, w)
+  x .^= 2;                      # squared Euclidean distances
+  if (nargin == 2)              # median distance
+    qi = 0.5;                   # equal weights ("weighted")
+  else                          # centroid distance
+    qi = 1 / (1 + w(j) / w(i)); # proportional weights ("unweighted")
+  endif
+  y = sqrt (qi*x(1,:) + (1-qi)*(x(2,:) - qi*x(2,i)));
+endfunction
+
+## Take two row vectors, which are the inertial distances of clusters I
+## and J from the others.  Column I of second row contains the inertial
+## distance between clusters I and J.  The centre of gravity of the new
+## cluster K is on the segment joining I and J.  W are the weights of
+## all clusters.  Convert inertial to Euclidean distances, then use the
+## law of cosines to find the Euclidean distances of K from all the
+## other clusters, convert them back to inertial distances and return
+## them.
+function y = inertialdist (x, i, j, w)
+  wi = w(i); wj = w(j);     # the cluster weights
+  s = [wi + w; wj + w];     # sum of weights for all cluster pairs
+  p = [wi * w; wj * w];     # product of weights for all cluster pairs
+  x = x.^2 .* s ./ p;       # convert inertial dist. to squared Eucl.
+  sij = wi + wj;            # sum of weights of I and J
+  qi = wi/sij;              # normalise the weight of I
+  ## Squared Euclidean distances between all clusters and new cluster K
+  x = qi*x(1,:) + (1-qi)*(x(2,:) - qi*x(2,i));
+  y = sqrt (x * sij .* w ./ (sij + w)); # convert Eucl. dist. to inertial
+endfunction
+
+%!shared x, t
+%! x = reshape(mod(magic(6),5),[],3);
+%! t = 1e-6;
+%!assert (cond (linkage (pdist (x))), 34.119045,t);
+%!assert (cond (linkage (pdist (x), "complete")), 21.793345,t);
+%!assert (cond (linkage (pdist (x), "average")), 27.045012,t);
+%!assert (cond (linkage (pdist (x), "weighted")), 27.412889,t);
+%! lastwarn();   # Clear last warning before the test
+%!warning linkage (pdist (x), "centroid");
+%!test warning off clustering
+%! assert (cond (linkage (pdist (x), "centroid")), 27.457477,t);
+%! warning on clustering
+%!warning linkage (pdist (x), "median");
+%!test warning off clustering
+%! assert (cond (linkage (pdist (x), "median")), 27.683325,t);
+%! warning on clustering
+%!assert (cond (linkage (pdist (x), "ward")), 17.195198,t);
+%!assert (cond (linkage(x,"ward","euclidean")), 17.195198,t);
+%!assert (cond (linkage(x,"ward",{"euclidean"})), 17.195198,t);
+%!assert (cond (linkage(x,"ward",{"minkowski",2})),17.195198,t);
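+
+## Illustrative usage sketch (made-up data): cluster a handful of 2-D
+## observations and display the (n-1)x3 dendrogram matrices produced by a
+## couple of the supported methods; pdist uses its default Euclidean metric.
+%!demo
+%! x = [1 1; 1.2 0.9; 5 5; 5.1 5.2; 9 1];  # five 2-D observations
+%! d = pdist (x);                          # Euclidean dissimilarity vector
+%! y = linkage (d)                         # single linkage (the default)
+%! y = linkage (d, "average")              # UPGMA on the same dissimilarities
+%! y = linkage (x, "ward", "euclidean")    # data matrix form: pdist is called internally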