1 ## Copyright (C) 2008 Francesco Potortì <pot@gnu.org>
3 ## This program is free software; you can redistribute it and/or modify it under
4 ## the terms of the GNU General Public License as published by the Free Software
5 ## Foundation; either version 3 of the License, or (at your option) any later
8 ## This program is distributed in the hope that it will be useful, but WITHOUT
9 ## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 ## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
13 ## You should have received a copy of the GNU General Public License along with
14 ## this program; if not, see <http://www.gnu.org/licenses/>.
17 ## @deftypefn {Function File} {@var{y} =} linkage (@var{d})
18 ## @deftypefnx {Function File} {@var{y} =} linkage (@var{d}, @var{method})
19 ## @deftypefnx {Function File} @
20 ## {@var{y} =} linkage (@var{x}, @var{method}, @var{metric})
21 ## @deftypefnx {Function File} @
22 ## {@var{y} =} linkage (@var{x}, @var{method}, @var{arglist})
24 ## Produce a hierarchical clustering dendrogram
26 ## @var{d} is the dissimilarity matrix relative to @var{n} observations,
27 ## formatted as a @math{(n-1)*n/2}x1 vector as produced by @code{pdist}.
28 ## Alternatively, @var{x} contains data formatted for input to
29 ## @code{pdist}, @var{metric} is a metric for @code{pdist} and
30 ## @var{arglist} is a cell array containing arguments that are passed to
33 ## @code{linkage} starts by putting each observation into a singleton
34 ## cluster and numbering those from 1 to @var{n}. Then it merges two
35 ## clusters, chosen according to @var{method}, to create a new cluster
36 ## numbered @var{n+1}, and so on until all observations are grouped into
37 ## a single cluster numbered @var{2*n-1}. Row @var{m} of the
38 ## @math{m-1}x3 output matrix relates to cluster @math{n+m}: the first
39 ## two columns are the numbers of the two component clusters and column
40 ## 3 contains their distance.
42 ## @var{method} defines the way the distance between two clusters is
43 ## computed and how they are recomputed when two clusters are merged:
46 ## @item "single" (default)
47 ## Distance between two clusters is the minimum distance between two
48 ## elements belonging each to one cluster. Produces a cluster tree
49 ## known as minimum spanning tree.
52 ## Furthest distance between two elements belonging each to one cluster.
55 ## Unweighted pair group method with averaging (UPGMA).
56 ## The mean distance between all pair of elements each belonging to one
60 ## Weighted pair group method with averaging (WPGMA).
61 ## When two clusters A and B are joined together, the new distance to a
62 ## cluster C is the mean between distances A-C and B-C.
65 ## Unweighted Pair-Group Method using Centroids (UPGMC).
66 ## Assumes Euclidean metric. The distance between cluster centroids,
67 ## each centroid being the center of mass of a cluster.
70 ## Weighted pair-group method using centroids (WPGMC).
71 ## Assumes Euclidean metric. Distance between cluster centroids. When
72 ## two clusters are joined together, the new centroid is the midpoint
73 ## between the joined centroids.
76 ## Ward's sum of squared deviations about the group mean (ESS).
77 ## Also known as minimum variance or inner squared distance.
78 ## Assumes Euclidean metric. How much the moment of inertia of the
79 ## merged cluster exceeds the sum of those of the individual clusters.
83 ## Ward, J. H. Hierarchical Grouping to Optimize an Objective Function
84 ## J. Am. Statist. Assoc. 1963, 58, 236-244,
85 ## @url{http://iv.slis.indiana.edu/sw/data/ward.pdf}.
88 ## @seealso{pdist,squareform}
90 ## Author: Francesco Potortì <pot@gnu.org>
92 function dgram = linkage (d, method = "single", distarg)
95 if (nargin < 1) || (nargin > 3)
100 error ("linkage: d cannot be empty");
101 elseif ( nargin < 3 && ~ isvector (d))
102 error ("linkage: d must be a vector");
106 ("name", { "single"; "complete"; "average"; "weighted";
107 "centroid"; "median"; "ward" },
108 "distfunc", {(@(x) min(x)) # single
109 (@(x) max(x)) # complete
110 (@(x,i,j,w) sum(diag(q=w([i,j]))*x)/sum(q)) # average
111 (@(x) mean(x)) # weighted
112 (@massdist) # centroid
113 (@(x,i) massdist(x,i)) # median
114 (@inertialdist) # ward
116 mask = strcmp (lower (method), {methods.name});
118 error ("linkage: %s: unknown method", method);
120 dist = {methods.distfunc}{mask};
123 if (ischar (distarg))
124 d = pdist (d, distarg);
125 elseif (iscell (distarg))
126 d = pdist (d, distarg{:});
132 d = squareform (d, "tomatrix"); # dissimilarity NxN matrix
133 n = rows (d); # the number of observations
134 diagidx = sub2ind ([n,n], 1:n, 1:n); # indices of diagonal elements
135 d(diagidx) = Inf; # consider a cluster as far from itself
136 ## For equal-distance nodes, the order in which clusters are
137 ## merged is arbitrary. Rotating the initial matrix produces an
138 ## ordering similar to Matlab's.
139 cname = n:-1:1; # cluster names in d
140 d = rot90 (d, 2); # exchange low and high cluster numbers
141 weight = ones (1, n); # cluster weights
142 dgram = zeros (n-1, 3); # clusters from n+1 to 2*n-1
143 for cluster = n+1:2*n-1
144 ## Find the two nearest clusters
145 [m midx] = min (d(:));
146 [r, c] = ind2sub (size (d), midx);
147 ## Here is the new cluster
148 dgram(cluster-n, :) = [cname(r) cname(c) d(r, c)];
149 ## Put it in place of the first one and remove the second
152 ## Compute the new distances
153 newd = dist (d([r c], :), r, c, weight);
154 newd(r) = Inf; # take care of the diagonal element
155 ## Put distances in place of the first ones, remove the second ones
160 ## The new weight is the sum of the components' weights
161 weight(r) += weight(c);
164 ## Sort the cluster numbers, as Matlab does
165 dgram(:,1:2) = sort (dgram(:,1:2), 2);
167 ## Check that distances are monotonically increasing
168 if (any (diff (dgram(:,3)) < 0))
169 warning ("clustering",
170 "linkage: cluster distances do not monotonically increase\n\
171 you should probably use a method different from \"%s\"", method);
176 ## Take two row vectors, which are the Euclidean distances of clusters I
177 ## and J from the others. Column I of second row contains the distance
178 ## between clusters I and J. The centre of gravity of the new cluster
179 ## is on the segment joining the old ones. W are the weights of all
180 ## clusters. Use the law of cosines to find the distances of the new
181 ## cluster from all the others.
182 function y = massdist (x, i, j, w)
183 x .^= 2; # squared Euclidean distances
184 if (nargin == 2) # median distance
185 qi = 0.5; # equal weights ("weighted")
186 else # centroid distance
187 qi = 1 / (1 + w(j) / w(i)); # proportional weights ("unweighted")
189 y = sqrt (qi*x(1,:) + (1-qi)*(x(2,:) - qi*x(2,i)));
192 ## Take two row vectors, which are the inertial distances of clusters I
193 ## and J from the others. Column I of second row contains the inertial
194 ## distance between clusters I and J. The centre of gravity of the new
195 ## cluster K is on the segment joining I and J. W are the weights of
196 ## all clusters. Convert inertial to Euclidean distances, then use the
197 ## law of cosines to find the Euclidean distances of K from all the
198 ## other clusters, convert them back to inertial distances and return
200 function y = inertialdist (x, i, j, w)
201 wi = w(i); wj = w(j); # the cluster weights
202 s = [wi + w; wj + w]; # sum of weights for all cluster pairs
203 p = [wi * w; wj * w]; # product of weights for all cluster pairs
204 x = x.^2 .* s ./ p; # convert inertial dist. to squared Eucl.
205 sij = wi + wj; # sum of weights of I and J
206 qi = wi/sij; # normalise the weight of I
207 ## Squared Euclidean distances between all clusters and new cluster K
208 x = qi*x(1,:) + (1-qi)*(x(2,:) - qi*x(2,i));
209 y = sqrt (x * sij .* w ./ (sij + w)); # convert Eucl. dist. to inertial
213 %! x = reshape(mod(magic(6),5),[],3);
215 %!assert (cond (linkage (pdist (x))), 34.119045,t);
216 %!assert (cond (linkage (pdist (x), "complete")), 21.793345,t);
217 %!assert (cond (linkage (pdist (x), "average")), 27.045012,t);
218 %!assert (cond (linkage (pdist (x), "weighted")), 27.412889,t);
219 %! lastwarn(); # Clear last warning before the test
220 %!warning <monotonically> linkage (pdist (x), "centroid");
221 %!test warning off clustering
222 %! assert (cond (linkage (pdist (x), "centroid")), 27.457477,t);
223 %! warning on clustering
224 %!warning <monotonically> linkage (pdist (x), "median");
225 %!test warning off clustering
226 %! assert (cond (linkage (pdist (x), "median")), 27.683325,t);
227 %! warning on clustering
228 %!assert (cond (linkage (pdist (x), "ward")), 17.195198,t);
229 %!assert (cond (linkage(x,"ward","euclidean")), 17.195198,t);
230 %!assert (cond (linkage(x,"ward",{"euclidean"})), 17.195198,t);
231 %!assert (cond (linkage(x,"ward",{"minkowski",2})),17.195198,t);