1 ## Copyright (C) 2011 Soren Hauberg <soren@hauberg.org>
2 ## Copyright (C) 2012 Daniel Ward <dwa012@gmail.com>
4 ## This program is free software; you can redistribute it and/or modify it under
5 ## the terms of the GNU General Public License as published by the Free Software
6 ## Foundation; either version 3 of the License, or (at your option) any later
9 ## This program is distributed in the hope that it will be useful, but WITHOUT
10 ## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 ## FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
14 ## You should have received a copy of the GNU General Public License along with
15 ## this program; if not, see <http://www.gnu.org/licenses/>.
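## @deftypefn {Function File} {[@var{idx}, @var{centers}, @var{sumd}, @var{dist}] =} kmeans (@var{data}, @var{k}, @var{param1}, @var{value1}, @dots{})
## K-means clustering.  Partition the rows of @var{data} into @var{k} clusters.
## @var{idx} holds the cluster index assigned to each row and @var{centers} holds
## one centroid per row.  @var{sumd} is the total squared Euclidean distance from
## every sample to its assigned centroid, and @var{dist} holds the squared
## distances from every sample to every centroid.  The property
## @qcode{"emptyaction"} may be set to @qcode{"singleton"} to re-seed a cluster
## that becomes empty instead of raising an error.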
function [classes, centers, sumd, D] = kmeans (data, k, varargin)
  ## Cluster the rows of DATA into K groups with the k-means algorithm.
  ##
  ## Inputs:
  ##   data     - real numeric matrix; every row is one sample.
  ##   k        - scalar number of clusters (positive integer, at most
  ##              rows (data)).
  ##   varargin - optional property/value pairs:
  ##                "emptyaction"  "error" (default) raises an error when a
  ##                               cluster loses all of its members;
  ##                               "singleton" moves the sample farthest from
  ##                               the empty cluster's centroid into it.
  ##                "start"        "sample" (default, and the only supported
  ##                               value) picks K distinct rows of DATA at
  ##                               random as the initial centroids.
  ##
  ## Outputs:
  ##   classes  - column vector with the cluster index of every sample.
  ##   centers  - K-row matrix with one centroid per row.
  ##   sumd     - scalar, total squared Euclidean distance from each sample
  ##              to its assigned centroid.
  ##   D        - rows (data)-by-K matrix of squared distances from every
  ##              sample to every centroid, as used by the last assignment
  ##              step (i.e. before the final centroid update).
  ##
  ## Raises an error for invalid DATA or K, for unknown properties or property
  ## values, and (with "emptyaction" = "error") when a cluster becomes empty.

  if (nargin < 2)
    print_usage ();
  endif

  ## Input checking, validate the matrix and k.  This happens before anything
  ## is derived from the arguments so that bad input produces these messages
  ## rather than an unrelated failure further down.
  if (!isnumeric (data) || !ismatrix (data) || !isreal (data))
    error ("kmeans: first input argument must be a DxN real data matrix");
  elseif (!isscalar (k))
    error ("kmeans: second input argument must be a scalar");
  elseif (!(isreal (k) && k >= 1 && k == fix (k) && k <= rows (data)))
    error ("kmeans: number of clusters must be a positive integer not exceeding the number of samples");
  endif

  ## defaults for options
  emptyaction = "error";
  start       = "sample";

  ## Walk over every name/value pair instead of assuming that "emptyaction"
  ## is present; indexing with an empty search result was an error.
  [reg, prop] = parseparams (varargin);
  if (!isempty (reg))
    error ("kmeans: unexpected positional argument after k");
  elseif (mod (numel (prop), 2) != 0)
    error ("kmeans: properties must be given as name/value pairs");
  endif

  for p = 1:2:numel (prop)
    name  = prop{p};
    value = prop{p+1};
    if (!ischar (name))
      error ("kmeans: property names must be strings");
    endif

    switch (lower (name))
      case "emptyaction"
        ## strcmpi returns false for non-string values, so they are rejected
        if (!any (strcmpi (value, {"error", "singleton"})))
          error ("kmeans: unsupported empty cluster action parameter");
        endif
        emptyaction = lower (value);
      case "start"
        if (!strcmpi (value, "sample"))
          error ("kmeans: unsupported initial clustering parameter");
        endif
        start = lower (value);
      otherwise
        error ("kmeans: unsupported property '%s'", name);
    endswitch
  endfor

  ## number of samples
  nRows = rows (data);

  ## distances from each sample to each class
  D = zeros (nRows, k);

  ## initial centroids
  switch (start)
    case "sample"
      idx = randperm (nRows) (1:k);
      centers = data(idx, :);
    otherwise
      error ("kmeans: unsupported initial clustering parameter");
  endswitch

  ## decrease of the objective between two iterations (convergence measure)
  err = 1;

  ## initial sum of distances
  sumd = Inf;

  ## Run the algorithm until the objective stops improving
  while (err > .001)
    ## Compute squared distances from every sample to every centroid
    for i = 1:k
      D(:, i) = sumsq (data - repmat (centers(i, :), nRows, 1), 2);
    endfor

    ## Classify: each sample goes to its nearest centroid
    [tmp, classes] = min (D, [], 2);

    ## Calculate new centroids
    for i = 1:k
      ## Check for empty clusters
      if (!any (classes == i))
        switch (emptyaction)
          ## if 'singleton', then find the point that is the
          ## farthest and add it to the empty cluster
          case "singleton"
            classes(maxCostSampleIndex (data, centers(i, :))) = i;
          ## if 'error' then throw the error
          otherwise
            error ("kmeans: empty cluster created");
        endswitch
      endif

      ## update the centroids; the explicit dimension keeps a one-member
      ## cluster (a single row) from being averaged down to a scalar
      centers(i, :) = mean (data(classes == i, :), 1);
    endfor

    ## calculate the difference in the sum of distances, then
    ## update the current sum of distances
    newsumd = objCost (data, classes, centers);
    err  = sumd - newsumd;
    sumd = newsumd;
  endwhile
endfunction
## calculate the sum of distances
function obj = objCost (data, classes, centers)
  ## Total clustering cost: the sum, over all samples, of the squared
  ## Euclidean distance between the sample and the centroid it is assigned to.
  ##
  ##   data    - matrix with one sample per row.
  ##   classes - for every row of DATA, the row index into CENTERS.
  ##   centers - matrix with one centroid per row.
  ##
  ## Returns a scalar; 0 when DATA has no rows.

  ## centers(classes, :) lines up each sample with its own centroid, so the
  ## whole cost is computed without an interpreted per-row loop.
  obj = sum (sumsq (data - centers(classes, :), 2), 1);
endfunction
function index = maxCostSampleIndex (data, centers)
  ## Return the row index of the sample in DATA that lies farthest (in squared
  ## Euclidean distance) from the single centroid CENTERS (a row vector).
  ## When several samples are equally far, the first one is returned.
  ##
  ## The previous implementation used the output variable as the loop
  ## variable, so it always returned the last row index regardless of the
  ## distances; the arg-max is now taken explicitly.
  cost = sumsq (data - repmat (centers, rows (data), 1), 2);
  [tmp, index] = max (cost);
endfunction
%!demo
%! ## Generate a two-cluster problem: two Gaussian blobs centred at
%! ## (1, 1) and (-1, -1)
%! C1 = randn (100, 2) + 1;
%! C2 = randn (100, 2) - 1;
%! data = [C1; C2];
%!
%! ## Perform clustering
%! [idx, centers] = kmeans (data, 2);
%!
%! ## Plot the result; "hold on" keeps the first cluster visible while the
%! ## second cluster and the centroids are drawn on top of it
%! figure ();
%! plot (data (idx==1, 1), data (idx==1, 2), 'ro');
%! hold on;
%! plot (data (idx==2, 1), data (idx==2, 2), 'bs');
%! plot (centers (:, 1), centers (:, 2), 'kv', 'markersize', 10);
%! hold off;