Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
A simple k-means clustering implementation for GNU Octave.
% This program is free software: you can redistribute it and/or modify
% it under the terms of the GNU Affero General Public License as
% published by the Free Software Foundation, either version 3 of the
% License, or any later version.
%
% This program is distributed in the hope that it will be useful,
% but WITHOUT ANY WARRANTY; without even the implied warranty of
% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
% GNU Affero General Public License for more details. It can be found
% at <http://www.gnu.org/licenses/>.
%
%==========================================================================
% FUNCTION: [assignments, centers] = kmeans(X, k, centers = 0, maxiter = 200)
% DESCRIPTION: This function performs k-means clustering algorithm on a given
% dataset.
%
% INPUTS: X = N*d matrix of dataset, rows of X correspond to N data points;
% columns correspond to attributes.
% k = number of clusters
% centers = Starting centers of clusters.
% maxiter = Maximum iteration count for convergence.
%
% OUTPUTS: assignments = Integer vector that holds
%==========================================================================
% copyright (c) 2010 M. Emin Aksehirli
%==========================================================================
function [assignments, centers] = kmeans(X, k, centers = 0, maxiter = 200)
if (centers == 0)
centerRows = randperm(size(X)(1));
centers = X(centerRows(1:k), :);
endif
numOfRows = length(X(:,1));
numOfFeatures = length(X(1,:));
assignments = ones(1, numOfRows);
for iter = 1:maxiter
clusterTotals = zeros(k, numOfFeatures);
clusterSizes = zeros(k, 1);
for rowIx = 1:numOfRows
minDist = realmax;
assignTo = 0;
for centerIx = 1:k
% Euclidian distance is used.
dist = sqrt(sum((X(rowIx, : ) - centers(centerIx, :)).^2));
if dist < minDist
minDist = dist;
assignTo = centerIx;
endif
endfor
assignments(rowIx) = assignTo;
% Keep these information to calculate cluster centers.
clusterTotals(assignTo, :) += X(rowIx, :);
clusterSizes(assignTo)++;
endfor
% This process is called 'singleton' in terms of Matlab.
% If a cluster is empty choose a random data point as new
% cluster cener.
for clusterIx = 1:k
if (clusterSizes(clusterIx) == 0)
randomRow = round(1 + rand() * (numOfRows - 1) );
clusterTotals(clusterIx, :) = X(randomRow, :);
clusterSizes(clusterIx) = 1;
endif
endfor
newCenters = zeros(k, numOfFeatures);
for centerIx = 1:k
newCenters(centerIx, :) = clusterTotals(centerIx, : ) / clusterSizes(centerIx);
endfor
diff = sum(sum(abs(newCenters - centers)));
if diff < eps
%disp('Centers are same, which means we converged before maxiteration count. This is a good thing!')
break;
endif
centers = newCenters;
endfor
assignments = assignments';
%printf('iter: %d, diff: %f\n', iter, diff);
endfunction
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment