Created
November 12, 2010 03:45
-
-
Save vault/673698 to your computer and use it in GitHub Desktop.
Fuzzy Kmeans Algorithm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function [mu, C, iters, err] = fkmeansMA(X, k) | |
% find the centers of k clusters in the set of data X | |
% X is an m x n matrix of n, m-dimensional points | |
% k is the number of desired centers for the data | |
% mu is the coordinates of the centers of each cluster | |
% C is a vector, where each point in X, i is in cluster C(i) | |
% iters is the number of iterations required to converge on mu | |
fuzz = 3.75; | |
[m n] = size(X); | |
mn = min(min(X)); | |
mx = max(max(X)); | |
% initial guesses are random numbers between the minimum | |
% and maximum values in X. This is probably the part that would most | |
% benefit from a slightly more intelligent method. | |
mu = randn(m,k)*(mx - mn) + mn; | |
% initialize probability matrices | |
% probs is a k by n matrix, where each row i represents cluster i | |
% and each column j represents X(i) | |
prev = zeros(k,n); | |
probs = ones(k,n); | |
% count how many iterations it takes to converge | |
% we restrict this to 100 iterations so it doesn't get stuck | |
iters = 0; | |
while ~isGoodEnough(probs, prev) && iters < 100 | |
prev = probs; % store the probabilities calculated in the last run | |
% find the probabilities that each point i is in each cluster | |
for i = 1:n | |
dists = calcDistances(X(:,i), mu); | |
probs(:,i) = clusterProb(dists); | |
end | |
% update our guess for each center by taking the weighted average | |
% of all of the points, using the previously calculated | |
% probabilities | |
for i = 1:k | |
P = probs(i,:); | |
mu(:,i) = weightedAverage(X, P, fuzz); | |
end | |
iters = iters + 1; | |
end | |
[i, C] = max(probs); | |
err = 0; | |
for i = 1:k | |
for j = X(C==k) | |
err = err + dist(j,mu(:,i)).^2; | |
end | |
end | |
end | |
function avg = weightedAverage(points, probs, fuzz) | |
% calculate the weighted averages | |
% points is a matrix of all the points | |
% probabilities is the probability that each point is in cluster i | |
[m n] = size(points); | |
avg = zeros(m,1); | |
div = 0; | |
% weighted average formula taken from clustering article on wikipedia | |
for j = 1:n | |
avg = avg + (probs(j).^fuzz * points(:,j)); | |
div = div + probs(j).^fuzz; | |
end | |
avg = avg/div; | |
end | |
function dists = calcDistances(point, centers) | |
% calculate the distances of point from each of the centers | |
[m, k] = size(centers); | |
dists = zeros(1,k); | |
for j = 1:k | |
dists(j) = dist(point, centers(:,j)); | |
end | |
end | |
function good = isGoodEnough(probs, prev) | |
% tests for convergence | |
% we claim that we've converged when the difference between two iterations | |
% of kmeans is less than iota | |
iota = .001; | |
difference = abs(probs - prev); | |
good = all(all(difference < iota)); | |
end | |
function probs = clusterProb(dists) | |
% calculate the probability that a point is in a given cluster based on its | |
% distance from that cluster. We use the reciprocal of the distances since | |
% smaller numbers are better | |
invs = 1 ./ dists; | |
tot = sum(invs); | |
probs = invs ./ tot; | |
end | |
function r = dist(x,y) | |
% calculate the distance between x and y. | |
% NOTE: doesn't actually calculate the distance. we only care about | |
% relative values, and taking the square root here will be slower, but | |
% won't change minimum distances | |
r = sqrt(sum((x-y).^2)); | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment