vault/gist:673698

## gistfile1.matlab
function [mu, C, iters, err] = fkmeansMA(X, k)
% find the centers of k clusters in the set of data X
% X is an m x n matrix of n, m-dimensional points
% k is the number of desired centers for the data
% mu is the coordinates of the centers of each cluster
% C is a vector, where each point in X, i is in cluster C(i)
% iters is the number of iterations required to converge on mu

    fuzz = 3.75;

    [m n] = size(X);
    mn = min(min(X));
    mx = max(max(X));

    % initial guesses are random numbers between the minimum
    % and maximum values in X. This is probably the part that would most
    % benefit from a slightly more intelligent method.
    mu = randn(m,k)*(mx - mn) + mn;

    % initialize probability matrices
    % probs is a k by n matrix, where each row i represents cluster i
    % and each column j represents X(i)
    prev = zeros(k,n);
    probs = ones(k,n);

    % count how many iterations it takes to converge
    % we restrict this to 100 iterations so it doesn't get stuck
    iters = 0;
    while ~isGoodEnough(probs, prev) && iters < 100
        prev = probs; % store the probabilities calculated in the last run

        % find the probabilities that each point i is in each cluster
        for i = 1:n
            dists = calcDistances(X(:,i), mu);
            probs(:,i) = clusterProb(dists);
        end

        % update our guess for each center by taking the weighted average
        % of all of the points, using the previously calculated
        % probabilities
        for i = 1:k
            P = probs(i,:);
            mu(:,i) = weightedAverage(X, P, fuzz);
        end

        iters = iters + 1;
    end
    [i, C] = max(probs);

    err = 0;
    for i = 1:k
        for j = X(C==k)
            err = err + dist(j,mu(:,i)).^2;
        end
    end
end

function avg = weightedAverage(points, probs, fuzz)
% calculate the weighted averages
% points is a matrix of all the points
% probabilities is the probability that each point is in cluster i
    [m n] = size(points);
    avg = zeros(m,1);
    div = 0;
    % weighted average formula taken from clustering article on wikipedia
    for j = 1:n
        avg = avg + (probs(j).^fuzz * points(:,j));
        div = div + probs(j).^fuzz;
    end
    avg = avg/div;
end

function dists = calcDistances(point, centers)
% calculate the distances of point from each of the centers
   [m, k] = size(centers);
   dists = zeros(1,k);
   for j = 1:k
       dists(j) = dist(point, centers(:,j));
   end
end

function good = isGoodEnough(probs, prev)
% tests for convergence
% we claim that we've converged when the difference between two iterations
% of kmeans is less than iota
    iota = .001;
    difference = abs(probs - prev);
    good = all(all(difference < iota));
end

function probs = clusterProb(dists)
% calculate the probability that a point is in a given cluster based on its
% distance from that cluster. We use the reciprocal of the distances since
% smaller numbers are better
    invs = 1 ./ dists;
    tot = sum(invs);
    probs = invs ./ tot;
end

function r = dist(x,y)
% calculate the distance between x and y.
% NOTE: doesn't actually calculate the distance. we only care about
% relative values, and taking the square root here will be slower, but
% won't change minimum distances
    r = sqrt(sum((x-y).^2));
end
	function [mu, C, iters, err] = fkmeansMA(X, k)
	% find the centers of k clusters in the set of data X
	% X is an m x n matrix of n, m-dimensional points
	% k is the number of desired centers for the data
	% mu is the coordinates of the centers of each cluster
	% C is a vector, where each point in X, i is in cluster C(i)
	% iters is the number of iterations required to converge on mu

	fuzz = 3.75;

	[m n] = size(X);
	mn = min(min(X));
	mx = max(max(X));

	% initial guesses are random numbers between the minimum
	% and maximum values in X. This is probably the part that would most
	% benefit from a slightly more intelligent method.
	mu = randn(m,k)*(mx - mn) + mn;

	% initialize probability matrices
	% probs is a k by n matrix, where each row i represents cluster i
	% and each column j represents X(i)
	prev = zeros(k,n);
	probs = ones(k,n);

	% count how many iterations it takes to converge
	% we restrict this to 100 iterations so it doesn't get stuck
	iters = 0;
	while ~isGoodEnough(probs, prev) && iters < 100
	prev = probs; % store the probabilities calculated in the last run

	% find the probabilities that each point i is in each cluster
	for i = 1:n
	dists = calcDistances(X(:,i), mu);
	probs(:,i) = clusterProb(dists);
	end

	% update our guess for each center by taking the weighted average
	% of all of the points, using the previously calculated
	% probabilities
	for i = 1:k
	P = probs(i,:);
	mu(:,i) = weightedAverage(X, P, fuzz);
	end

	iters = iters + 1;
	end
	[i, C] = max(probs);

	err = 0;
	for i = 1:k
	for j = X(C==k)
	err = err + dist(j,mu(:,i)).^2;
	end
	end
	end

	function avg = weightedAverage(points, probs, fuzz)
	% calculate the weighted averages
	% points is a matrix of all the points
	% probabilities is the probability that each point is in cluster i
	[m n] = size(points);
	avg = zeros(m,1);
	div = 0;
	% weighted average formula taken from clustering article on wikipedia
	for j = 1:n
	avg = avg + (probs(j).^fuzz * points(:,j));
	div = div + probs(j).^fuzz;
	end
	avg = avg/div;
	end

	function dists = calcDistances(point, centers)
	% calculate the distances of point from each of the centers
	[m, k] = size(centers);
	dists = zeros(1,k);
	for j = 1:k
	dists(j) = dist(point, centers(:,j));
	end
	end

	function good = isGoodEnough(probs, prev)
	% tests for convergence
	% we claim that we've converged when the difference between two iterations
	% of kmeans is less than iota
	iota = .001;
	difference = abs(probs - prev);
	good = all(all(difference < iota));
	end

	function probs = clusterProb(dists)
	% calculate the probability that a point is in a given cluster based on its
	% distance from that cluster. We use the reciprocal of the distances since
	% smaller numbers are better
	invs = 1 ./ dists;
	tot = sum(invs);
	probs = invs ./ tot;
	end

	function r = dist(x,y)
	% calculate the distance between x and y.
	% NOTE: doesn't actually calculate the distance. we only care about
	% relative values, and taking the square root here will be slower, but
	% won't change minimum distances
	r = sqrt(sum((x-y).^2));
	end