@jurgenvinju
Last active January 28, 2023 13:24
module analysis::learning::KMeans
import util::Math;
import List;
import Set;
import IO;
// an n-dimensional point is modeled as a vector of reals
data Point = point(list[real] vec);

// generates an arbitrary point of dimension d (arbReal yields values in [0.0,1.0))
Point arbPoint(int d) = point([arbReal() | _ <- [0..d]]);
private real sqr(real r) = r * r;
@doc{
.Synopsis A measure of distance (the square of the Euclidean distance in fact)
.Description
We compute pairwise the square of the absolute distance in each dimension to arrive
at the square of the Euclidean distance between two n-dimensional points.
The dist function is robust against points of different dimensionality and will
pad the shorter point with .0 values until the dimensions are the same.
}
real dist(Point p, Point q)
  = (.0 | it + sqr(abs((p.vec[i]?.0) - (q.vec[i]?.0))) | i <- [0..max(size(p.vec),size(q.vec))]);
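// Illustrative sanity check (added; not part of the original gist). Because the
// shorter vector is padded with .0, the distance between a 1-dimensional point
// and the empty point is just the square of its single coordinate. We compare
// with a tolerance rather than == to avoid relying on exact real equality.
test bool distPadsShorterPoint()
  = abs(dist(point([3.0]), point([])) - 9.0) < 0.0001;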
@doc{
.Synopsis Computes the average of all dimensions of all points.
.Description
A centroid is the "average" point of a set of points, composed of the
pairwise mean value over all points for each dimension.
}
@memo
private Point centroid(set[Point] cl:{Point h, *_})
  = point([sum([p.vec[i] | p <- cl]) / size(cl) | i <- index(h.vec)]);
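// Illustrative sanity check (added; not part of the original gist): the centroid
// of two points is their pairwise mean. Checked via dist with a tolerance to
// avoid exact real comparisons.
test bool centroidIsPairwiseMean()
  = dist(centroid({point([0.0, 0.0]), point([2.0, 4.0])}), point([1.0, 2.0])) < 0.0001;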
alias Clusters = set[set[Point]];
@doc{
.Synopsis simplePartition simply breaks the input up into almost even parts, in order of appearance.
.Description
This partitioning algorithm serves as a demonstration of what an initial partitioning into clusters
should do before the kmeans algorithm is applied.
The current algorithm is not very smart and will lead to longer running times when kmeans
is applied to it, because chances are that many points have to be moved to new clusters
over many iterations before the optimum is reached.
The advice is to create your own partitioning function. Such a function would first guess
a set of initial centroids using any distribution (using knowledge of the input data),
and then efficiently sample around these centroids using another
distribution function.
}
Clusters simplePartition([], 0) = {};

Clusters simplePartition(list[Point] cl, 1) = {{*cl}};

// note: the default case implicitly assumes 0 < k <= size(cl); a larger k produces empty clusters
default Clusters simplePartition(list[Point] cl, int k)
  = {{*cl[..s/k]}, *simplePartition(cl[s/k..], k - 1)}
  when s := size(cl);
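// Illustrative example (added; not part of the original gist): four points are
// split into two clusters of two, in order of appearance.
test bool simplePartitionSplitsEvenly() {
  ps = [point([1.0]), point([2.0]), point([3.0]), point([4.0])];
  return simplePartition(ps, 2) == {{ps[0], ps[1]}, {ps[2], ps[3]}};
}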
@doc{
.Synopsis Kmeans optimizes the initial arbitrary clustering by reassigning each point to the nearest averaged middle point (centroid) of a cluster.
.Description
The efficiency of kmeans is greatly influenced by the initial clustering provided. The closer
the initial clustering is to the optimum which kmeans computes, the fewer steps it takes to reach it.
Kmeans will return no more clusters than the initial clustering contains, but it may return fewer.
This may happen when the final elements of a cluster are closer to another centroid
than they were to the previous centroid of that cluster: then one cluster completely eats up another.
It is an indication that the initial number of clusters was not appropriate for the input data,
but it can also happen accidentally in degenerate cases where the initial clustering is completely
unbalanced.
}
Clusters kmeans(Clusters prop) {
  // we assign each point to a cluster, identified by its current centroid
  assignment = {<c, p> | cl <- prop, c := centroid(cl), p <- cl};

  solve (assignment) {
    // the current centroids are used to index each cluster in each iteration
    centroids = assignment<0>;

    // for each point in each cluster, we find which centroid is currently closest:
    movement = {<(c | dist(d, p) < dist(it, p) ? d : it | d <- centroids), p> | c <- centroids, p <- assignment[c]};

    // then we compute for each new (non-empty) cluster what the new centroid should be
    assignment = {<nc, p> | c <- centroids, cl := movement[c], cl != {}, nc := centroid(cl), p <- cl};
  }

  // drop the centroids and simply return the set of clusters
  return {assignment[c] | c <- assignment<0>};
}
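// Illustrative end-to-end usage (added; not part of the original gist): generate
// 100 arbitrary 2-dimensional points, split them into 4 initial clusters, and
// let kmeans optimize the clustering. The result contains at most 4 clusters.
Clusters exampleRun()
  = kmeans(simplePartition([arbPoint(2) | _ <- [0..100]], 4));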