Skip to content

Instantly share code, notes, and snippets.

@nl-hugo
Last active September 4, 2015 11:03
Show Gist options
  • Save nl-hugo/7e5eeb2af649b9131c0b to your computer and use it in GitHub Desktop.
Save nl-hugo/7e5eeb2af649b9131c0b to your computer and use it in GitHub Desktop.
k-Means clustering

From wikipedia

k-means clustering is a method of vector quantization, originally from signal processing, that is popular for cluster analysis in data mining. k-means clustering aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster. This results in a partitioning of the data space into Voronoi cells.

Data

Data are randomly generated x,y-coordinates between 0 and the width/height of the canvas.

The Algorithm

  1. The algorithm is initialized with n random points, along with k randomly generated 'means' (centroid) within the data domain.
  2. When the algorithm starts, each point is assigned the color of the closest centroid, forming k clusters. Closest is defined as the smallest Euclidean distance between two points.
  3. The centroids are moved to the center of the cluster.
  4. Steps 2 and 3 are repeated until the maximum number of iterations is reached.
<!DOCTYPE html>
<meta charset="utf-8">
<style>
circle {
fill: #ccc;
}
.centroid {
stroke: #000;
stroke-width: 2px;
}
text.label {
font-family: Helvetica, Arial, sans-serif;
font-size: 12px;
fill: #333;
}
</style>
<svg width="550" height="550"></svg>
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.5/d3.min.js"></script>
<script type="text/javascript">
var iter = 1,
centroids = [],
points = [],
numPoints = 1000,
numClusters = 5,
maxIter = 10;
var margin = {top: 30, right: 20, bottom: 20, left: 30},
width = 500 - margin.left - margin.right,
height = 500 - margin.top - margin.bottom;
var colors = d3.scale.category20().range();
var svg = d3.select("svg");
var group = svg.append("g")
.attr("transform", "translate(" + margin.left + "," + margin.top + ")");
svg.append("g")
.append("text")
.attr("class", "label")
.attr("transform", "translate(" + (width - margin.left - margin.right) +
"," + (height + margin.top + margin.bottom) + ")")
.text("");
initialize();
/**
* Computes the euclidian distance between two points.
*/
function getEuclidianDistance(a, b) {
var dx = b.x - a.x,
dy = b.y - a.y;
return Math.sqrt(Math.pow(dx, 2) + Math.pow(dy, 2));
}
/**
* Returns a point with the specified type and fill color and with random
* x,y-coordinates.
*/
function getRandomPoint(type, fill) {
return {
x: Math.round(Math.random() * width),
y: Math.round(Math.random() * height),
type: type,
fill: fill
};
}
/**
* Generates a specified number of random points of the specified type.
*/
function initializePoints(num, type) {
var result = [];
for (var i = 0; i < num; i++) {
var color = colors[i];
if (type !== "centroid") {
color = "#ccc";
}
var point = getRandomPoint(type, color);
point.id = point.type + "-" + i;
result.push(point);
}
return result;
}
/**
* Find the centroid that is closest to the specified point.
*/
function findClosestCentroid(point) {
var closest = {i: -1, distance: width * 2};
centroids.forEach(function(d, i) {
var distance = getEuclidianDistance(d, point);
// Only update when the centroid is closer
if (distance < closest.distance) {
closest.i = i;
closest.distance = distance;
}
});
return (centroids[closest.i]);
}
/**
* All points assume the color of the closest centroid.
*/
function colorizePoints() {
points.forEach(function(d) {
var closest = findClosestCentroid(d);
d.fill = closest.fill;
});
}
/**
* Computes the center of the cluster by taking the mean of the x and y
* coordinates.
*/
function computeClusterCenter(cluster) {
return [
d3.mean(cluster, function(d) { return d.x; }),
d3.mean(cluster, function(d) { return d.y; })
];
}
/**
* Moves the centroids to the center of their cluster.
*/
function moveCentroids() {
centroids.forEach(function(d) {
// Get clusters based on their fill color
var cluster = points.filter(function(e) {
return e.fill === d.fill;
});
// Compute the cluster centers
var center = computeClusterCenter(cluster);
// Move the centroid
d.x = center[0];
d.y = center[1];
});
}
/**
* Updates the chart.
*/
function update() {
var data = points.concat(centroids);
var circle = group.selectAll("circle").data(data);
// Create new elements as needed
circle.enter().append("circle")
.attr("id", function(d) { return d.id; })
.attr("class", function(d) { return d.type; })
.attr("r", 5);
// Update old elements as needed
circle.transition().delay(100).duration(1000)
.attr("cx", function(d) { return d.x; })
.attr("cy", function(d) { return d.y; })
.style("fill", function(d) { return d.fill; });
// Remove old nodes, if needed
circle.exit().remove();
}
/**
* Updates the text in the label.
*/
function setText(text) {
svg.selectAll(".label").text(text);
}
/**
* Executes one iteration of the algorithm:
* - Fill the points with the color of the closest centroid (this makes it
* part of its cluster)
* - Move the centroids to the center of their cluster.
*/
function iterate() {
setText("Iteration " + iter + " of " + maxIter);
colorizePoints();
moveCentroids();
update();
}
/**
* The main function initializes the algorithm and calls an iteration every
* two seconds.
*/
function initialize() {
// Initialize random points and centroids
centroids = initializePoints(numClusters, "centroid");
points = initializePoints(numPoints, "point");
// initial drawing
update();
var interval = setInterval(function() {
if(iter < maxIter + 1) {
iterate();
iter++;
} else {
clearInterval(interval);
setText("Done");
}
}, 2 * 1000);
}
</script>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment