Skip to content

Instantly share code, notes, and snippets.

@maptracker
Last active January 22, 2016 15:44
Show Gist options
  • Save maptracker/07328d13c143ae9b9e53 to your computer and use it in GitHub Desktop.
Save maptracker/07328d13c143ae9b9e53 to your computer and use it in GitHub Desktop.
Exploring #kmeans stability in #R
## Code was initially taken From Coursera lecture:
## https://www.coursera.org/learn/exploratory-data-analysis/lecture/6hOqi/k-means-clustering-part-2
set.seed(1234)
## Generate 30 points randomly assorted around 3 centroids: (1,1), (2,2), (3,1)
numPoints <- 30
x <- rnorm(numPoints, mean = rep(1:3, each = numPoints/3), sd = 0.2)
y <- rnorm(numPoints, mean = rep(c(1, 2, 1), each = numPoints/3), sd = 0.2)
## Organize as DF
dataFrame <- data.frame(x = x, y = y, row.names = 1:numPoints)
## Look for the clusters. Because we generated three clusters above,
## unambiguous results should be found when looking for 3. Trying
## other cluster numbers (searchFor) will result in different
## outcomes each time you re-run the analysis
clusterAndPlot <- function( searchFor ) {
kmeansObj <- kmeans(dataFrame, centers = searchFor)
## Plot and label the input
plot(x, y, col = kmeansObj$cluster, pch = 1, cex = 2)
text(x + 0.05, y + 0.05, labels = as.character(1:numPoints), col = "blue")
## Add the centroids
points(kmeansObj$centers, col = 1:searchFor, pch = 3, cex = 3, lwd = 3)
}
clusterAndPlot(3)
dev.new( width = 12 )
par( mfrow = c(1,2) )
trackInstability <- function( searchFor, iter = 100 ) {
## This function tracks how often specific k-means clusters
## appear. It generates a scatter plot with all observed
## centroids labeled, plus a bar plot showing the frequency of
## occurance of each particular cluster (based on its object
## membership)
## The goal here is to see how consistent clustering is for the
## data generated above. Do we always find the same clusters? If
## not, how often are given clusters observed?
hist <- data.frame()
lookup <- list()
nr <- nrow(dataFrame)
for (i in 1:iter) {
## Cluster the data
ko <- kmeans(dataFrame, centers = searchFor)
## Find object membership for each cluster, stored in a list
clust <- list()
for (j in 1:nr) {
cnum <- as.character(ko$cluster[j])
clust[[ cnum ]] <- c(clust[[ cnum ]], j)
}
## Now turn each cluster into a unique key, and use that key
## to increment our tally list (counts)
for (k in 1:searchFor) {
cnum <- as.character(k)
key <- paste(sort(clust[[cnum]]), collapse = ",")
rowNum <- lookup[[ key ]]
if (is.null(rowNum)) {
# First time seeing this cluster
row <- data.frame(Count = 1, Members = key,
Size = length(clust[[cnum]]),
X = ko$centers[k,1], Y = ko$centers[k,2])
hist <- rbind(hist, row)
lookup[[ key ]] <- nrow(hist)
} else {
hist$Count[ rowNum ] <- hist$Count[ rowNum ] + 1
}
}
}
numc <- nrow(hist)
## Sort our clusters by size
hist <- hist[order(hist$Size), ]
hist$Name <- LETTERS[1:numc]
hist$Freq <- hist$Count / iter
## Plot the points
plot(dataFrame$x, dataFrame$y, col = "blue", pch = 1, cex = 1,
ylab = "", xlab = "")
text(x + 0.05, y + 0.05, labels = rownames(dataFrame),
col = "blue", cex = 0.6)
## Add the centroids, labeled by name
text(hist$X, hist$Y, labels = hist$Name, col = 1:numc, cex = 1, font = 2)
## I also wanted to add an ellipse defining the cluster, but the
## solution for that seems to involve the "car" package, which is
## pretty heavyweight: https://stackoverflow.com/a/26812165
## Plot the data as a barchart,
## https://stackoverflow.com/a/4217305 # Get x-coordinate of each bar
bp <- barplot(hist$Freq, col = "white", names.arg = hist$Name,
border = 1:numc, ylab = "Frequency cluster is observed",
xlab = paste(c("Cluster Membership: centers =",searchFor),
collapse = " "))
## Add a list of the members for each cluster
text(y = 0, x = bp - 0.3, labels = hist$Members, srt = 90, col = 1:numc,
pos = 4)
hist
}
ti <- trackInstability( 4, 300 )
@maptracker
Copy link
Author

kmeans

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment