maptracker/PlayingWithKmeans.R

## PlayingWithKmeans.R
## Code was initially taken From Coursera lecture:
## https://www.coursera.org/learn/exploratory-data-analysis/lecture/6hOqi/k-means-clustering-part-2
set.seed(1234)
## Generate 30 points randomly assorted around 3 centroids: (1,1), (2,2), (3,1)
numPoints <- 30
x <- rnorm(numPoints, mean = rep(1:3, each = numPoints/3), sd = 0.2)
y <- rnorm(numPoints, mean = rep(c(1, 2, 1), each = numPoints/3), sd = 0.2)
## Organize as DF
dataFrame <- data.frame(x = x, y = y, row.names = 1:numPoints)

## Look for the clusters. Because we generated three clusters above,
## unambiguous results should be found when looking for 3. Trying
## other cluster numbers (searchFor) will result in different
## outcomes each time you re-run the analysis
clusterAndPlot <- function( searchFor ) {
    kmeansObj <- kmeans(dataFrame, centers = searchFor)
    ## Plot and label the input
    plot(x, y, col = kmeansObj$cluster, pch = 1, cex = 2)
    text(x + 0.05, y + 0.05, labels = as.character(1:numPoints), col = "blue")
    ## Add the centroids
    points(kmeansObj$centers, col = 1:searchFor, pch = 3, cex = 3, lwd = 3)
}
clusterAndPlot(3)

dev.new( width = 12 )
par( mfrow = c(1,2) )
trackInstability <- function( searchFor, iter = 100 ) {
    ## This function tracks how often specific k-means clusters
    ## appear.  It generates a scatter plot with all observed
    ## centroids labeled, plus a bar plot showing the frequency of
    ## occurance of each particular cluster (based on its object
    ## membership)

    ## The goal here is to see how consistent clustering is for the
    ## data generated above. Do we always find the same clusters? If
    ## not, how often are given clusters observed?
    hist <- data.frame()
    lookup <- list()
    nr     <- nrow(dataFrame)
    for (i in 1:iter) {
        ## Cluster the data
        ko <- kmeans(dataFrame, centers = searchFor)
        ## Find object membership for each cluster, stored in a list
        clust <- list()
        for (j in 1:nr) {
            cnum <- as.character(ko$cluster[j])
            clust[[ cnum ]] <- c(clust[[ cnum ]], j)
        }
        ## Now turn each cluster into a unique key, and use that key
        ## to increment our tally list (counts)
        for (k in 1:searchFor) {
            cnum   <- as.character(k)
            key    <- paste(sort(clust[[cnum]]), collapse = ",")
            rowNum <- lookup[[ key ]]
            if (is.null(rowNum)) {
                # First time seeing this cluster
                row <- data.frame(Count = 1, Members = key,
                                  Size = length(clust[[cnum]]),
                                  X = ko$centers[k,1], Y = ko$centers[k,2])
                hist <- rbind(hist, row)
                lookup[[ key ]] <- nrow(hist)
            } else {
                hist$Count[ rowNum ] <- hist$Count[ rowNum ] + 1
            }
        }
    }
    numc <- nrow(hist)
    ## Sort our clusters by size
    hist      <- hist[order(hist$Size), ]
    hist$Name <- LETTERS[1:numc]
    hist$Freq <- hist$Count / iter

    ## Plot the points
    plot(dataFrame$x, dataFrame$y, col = "blue", pch = 1, cex = 1,
         ylab = "", xlab = "")
    text(x + 0.05, y + 0.05, labels = rownames(dataFrame),
         col = "blue", cex = 0.6)
    ## Add the centroids, labeled by name
    text(hist$X, hist$Y, labels = hist$Name, col = 1:numc, cex = 1, font = 2)
    ## I also wanted to add an ellipse defining the cluster, but the
    ## solution for that seems to involve the "car" package, which is
    ## pretty heavyweight: https://stackoverflow.com/a/26812165

    ## Plot the data as a barchart,
    ## https://stackoverflow.com/a/4217305 # Get x-coordinate of each bar
    bp <- barplot(hist$Freq, col = "white", names.arg = hist$Name,
                  border = 1:numc, ylab = "Frequency cluster is observed",
                  xlab = paste(c("Cluster Membership: centers =",searchFor),
                               collapse = " "))
    ## Add a list of the members for each cluster
    text(y = 0, x = bp - 0.3, labels = hist$Members, srt = 90, col = 1:numc,
         pos = 4)
    hist
}
ti <- trackInstability( 4, 300 )
	## Code was initially taken From Coursera lecture:
	## https://www.coursera.org/learn/exploratory-data-analysis/lecture/6hOqi/k-means-clustering-part-2
	set.seed(1234)
	## Generate 30 points randomly assorted around 3 centroids: (1,1), (2,2), (3,1)
	numPoints <- 30
	x <- rnorm(numPoints, mean = rep(1:3, each = numPoints/3), sd = 0.2)
	y <- rnorm(numPoints, mean = rep(c(1, 2, 1), each = numPoints/3), sd = 0.2)
	## Organize as DF
	dataFrame <- data.frame(x = x, y = y, row.names = 1:numPoints)

	## Look for the clusters. Because we generated three clusters above,
	## unambiguous results should be found when looking for 3. Trying
	## other cluster numbers (searchFor) will result in different
	## outcomes each time you re-run the analysis
	clusterAndPlot <- function( searchFor ) {
	kmeansObj <- kmeans(dataFrame, centers = searchFor)
	## Plot and label the input
	plot(x, y, col = kmeansObj$cluster, pch = 1, cex = 2)
	text(x + 0.05, y + 0.05, labels = as.character(1:numPoints), col = "blue")
	## Add the centroids
	points(kmeansObj$centers, col = 1:searchFor, pch = 3, cex = 3, lwd = 3)
	}
	clusterAndPlot(3)

	dev.new( width = 12 )
	par( mfrow = c(1,2) )
	trackInstability <- function( searchFor, iter = 100 ) {
	## This function tracks how often specific k-means clusters
	## appear. It generates a scatter plot with all observed
	## centroids labeled, plus a bar plot showing the frequency of
	## occurance of each particular cluster (based on its object
	## membership)

	## The goal here is to see how consistent clustering is for the
	## data generated above. Do we always find the same clusters? If
	## not, how often are given clusters observed?
	hist <- data.frame()
	lookup <- list()
	nr <- nrow(dataFrame)
	for (i in 1:iter) {
	## Cluster the data
	ko <- kmeans(dataFrame, centers = searchFor)
	## Find object membership for each cluster, stored in a list
	clust <- list()
	for (j in 1:nr) {
	cnum <- as.character(ko$cluster[j])
	clust[[ cnum ]] <- c(clust[[ cnum ]], j)
	}
	## Now turn each cluster into a unique key, and use that key
	## to increment our tally list (counts)
	for (k in 1:searchFor) {
	cnum <- as.character(k)
	key <- paste(sort(clust[[cnum]]), collapse = ",")
	rowNum <- lookup[[ key ]]
	if (is.null(rowNum)) {
	# First time seeing this cluster
	row <- data.frame(Count = 1, Members = key,
	Size = length(clust[[cnum]]),
	X = ko$centers[k,1], Y = ko$centers[k,2])
	hist <- rbind(hist, row)
	lookup[[ key ]] <- nrow(hist)
	} else {
	hist$Count[ rowNum ] <- hist$Count[ rowNum ] + 1
	}
	}
	}
	numc <- nrow(hist)
	## Sort our clusters by size
	hist <- hist[order(hist$Size), ]
	hist$Name <- LETTERS[1:numc]
	hist$Freq <- hist$Count / iter

	## Plot the points
	plot(dataFrame$x, dataFrame$y, col = "blue", pch = 1, cex = 1,
	ylab = "", xlab = "")
	text(x + 0.05, y + 0.05, labels = rownames(dataFrame),
	col = "blue", cex = 0.6)
	## Add the centroids, labeled by name
	text(hist$X, hist$Y, labels = hist$Name, col = 1:numc, cex = 1, font = 2)
	## I also wanted to add an ellipse defining the cluster, but the
	## solution for that seems to involve the "car" package, which is
	## pretty heavyweight: https://stackoverflow.com/a/26812165

	## Plot the data as a barchart,
	## https://stackoverflow.com/a/4217305 # Get x-coordinate of each bar
	bp <- barplot(hist$Freq, col = "white", names.arg = hist$Name,
	border = 1:numc, ylab = "Frequency cluster is observed",
	xlab = paste(c("Cluster Membership: centers =",searchFor),
	collapse = " "))
	## Add a list of the members for each cluster
	text(y = 0, x = bp - 0.3, labels = hist$Members, srt = 90, col = 1:numc,
	pos = 4)
	hist
	}
	ti <- trackInstability( 4, 300 )