Skip to content

Instantly share code, notes, and snippets.

@abelsonlive
Forked from dsparks/kMeansPP.R
Created November 29, 2012 15:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save abelsonlive/4169810 to your computer and use it in GitHub Desktop.
Save abelsonlive/4169810 to your computer and use it in GitHub Desktop.
k-means++ center initialization algorithm
# Package bootstrap.
# `doInstall` is the switch tested below; it has to be defined before use,
# otherwise the script stops with "object 'doInstall' not found".
# Set it to FALSE to never touch the network.
doInstall <- TRUE
toInstall <- c("proxy")
if (doInstall) {
  # Only download packages that are not already available locally.
  isMissing <- !vapply(toInstall, requireNamespace, logical(1), quietly = TRUE)
  if (any(isMissing)) {
    install.packages(
      toInstall[isMissing],
      repos = "http://cran.us.r-project.org"
    )
  }
}
# invisible() keeps the list returned by lapply() from being printed.
invisible(lapply(toInstall, library, character.only = TRUE))
# kmeans++ center initialization algorithm
#
# Chooses k rows of `df` to serve as starting centers for k-means. The first
# center is drawn uniformly at random; every later center is drawn with
# probability proportional to the squared Euclidean distance between a point
# and the nearest center picked so far.
#
# Arguments:
#   df      Numeric data frame or matrix, one observation per row. All values
#           must be finite.
#   k       Number of centers to pick; a whole number with 1 <= k <= nrow(df).
#   doPlot  If TRUE (and `df` has at least two columns) scatter-plot the first
#           two columns in gray and overlay the chosen centers.
#
# Returns a list with
#   Centers      data frame with k rows and ncol(df) columns (named X1..Xp)
#                holding the coordinates of the chosen centers;
#   whichPoints  integer vector of length k with the row indices of `df` that
#                were chosen.
#
# Stops with an error if `df` is not finite numeric data or `k` is out of range.
kMeansPP <- function(df, k, doPlot = TRUE) {
  pts <- as.matrix(df)
  nPts <- nrow(pts)

  if (!is.numeric(pts) || !all(is.finite(pts))) {
    stop("`df` must contain only finite numeric values", call. = FALSE)
  }
  if (!is.numeric(k) || length(k) != 1L || is.na(k) ||
      k != floor(k) || k < 1 || k > nPts) {
    stop("`k` must be a single whole number between 1 and nrow(df)",
         call. = FALSE)
  }

  whichPoints <- rep(NA_integer_, k)
  whichPoints[1] <- sample.int(nPts, 1L) # Initial center

  # Squared distance from every point to its nearest chosen center. It is
  # updated against the newest center only, so earlier distances are never
  # recomputed.
  nearestSq <- rep(Inf, nPts)

  # seq_len(k - 1) + 1 is empty for k == 1, whereas 2:k would count down 2, 1.
  for (kk in seq_len(k - 1L) + 1L) {
    newest <- pts[whichPoints[kk - 1L], ]
    nearestSq <- pmin(nearestSq, rowSums(sweep(pts, 2L, newest)^2))

    if (sum(nearestSq) > 0) {
      # Already-chosen points have weight 0 and cannot be drawn again.
      whichPoints[kk] <- sample.int(nPts, 1L, prob = nearestSq)
    } else {
      # Every point coincides with a chosen center, so the weights carry no
      # information; fall back to a uniform draw among the unused rows.
      warning("fewer than `k` distinct points; returning duplicate centers",
              call. = FALSE)
      remaining <- setdiff(seq_len(nPts), whichPoints[seq_len(kk - 1L)])
      whichPoints[kk] <- remaining[sample.int(length(remaining), 1L)]
    }
  }

  # unname() so the columns come out as X1..Xp and the rows as 1..k.
  kCenters <- data.frame(unname(pts[whichPoints, , drop = FALSE]))

  # Plotting uses the first two columns, so it needs at least two of them.
  if (isTRUE(doPlot) && ncol(pts) >= 2L) {
    plot(df[, 1:2], col = "GRAY")
    points(kCenters[, 1:2], col = seq_len(k), pch = 20)
  }

  list(Centers = kCenters, whichPoints = whichPoints)
}
# Test it
# Number of clusters, defined once here: `k` inside kMeansPP() is a function
# argument and is not visible at top level, so the colour vector below needs
# its own definition.
k <- 3
myData <- data.frame(x = rnorm(100), y = rnorm(100), z = rnorm(100))
plot(myData)
# kMeansPP() draws its own scatter plot of the first two columns, which
# replaces the plot above; the points() calls below add to that plot.
PPresult <- kMeansPP(myData, k)
convergedClusters <- kmeans(myData, centers = PPresult$Centers)
# Converged centers, one colour per cluster.
points(convergedClusters$centers, col = seq_len(k), pch = 10)
# Observations, coloured by the cluster they were assigned to.
points(myData, col = convergedClusters$cluster, cex = 1/2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment