Created
November 13, 2014 02:11
-
-
Save gdevenyi/c2e545982bcc2c93ffa4 to your computer and use it in GitHub Desktop.
KMeansClustering sampling in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library (sparcl) | |
#input data | |
data <- read.csv("PIBPETSUVR_firstscan.csv", header=TRUE, row.names="RID") | |
#Throw away columns with dates and such not needed | |
data[1:5] <- list(NULL) | |
data[11] <- list(NULL) | |
data[ncol(data)] <- list(NULL) | |
#Initialize Empty Classification Table | |
classifications <- data.frame(cluster1=0*1:nrow(data),cluster2=0*1:nrow(data), row.names=row.names(data)) | |
for (i in 1:10000) { | |
#Pick out 70% of the dataset | |
sample <- data[sample(nrow(data),0.7*nrow(data)),] | |
#Remaining 30% (data not in sample) | |
remaining <- subset(data, !row.names(data) %in% row.names(sample)) | |
#Let cluster permute choose optimal tuning parameter | |
cat("Permutation", i, "\n") | |
result <- try(km.perm <- KMeansSparseCluster.permute(sample,K=2,nperms=100, silent=TRUE)) | |
if(class(result) == "try-error") next; | |
# run sparse k-means with optimal tuning parameter | |
cat("Cluster", i, "\n") | |
km.out <- KMeansSparseCluster(sample,K=2,wbounds=km.perm$bestw) | |
#Extract weights and clusters | |
weights <- km.out[1][[1]]$ws | |
clusters <- as.data.frame(km.out[1][[1]]$Cs) | |
#Define Subset of Cluster1 and Cluster2 from Original Data | |
#Done by extracting subset from original data where the row names exist | |
#in the clustering results for cluster 1 or 2 | |
cluster1 <- subset(data, row.names(data) %in% row.names(subset(clusters, clusters==1))) | |
cluster2 <- subset(data, row.names(data) %in% row.names(subset(clusters, clusters==2))) | |
#Compute Means Score of Cluster1 and Cluster2 | |
#Multiply each column in Cluster by the corresponding element in ws | |
#Find the sum row-wise | |
#Find the mean of the sums | |
cluster1.mean <- mean(rowSums(as.data.frame(t(apply(cluster1,1,function(a)a*weights))))) | |
cluster2.mean <- mean(rowSums(as.data.frame(t(apply(cluster2,1,function(a)a*weights))))) | |
#Compute Mean of Mean (midpoint) of Cluster Values | |
cluster.midpoint <- mean(c(cluster1.mean,cluster2.mean)) | |
#Print them out | |
print(cluster1.mean) | |
print(cluster2.mean) | |
print(cluster.midpoint) | |
#Compute scores of remaining data using weights, to allow for classification | |
remaining.scores <- rowSums(as.data.frame(t(apply(remaining,1,function(a)a*weights)))) | |
#Because of the way the clustering code works, sometimes cluster1 has a larger mean, | |
#sometimes cluster2 has a larger mean | |
#We choose to always define cluster1 as having the larger mean, so we need to check | |
#Determine which RIDs are assigned to which cluster based on comparison to the midpoint | |
cluster1.assigned <- row.names(as.data.frame(remaining.scores[remaining.scores > cluster.midpoint])) | |
cluster2.assigned <- row.names(as.data.frame(remaining.scores[remaining.scores < cluster.midpoint])) | |
classifications[cluster1.assigned,'cluster1'] <- classifications[cluster1.assigned,'cluster1'] + 1 | |
classifications[cluster2.assigned,'cluster2'] <- classifications[cluster2.assigned,'cluster2'] + 1 | |
} | |
classifications | |
write.csv(classifications, file = "classifications.csv") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Dear Dr. Devenyi,
I came to this script from the "https://github.com/jelman/SKM_cluster".
I was having errors using the SKM_cluster by jelman due to compatibility issues with pandas, rpy2, etc. >.<
I'm running your R script and it's been going through well, although the iterations have not been completed yet. I'll see it in couple of hours, I hope.
I wanted to thank you for your work, and was wondering if I can email you with questions if I come across problems with this.
Thank you so much for your time!
Dahyun
--