gdevenyi/SKM.R

## SKM.R
library (sparcl)

#input data
data <- read.csv("PIBPETSUVR_firstscan.csv", header=TRUE, row.names="RID")
#Throw away columns with dates and such not needed
data[1:5] <- list(NULL)
data[11] <- list(NULL)
data[ncol(data)] <- list(NULL)

#Initialize Empty Classification Table
classifications <- data.frame(cluster1=0*1:nrow(data),cluster2=0*1:nrow(data), row.names=row.names(data))

for (i in 1:10000) {

#Pick out 70% of the dataset
sample <- data[sample(nrow(data),0.7*nrow(data)),]
#Remaining 30% (data not in sample)
remaining <- subset(data, !row.names(data) %in% row.names(sample))

#Let cluster permute choose optimal tuning parameter
cat("Permutation", i, "\n")
result <- try(km.perm <- KMeansSparseCluster.permute(sample,K=2,nperms=100, silent=TRUE))
if(class(result) == "try-error") next;


# run sparse k-means with optimal tuning parameter
cat("Cluster", i, "\n")
km.out <- KMeansSparseCluster(sample,K=2,wbounds=km.perm$bestw)

#Extract weights and clusters
weights <- km.out[1][[1]]$ws
clusters <- as.data.frame(km.out[1][[1]]$Cs)

#Define Subset of Cluster1 and Cluster2 from Original Data
#Done by extracting subset from original data where the row names exist
#in the clustering results for cluster 1 or 2
cluster1 <- subset(data, row.names(data) %in% row.names(subset(clusters, clusters==1)))
cluster2 <- subset(data, row.names(data) %in% row.names(subset(clusters, clusters==2)))

#Compute Means Score of Cluster1 and Cluster2
#Multiply each column in Cluster by the corresponding element in ws
#Find the sum row-wise
#Find the mean of the sums
cluster1.mean <- mean(rowSums(as.data.frame(t(apply(cluster1,1,function(a)a*weights)))))
cluster2.mean <- mean(rowSums(as.data.frame(t(apply(cluster2,1,function(a)a*weights)))))

#Compute Mean of Mean (midpoint) of Cluster Values
cluster.midpoint <- mean(c(cluster1.mean,cluster2.mean))

#Print them out
print(cluster1.mean)
print(cluster2.mean)
print(cluster.midpoint)

#Compute scores of remaining data using weights, to allow for classification
remaining.scores <- rowSums(as.data.frame(t(apply(remaining,1,function(a)a*weights))))

#Because of the way the clustering code works, sometimes cluster1 has a larger mean,
#sometimes cluster2 has a larger mean
#We choose to always define cluster1 as having the larger mean, so we need to check

#Determine which RIDs are assigned to which cluster based on comparison to the midpoint
cluster1.assigned <- row.names(as.data.frame(remaining.scores[remaining.scores > cluster.midpoint]))
cluster2.assigned <- row.names(as.data.frame(remaining.scores[remaining.scores < cluster.midpoint]))

classifications[cluster1.assigned,'cluster1'] <- classifications[cluster1.assigned,'cluster1'] + 1
classifications[cluster2.assigned,'cluster2'] <- classifications[cluster2.assigned,'cluster2'] + 1
}
classifications
write.csv(classifications, file = "classifications.csv")
	library (sparcl)

	#input data
	data <- read.csv("PIBPETSUVR_firstscan.csv", header=TRUE, row.names="RID")
	#Throw away columns with dates and such not needed
	data[1:5] <- list(NULL)
	data[11] <- list(NULL)
	data[ncol(data)] <- list(NULL)

	#Initialize Empty Classification Table
	classifications <- data.frame(cluster1=01:nrow(data),cluster2=01:nrow(data), row.names=row.names(data))

	for (i in 1:10000) {

	#Pick out 70% of the dataset
	sample <- data[sample(nrow(data),0.7*nrow(data)),]
	#Remaining 30% (data not in sample)
	remaining <- subset(data, !row.names(data) %in% row.names(sample))

	#Let cluster permute choose optimal tuning parameter
	cat("Permutation", i, "\n")
	result <- try(km.perm <- KMeansSparseCluster.permute(sample,K=2,nperms=100, silent=TRUE))
	if(class(result) == "try-error") next;


	# run sparse k-means with optimal tuning parameter
	cat("Cluster", i, "\n")
	km.out <- KMeansSparseCluster(sample,K=2,wbounds=km.perm$bestw)

	#Extract weights and clusters
	weights <- km.out[1][[1]]$ws
	clusters <- as.data.frame(km.out[1][[1]]$Cs)

	#Define Subset of Cluster1 and Cluster2 from Original Data
	#Done by extracting subset from original data where the row names exist
	#in the clustering results for cluster 1 or 2
	cluster1 <- subset(data, row.names(data) %in% row.names(subset(clusters, clusters==1)))
	cluster2 <- subset(data, row.names(data) %in% row.names(subset(clusters, clusters==2)))

	#Compute Means Score of Cluster1 and Cluster2
	#Multiply each column in Cluster by the corresponding element in ws
	#Find the sum row-wise
	#Find the mean of the sums
	cluster1.mean <- mean(rowSums(as.data.frame(t(apply(cluster1,1,function(a)a*weights)))))
	cluster2.mean <- mean(rowSums(as.data.frame(t(apply(cluster2,1,function(a)a*weights)))))

	#Compute Mean of Mean (midpoint) of Cluster Values
	cluster.midpoint <- mean(c(cluster1.mean,cluster2.mean))

	#Print them out
	print(cluster1.mean)
	print(cluster2.mean)
	print(cluster.midpoint)

	#Compute scores of remaining data using weights, to allow for classification
	remaining.scores <- rowSums(as.data.frame(t(apply(remaining,1,function(a)a*weights))))

	#Because of the way the clustering code works, sometimes cluster1 has a larger mean,
	#sometimes cluster2 has a larger mean
	#We choose to always define cluster1 as having the larger mean, so we need to check

	#Determine which RIDs are assigned to which cluster based on comparison to the midpoint
	cluster1.assigned <- row.names(as.data.frame(remaining.scores[remaining.scores > cluster.midpoint]))
	cluster2.assigned <- row.names(as.data.frame(remaining.scores[remaining.scores < cluster.midpoint]))

	classifications[cluster1.assigned,'cluster1'] <- classifications[cluster1.assigned,'cluster1'] + 1
	classifications[cluster2.assigned,'cluster2'] <- classifications[cluster2.assigned,'cluster2'] + 1
	}
	classifications
	write.csv(classifications, file = "classifications.csv")