Created
February 12, 2017 16:20
-
-
Save AzharuddinKazi/a1505378ba19f31c6fd349427d924c8b to your computer and use it in GitHub Desktop.
applying simple clustering algorithms like hierarchical and k-means clustering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
rm(list=ls(all=TRUE)) | |
#Consider mtacrs data of R-datasets | |
data(mtcars) | |
mydata <- data.frame(mtcars) | |
mydata <- na.omit(mydata) # listwise deletion of missing | |
summary(mydata) | |
str(mydata) | |
mydata <- scale(mydata) # standardize variables | |
summary(mydata) | |
###------------------------- Hierarchical Clustering ------------------------### | |
# Ward's method | |
d <- dist(mydata, | |
method = "euclidean") # distance matrix | |
d | |
fit <- hclust(d, method="ward.D2") | |
plot(fit) # display dendogram | |
groups <- cutree(fit, k=6) # cut tree into 5 clusters | |
groups | |
# draw dendogram with red borders around the 5 clusters | |
rect.hclust(fit, k=6, border="red") | |
mtcars$cluster <- groups | |
###------------------------- K- means Clustering ------------------------### | |
# K-Means Cluster Analysis with k = 5 | |
fit <- kmeans(mydata, 5) # 5 cluster solution | |
#study the model and metrics | |
#With-in sum of squares in each cluster | |
fit$withinss | |
sum(fit$withinss) | |
fit$tot.withinss | |
#To check cluster number of each row in data | |
fit$cluster | |
#Cluster Centers | |
fit$centers | |
# get cluster means | |
aggregate(mydata,by=list(fit$cluster), | |
FUN=mean) | |
# append cluster label to the actual data frame | |
mydata <- data.frame(mydata,fit$cluster) | |
head(mydata) | |
# Determine number of clusters by considering the withinness measure | |
wss <- 0 | |
for (i in 1:15) { | |
wss[i] <- sum(kmeans(mydata,centers=i)$withinss) | |
} | |
#Scree Plot | |
plot(1:15, wss, | |
type="b", | |
xlab="Number of Clusters", | |
ylab="Within groups sum of squares") | |
#Now we can perform any classification algorithm on the data to classify a future data point | |
# For unseen data, we compute its distance from all the cluster centroids | |
# and assigns it to that cluster that is nearest to it | |
test_datapoint <- mtcars[sample(1:nrow(mtcars),1),] | |
closest.cluster <- function(x) { | |
cluster.dist <- apply(fit$centers, 1, function(y) sqrt(sum((x-y)^2))) | |
print(cluster.dist) | |
return(which.min(cluster.dist)[1]) | |
} | |
closest.cluster(test_datapoint) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment