pnewhook/Cluster NHL Players.R

## Cluster NHL Players.R
###############################################################################
# Author: @BrockTibert
#
# Used: R Version 2.12.1, Windows 7 Pro, StatET Plugin for Eclipse
#
#
###############################################################################

#-----------------------------------------------------------------------
# set up script level basics
#-----------------------------------------------------------------------

## I code on the screen, so I set it wider for the console output
options(widht=180)

## libraries
library(XML)
library(plyr)
library(gdata)
library(sqldf)


## directory for the project
DIR <- "C:/Users/Brock/Documents/My Dropbox/Projects/NHL"
setwd(DIR)


#-----------------------------------------------------------------------
# Grab the skaters data from the season page
#-----------------------------------------------------------------------

## set the season
SEASON <- "2011"

## create the URL
URL <- paste("http://www.hockey-reference.com/leagues/NHL_", SEASON, "_skaters.html", sep="")


## grab the page -- the table is parsed nicely
tables <- readHTMLTable(URL)
ds.skaters <- tables$stats

## I don't like dealing with factors if I don't have to
## and I prefer lower case
for(i in 1:ncol(ds.skaters)) {
	ds.skaters[,i] <- as.character(ds.skaters[,i])
	names(ds.skaters) <- tolower(colnames(ds.skaters))
}

## fix a couple of the column names
colnames(ds.skaters)
names(ds.skaters)[10] <- "plusmin"
names(ds.skaters)[17] <- "spct"

## finally fix the columns - NAs forced by coercion warnings
for(i in c(1, 3, 6:18)) {
	ds.skaters[,i] <- as.numeric(ds.skaters[, i])
}

## change the avg time on ice to a time format
ds.skaters$atoi <- strptime(ds.skaters$atoi, "%M:%S")

## remove the header and totals row
ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ]
ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ]

## save the datafile
write.table(ds.skaters, "Skaters.csv", sep=",", row.names=F)

## convert toi to seconds, and seconds/game
ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp

#-----------------------------------------------------------------------
# Quick overview of the 2011 skater data
#-----------------------------------------------------------------------

## quick look of the dataset
summary(ds.skaters)

## Best and worst +/-
head(ds.skaters[ds.skaters$plusmin == max(ds.skaters$plusmin), ])
head(ds.skaters[ds.skaters$plusmin == min(ds.skaters$plusmin), ]) # how is that deal working out for NJ?

## select the columns to analyze
play <- ds.skaters[, c(7:17, 20)]


## fix the NA's to zeros
for(i in 1:ncol(play)) {
#	which(is.na(play[,i])) <- rows that meet the criteria of NA
	play[which(is.na(play[,i])), i] <- 0
}

## look at the correlations
summary(play)
round(cor(play), 2)

#-----------------------------------------------------------------------
# Use PCA and clustering techniques to group the players
#-----------------------------------------------------------------------

## PCA analysis
## http://www.statmethods.net/advstats/factor.html
## the 1st component of princomp(play, cor=F) explained 99% variance
play.pca <- princomp(play, cor=T)


## what did we get for data?
summary(play.pca)
plot(play.pca)
png("PCA biplot.png")
biplot(play.pca)
dev.off()


## lets just use the first 5 = ~= 86% of the variance
play.pca.comp <- play.pca$scores[, 1:5]


## use hierarchical clustering to get a sense of how many clusters
## to feed to the kmeans procedure - need to feed distance matrix
## pull 40 random cases into the procedure
dist.mat <- dist(play.pca.comp[sample(1:nrow(play.pca.comp), 40), ], method="euclidian")
play.hclust <- hclust(dist.mat, method="ward")
png("Dendrogram.png")
plot(play.hclust)
dev.off()

## I am going to try to 4 clusters
kmean4 <- kmeans(play.pca.comp, centers=4, iter.max=12000)
kmean4$withinss
kmean4$size


#-----------------------------------------------------------------------
# Analyze the clusters
#-----------------------------------------------------------------------

## add the cluster membership to the player data
skaters <- cbind(ds.skaters, kmean4$cluster)
names(skaters)[ncol(skaters)] <- "cluster"

## make them a factor
skaters$cluster2 <- as.factor(skaters$cluster)
skaters$cluster2 <- reorder(skaters$cluster2, new.order=c(4,2,1,3))

## profile clusters by a few variables
table(skaters$cluster2)
ddply(skaters[c("cluster", "pts")], .(cluster), mean, na.rm=T)

## boxplot these clusters
png("4 Cluster by Points Boxplot.png")
boxplot(pts ~ cluster2, data=skaters, main="Boxplot of Clusters by Points \nAs of Feb 7, 2011")
dev.off()


## cluster membership by team
addmargins(table(skaters$tm, skaters$cluster2))
(team.dist <- round(prop.table(table(skaters$cluster2, skaters$tm), 2), 3))


## plot of the table
## Rotate labels = http://stackoverflow.com/questions/1828742/rotating-axis-labels-in-r
## http://research.stowers-institute.org/efg/R/Color/Chart/
## type colors() to see names for the graph on link above
mycol <- colors()
mycol <- c("red4", "orangered", "burlywood1",  "beige")
png("Team Distribution.png")
barplot(team.dist, horiz=T, cex.names=.75, las=1,
		col=mycol, main="Distribution of Clusters by Team", xlab="% of Team")
abline(v=.25, lty=2)
abline(v=.5, lty=2)
abline(v=.75, lty=2)
dev.off()
	###############################################################################
	# Author: @BrockTibert
	#
	# Used: R Version 2.12.1, Windows 7 Pro, StatET Plugin for Eclipse
	#
	#
	###############################################################################

	#-----------------------------------------------------------------------
	# set up script level basics
	#-----------------------------------------------------------------------

	## I code on the screen, so I set it wider for the console output
	options(widht=180)

	## libraries
	library(XML)
	library(plyr)
	library(gdata)
	library(sqldf)


	## directory for the project
	DIR <- "C:/Users/Brock/Documents/My Dropbox/Projects/NHL"
	setwd(DIR)



	#-----------------------------------------------------------------------
	# Grab the skaters data from the season page
	#-----------------------------------------------------------------------

	## set the season
	SEASON <- "2011"

	## create the URL
	URL <- paste("http://www.hockey-reference.com/leagues/NHL_", SEASON, "_skaters.html", sep="")


	## grab the page -- the table is parsed nicely
	tables <- readHTMLTable(URL)
	ds.skaters <- tables$stats

	## I don't like dealing with factors if I don't have to
	## and I prefer lower case
	for(i in 1:ncol(ds.skaters)) {
	ds.skaters[,i] <- as.character(ds.skaters[,i])
	names(ds.skaters) <- tolower(colnames(ds.skaters))
	}

	## fix a couple of the column names
	colnames(ds.skaters)
	names(ds.skaters)[10] <- "plusmin"
	names(ds.skaters)[17] <- "spct"

	## finally fix the columns - NAs forced by coercion warnings
	for(i in c(1, 3, 6:18)) {
	ds.skaters[,i] <- as.numeric(ds.skaters[, i])
	}

	## change the avg time on ice to a time format
	ds.skaters$atoi <- strptime(ds.skaters$atoi, "%M:%S")

	## remove the header and totals row
	ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ]
	ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ]

	## save the datafile
	write.table(ds.skaters, "Skaters.csv", sep=",", row.names=F)

	## convert toi to seconds, and seconds/game
	ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp

	#-----------------------------------------------------------------------
	# Quick overview of the 2011 skater data
	#-----------------------------------------------------------------------

	## quick look of the dataset
	summary(ds.skaters)

	## Best and worst +/-
	head(ds.skaters[ds.skaters$plusmin == max(ds.skaters$plusmin), ])
	head(ds.skaters[ds.skaters$plusmin == min(ds.skaters$plusmin), ]) # how is that deal working out for NJ?

	## select the columns to analyze
	play <- ds.skaters[, c(7:17, 20)]


	## fix the NA's to zeros
	for(i in 1:ncol(play)) {
	# which(is.na(play[,i])) <- rows that meet the criteria of NA
	play[which(is.na(play[,i])), i] <- 0
	}

	## look at the correlations
	summary(play)
	round(cor(play), 2)

	#-----------------------------------------------------------------------
	# Use PCA and clustering techniques to group the players
	#-----------------------------------------------------------------------

	## PCA analysis
	## http://www.statmethods.net/advstats/factor.html
	## the 1st component of princomp(play, cor=F) explained 99% variance
	play.pca <- princomp(play, cor=T)


	## what did we get for data?
	summary(play.pca)
	plot(play.pca)
	png("PCA biplot.png")
	biplot(play.pca)
	dev.off()


	## lets just use the first 5 = ~= 86% of the variance
	play.pca.comp <- play.pca$scores[, 1:5]


	## use hierarchical clustering to get a sense of how many clusters
	## to feed to the kmeans procedure - need to feed distance matrix
	## pull 40 random cases into the procedure
	dist.mat <- dist(play.pca.comp[sample(1:nrow(play.pca.comp), 40), ], method="euclidian")
	play.hclust <- hclust(dist.mat, method="ward")
	png("Dendrogram.png")
	plot(play.hclust)
	dev.off()

	## I am going to try to 4 clusters
	kmean4 <- kmeans(play.pca.comp, centers=4, iter.max=12000)
	kmean4$withinss
	kmean4$size



	#-----------------------------------------------------------------------
	# Analyze the clusters
	#-----------------------------------------------------------------------

	## add the cluster membership to the player data
	skaters <- cbind(ds.skaters, kmean4$cluster)
	names(skaters)[ncol(skaters)] <- "cluster"

	## make them a factor
	skaters$cluster2 <- as.factor(skaters$cluster)
	skaters$cluster2 <- reorder(skaters$cluster2, new.order=c(4,2,1,3))

	## profile clusters by a few variables
	table(skaters$cluster2)
	ddply(skaters[c("cluster", "pts")], .(cluster), mean, na.rm=T)

	## boxplot these clusters
	png("4 Cluster by Points Boxplot.png")
	boxplot(pts ~ cluster2, data=skaters, main="Boxplot of Clusters by Points \nAs of Feb 7, 2011")
	dev.off()


	## cluster membership by team
	addmargins(table(skaters$tm, skaters$cluster2))
	(team.dist <- round(prop.table(table(skaters$cluster2, skaters$tm), 2), 3))


	## plot of the table
	## Rotate labels = http://stackoverflow.com/questions/1828742/rotating-axis-labels-in-r
	## http://research.stowers-institute.org/efg/R/Color/Chart/
	## type colors() to see names for the graph on link above
	mycol <- colors()
	mycol <- c("red4", "orangered", "burlywood1", "beige")
	png("Team Distribution.png")
	barplot(team.dist, horiz=T, cex.names=.75, las=1,
	col=mycol, main="Distribution of Clusters by Team", xlab="% of Team")
	abline(v=.25, lty=2)
	abline(v=.5, lty=2)
	abline(v=.75, lty=2)
	dev.off()