-
-
Save pnewhook/5009774 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
############################################################################### | |
# Author: @BrockTibert | |
# | |
# Used: R Version 2.12.1, Windows 7 Pro, StatET Plugin for Eclipse | |
# | |
# | |
############################################################################### | |
#----------------------------------------------------------------------- | |
# set up script level basics | |
#----------------------------------------------------------------------- | |
## I code on the screen, so I set it wider for the console output | |
options(widht=180) | |
## libraries | |
library(XML) | |
library(plyr) | |
library(gdata) | |
library(sqldf) | |
## directory for the project | |
DIR <- "C:/Users/Brock/Documents/My Dropbox/Projects/NHL" | |
setwd(DIR) | |
#----------------------------------------------------------------------- | |
# Grab the skaters data from the season page | |
#----------------------------------------------------------------------- | |
## set the season | |
SEASON <- "2011" | |
## create the URL | |
URL <- paste("http://www.hockey-reference.com/leagues/NHL_", SEASON, "_skaters.html", sep="") | |
## grab the page -- the table is parsed nicely | |
tables <- readHTMLTable(URL) | |
ds.skaters <- tables$stats | |
## I don't like dealing with factors if I don't have to | |
## and I prefer lower case | |
for(i in 1:ncol(ds.skaters)) { | |
ds.skaters[,i] <- as.character(ds.skaters[,i]) | |
names(ds.skaters) <- tolower(colnames(ds.skaters)) | |
} | |
## fix a couple of the column names | |
colnames(ds.skaters) | |
names(ds.skaters)[10] <- "plusmin" | |
names(ds.skaters)[17] <- "spct" | |
## finally fix the columns - NAs forced by coercion warnings | |
for(i in c(1, 3, 6:18)) { | |
ds.skaters[,i] <- as.numeric(ds.skaters[, i]) | |
} | |
## change the avg time on ice to a time format | |
ds.skaters$atoi <- strptime(ds.skaters$atoi, "%M:%S") | |
## remove the header and totals row | |
ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ] | |
ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ] | |
## save the datafile | |
write.table(ds.skaters, "Skaters.csv", sep=",", row.names=F) | |
## convert toi to seconds, and seconds/game | |
ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp | |
#----------------------------------------------------------------------- | |
# Quick overview of the 2011 skater data | |
#----------------------------------------------------------------------- | |
## quick look of the dataset | |
summary(ds.skaters) | |
## Best and worst +/- | |
head(ds.skaters[ds.skaters$plusmin == max(ds.skaters$plusmin), ]) | |
head(ds.skaters[ds.skaters$plusmin == min(ds.skaters$plusmin), ]) # how is that deal working out for NJ? | |
## select the columns to analyze | |
play <- ds.skaters[, c(7:17, 20)] | |
## fix the NA's to zeros | |
for(i in 1:ncol(play)) { | |
# which(is.na(play[,i])) <- rows that meet the criteria of NA | |
play[which(is.na(play[,i])), i] <- 0 | |
} | |
## look at the correlations | |
summary(play) | |
round(cor(play), 2) | |
#----------------------------------------------------------------------- | |
# Use PCA and clustering techniques to group the players | |
#----------------------------------------------------------------------- | |
## PCA analysis | |
## http://www.statmethods.net/advstats/factor.html | |
## the 1st component of princomp(play, cor=F) explained 99% variance | |
play.pca <- princomp(play, cor=T) | |
## what did we get for data? | |
summary(play.pca) | |
plot(play.pca) | |
png("PCA biplot.png") | |
biplot(play.pca) | |
dev.off() | |
## lets just use the first 5 = ~= 86% of the variance | |
play.pca.comp <- play.pca$scores[, 1:5] | |
## use hierarchical clustering to get a sense of how many clusters | |
## to feed to the kmeans procedure - need to feed distance matrix | |
## pull 40 random cases into the procedure | |
dist.mat <- dist(play.pca.comp[sample(1:nrow(play.pca.comp), 40), ], method="euclidian") | |
play.hclust <- hclust(dist.mat, method="ward") | |
png("Dendrogram.png") | |
plot(play.hclust) | |
dev.off() | |
## I am going to try to 4 clusters | |
kmean4 <- kmeans(play.pca.comp, centers=4, iter.max=12000) | |
kmean4$withinss | |
kmean4$size | |
#----------------------------------------------------------------------- | |
# Analyze the clusters | |
#----------------------------------------------------------------------- | |
## add the cluster membership to the player data | |
skaters <- cbind(ds.skaters, kmean4$cluster) | |
names(skaters)[ncol(skaters)] <- "cluster" | |
## make them a factor | |
skaters$cluster2 <- as.factor(skaters$cluster) | |
skaters$cluster2 <- reorder(skaters$cluster2, new.order=c(4,2,1,3)) | |
## profile clusters by a few variables | |
table(skaters$cluster2) | |
ddply(skaters[c("cluster", "pts")], .(cluster), mean, na.rm=T) | |
## boxplot these clusters | |
png("4 Cluster by Points Boxplot.png") | |
boxplot(pts ~ cluster2, data=skaters, main="Boxplot of Clusters by Points \nAs of Feb 7, 2011") | |
dev.off() | |
## cluster membership by team | |
addmargins(table(skaters$tm, skaters$cluster2)) | |
(team.dist <- round(prop.table(table(skaters$cluster2, skaters$tm), 2), 3)) | |
## plot of the table | |
## Rotate labels = http://stackoverflow.com/questions/1828742/rotating-axis-labels-in-r | |
## http://research.stowers-institute.org/efg/R/Color/Chart/ | |
## type colors() to see names for the graph on link above | |
mycol <- colors() | |
mycol <- c("red4", "orangered", "burlywood1", "beige") | |
png("Team Distribution.png") | |
barplot(team.dist, horiz=T, cex.names=.75, las=1, | |
col=mycol, main="Distribution of Clusters by Team", xlab="% of Team") | |
abline(v=.25, lty=2) | |
abline(v=.5, lty=2) | |
abline(v=.75, lty=2) | |
dev.off() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment