Skip to content

Instantly share code, notes, and snippets.

@Btibert3
Created February 6, 2011 18:57
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Btibert3/813607 to your computer and use it in GitHub Desktop.
Save Btibert3/813607 to your computer and use it in GitHub Desktop.
Pull in Web data, and cluster them based on performance
###############################################################################
# Author: @BrockTibert
#
# Used: R Version 2.12.1, Windows 7 Pro, StatET Plugin for Eclipse
#
#
###############################################################################
#-----------------------------------------------------------------------
# set up script level basics
#-----------------------------------------------------------------------
## I code on the screen, so I set it wider for the console output
options(widht=180)
## libraries
library(XML)
library(plyr)
library(gdata)
library(sqldf)
## directory for the project
DIR <- "C:/Users/Brock/Documents/My Dropbox/Projects/NHL"
setwd(DIR)
#-----------------------------------------------------------------------
# Grab the skaters data from the season page
#-----------------------------------------------------------------------
## set the season
SEASON <- "2011"
## create the URL
URL <- paste("http://www.hockey-reference.com/leagues/NHL_", SEASON, "_skaters.html", sep="")
## grab the page -- the table is parsed nicely
tables <- readHTMLTable(URL)
ds.skaters <- tables$stats
## I don't like dealing with factors if I don't have to
## and I prefer lower case
for(i in 1:ncol(ds.skaters)) {
ds.skaters[,i] <- as.character(ds.skaters[,i])
names(ds.skaters) <- tolower(colnames(ds.skaters))
}
## fix a couple of the column names
colnames(ds.skaters)
names(ds.skaters)[10] <- "plusmin"
names(ds.skaters)[17] <- "spct"
## finally fix the columns - NAs forced by coercion warnings
for(i in c(1, 3, 6:18)) {
ds.skaters[,i] <- as.numeric(ds.skaters[, i])
}
## change the avg time on ice to a time format
ds.skaters$atoi <- strptime(ds.skaters$atoi, "%M:%S")
## remove the header and totals row
ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ]
ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ]
## save the datafile
write.table(ds.skaters, "Skaters.csv", sep=",", row.names=F)
## convert toi to seconds, and seconds/game
ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp
#-----------------------------------------------------------------------
# Quick overview of the 2011 skater data
#-----------------------------------------------------------------------
## quick look of the dataset
summary(ds.skaters)
## Best and worst +/-
head(ds.skaters[ds.skaters$plusmin == max(ds.skaters$plusmin), ])
head(ds.skaters[ds.skaters$plusmin == min(ds.skaters$plusmin), ]) # how is that deal working out for NJ?
## select the columns to analyze
play <- ds.skaters[, c(7:17, 20)]
## fix the NA's to zeros
for(i in 1:ncol(play)) {
# which(is.na(play[,i])) <- rows that meet the criteria of NA
play[which(is.na(play[,i])), i] <- 0
}
## look at the correlations
summary(play)
round(cor(play), 2)
#-----------------------------------------------------------------------
# Use PCA and clustering techniques to group the players
#-----------------------------------------------------------------------
## PCA analysis
## http://www.statmethods.net/advstats/factor.html
## the 1st component of princomp(play, cor=F) explained 99% variance
play.pca <- princomp(play, cor=T)
## what did we get for data?
summary(play.pca)
plot(play.pca)
png("PCA biplot.png")
biplot(play.pca)
dev.off()
## lets just use the first 5 = ~= 86% of the variance
play.pca.comp <- play.pca$scores[, 1:5]
## use hierarchical clustering to get a sense of how many clusters
## to feed to the kmeans procedure - need to feed distance matrix
## pull 40 random cases into the procedure
dist.mat <- dist(play.pca.comp[sample(1:nrow(play.pca.comp), 40), ], method="euclidian")
play.hclust <- hclust(dist.mat, method="ward")
png("Dendrogram.png")
plot(play.hclust)
dev.off()
## I am going to try to 4 clusters
kmean4 <- kmeans(play.pca.comp, centers=4, iter.max=12000)
kmean4$withinss
kmean4$size
#-----------------------------------------------------------------------
# Analyze the clusters
#-----------------------------------------------------------------------
## add the cluster membership to the player data
skaters <- cbind(ds.skaters, kmean4$cluster)
names(skaters)[ncol(skaters)] <- "cluster"
## make them a factor
skaters$cluster2 <- as.factor(skaters$cluster)
skaters$cluster2 <- reorder(skaters$cluster2, new.order=c(4,2,1,3))
## profile clusters by a few variables
table(skaters$cluster2)
ddply(skaters[c("cluster", "pts")], .(cluster), mean, na.rm=T)
## boxplot these clusters
png("4 Cluster by Points Boxplot.png")
boxplot(pts ~ cluster2, data=skaters, main="Boxplot of Clusters by Points \nAs of Feb 7, 2011")
dev.off()
## cluster membership by team
addmargins(table(skaters$tm, skaters$cluster2))
(team.dist <- round(prop.table(table(skaters$cluster2, skaters$tm), 2), 3))
## plot of the table
## Rotate labels = http://stackoverflow.com/questions/1828742/rotating-axis-labels-in-r
## http://research.stowers-institute.org/efg/R/Color/Chart/
## type colors() to see names for the graph on link above
mycol <- colors()
mycol <- c("red4", "orangered", "burlywood1", "beige")
png("Team Distribution.png")
barplot(team.dist, horiz=T, cex.names=.75, las=1,
col=mycol, main="Distribution of Clusters by Team", xlab="% of Team")
abline(v=.25, lty=2)
abline(v=.5, lty=2)
abline(v=.75, lty=2)
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment