Created
August 28, 2010 22:20
-
-
Save ramnathv/555641 to your computer and use it in GitHub Desktop.
MLS Analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ANALYSIS OF MAJOR LEAGUE SOCCER
# MOTIVATION: http://thelogcabin.wordpress.com/2010/08/16/goals-per-game-in-mls/
#             http://thelogcabin.wordpress.com/2010/08/23/a-rule-change-in-major-league-soccer/
# LOAD ALL REQUIRED LIBRARIES | |
library(XML); | |
library(reshape); | |
library(plyr); | |
library(ggplot2); | |
# SCRAPE THE DATA FROM WEB AND SAVE TO FILE
# Seasons to download, and the root URL that the per-season stats pages
# hang off.
years <- 1996:2010
url.base <- "http://www.mlsoccer.com/stats"
# save_data grabs data for one year and extracts the table with team statistics
#
# Args:
#   y: season year; the page fetched is '<url.base>/<y>/reg'.
#
# Returns:
#   The last table on the page whose contents match "United", as a data
#   frame with an extra `year` column set to y.
#
# Stops with an error if no table on the page matches "United".
save_data <- function(y) {
  url <- paste(url.base, as.character(y), "reg", sep = "/")
  tab <- readHTMLTable(url, header = FALSE, stringsAsFactors = FALSE)

  # grep() coerces each table in the list to text, so this finds the tables
  # that mention a team with "United" in its name.
  hits <- grep("United", tab)
  if (length(hits) == 0) {
    # Without this check max() of an empty match returns -Inf and the
    # subsequent [[ ]] fails with an unhelpful subscript error.
    stop("no team statistics table found at ", url, call. = FALSE)
  }

  # Several tables may match; the last one is taken as the team table.
  tab <- tab[[max(hits)]]
  tab$year <- y
  tab
}
# extract team statistics across years and save as data frame
# One data frame per season, combined into a single data frame.
team.list <- llply(years, save_data)
mls <- merge_recurse(team.list)
names(mls) <- c("Name", "GP", "G", "A", "SHT", "SOG", "PG", "PA", "FC",
                "FS", "YC", "RC", "SO", "SV", "SV%", "GAA", "Year")

# convert team names to factor and other stats to numeric
# lapply() always yields one element per column, whereas sapply() returns a
# matrix or a plain vector depending on how many rows there are.
# NOTE(review): as.numeric() turns any non-numeric cell into NA with a
# warning -- confirm the scraped tables contain only numeric strings.
mls[-1] <- lapply(mls[-1], as.numeric)
mls[[1]] <- as.factor(mls[[1]])

# save data frame as local csv file
write.csv(mls, "mls.csv", row.names = FALSE)
# EXPLORATORY ANALYSIS THROUGH PLOTS
# Plot1: Heatmap of Goals per Game across Years
# One tile per (Year, team) cell, shaded from white to red by G/GP, with
# white borders between tiles, no legend and no axis ticks.
# theme() / element_blank() are used because opts() / theme_blank() have
# been removed from current ggplot2 releases.
plot1 <- ggplot(mls, aes(x = Year, y = Name)) +
  geom_tile(aes(fill = G / GP), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  theme_bw() +
  labs(x = "", y = "") +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  theme(legend.position = "none",
        axis.ticks = element_blank())
# Plot2: Goals per Game over the years, one line panel per team.
# Written with ggplot() because qplot() is deprecated in current ggplot2;
# a one-sided facets formula in qplot() corresponds to facet_wrap().
plot2 <- ggplot(mls, aes(x = Year, y = G / GP)) +
  geom_line() +
  facet_wrap(~Name)
# Plot3: Average Statistics across Years
# Reshape to long form (one row per team, year and metric), average each
# metric within a year, then draw one panel per metric with a linear trend.
# `id.vars` is spelled out rather than relying on partial matching of `id`.
mls.m <- melt(mls, id.vars = c("Name", "Year"), variable_name = "metric")
# NOTE(review): mean() propagates NA, so a single NA value makes that
# year's average NA for the metric -- confirm whether na.rm = TRUE is wanted.
mls.1 <- ddply(mls.m, .(Year, metric), summarize, avg_val = mean(value))
plot3 <- ggplot(mls.1, aes(Year, avg_val)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~metric, scales = "free_y", ncol = 5) +
  scale_x_continuous(breaks = c(1996, 2003, 2010))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment