Skip to content

Instantly share code, notes, and snippets.

@ramnathv
Created August 28, 2010 22:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ramnathv/555641 to your computer and use it in GitHub Desktop.
Save ramnathv/555641 to your computer and use it in GitHub Desktop.
MLS Analysis
# ANALYSIS OF MAJOR LEAGUE SOCCER
# MOTIVATION: http://thelogcabin.wordpress.com/2010/08/16/goals-per-game-in-mls/
# http://thelogcabin.wordpress.com/2010/08/23/a-rule-change-in-major-league-soccer/
# LOAD ALL REQUIRED LIBRARIES
library(XML);
library(reshape);
library(plyr);
library(ggplot2);
# SCRAPE THE DATA FROM WEB AND SAVE TO FILE
# Root address of the statistics pages; the season year and 'reg' are appended
# to it as path segments when a page is requested.
url.base <- "http://www.mlsoccer.com/stats"
# Seasons to download: every year from 1996 through 2010.
years <- seq.int(1996L, 2010L)
# save_data grabs the statistics page for one year and extracts the table with
# team statistics.
#
# Args:
#   y: season year; pasted onto url.base as a path segment, followed by 'reg'.
#
# Returns:
#   The last table on the page whose contents mention "United", as a data
#   frame with an extra `year` column set to y.
#
# Stops with an explicit error when no table on the page mentions "United".
save_data <- function(y) {
  url <- paste(url.base, as.character(y), "reg", sep = "/")
  # Spell out FALSE: the shorthand F is an ordinary variable that can be
  # reassigned elsewhere in a session.
  tab <- readHTMLTable(url, header = FALSE, stringsAsFactors = FALSE)
  # grep() coerces each parsed table to one string, so this yields the indices
  # of all tables containing the text "United" (presumably part of a team
  # name -- confirm against the page).
  hits <- grep("United", tab)
  if (length(hits) == 0) {
    # Without this guard max() returns -Inf and the failure surfaces as an
    # obscure subscript error.
    stop("no table mentioning 'United' found at ", url, call. = FALSE)
  }
  # When several tables match, keep the last one.
  pos <- max(hits)
  tab <- tab[[pos]]
  tab$year <- y
  tab
}
# extract team statistics across years and save as data frame
team.list <- llply(years, save_data)
mls <- merge_recurse(team.list)
# 16 scraped columns plus the added year column are expected. Check the count
# so that a changed page layout fails here instead of leaving columns with
# missing or wrong names.
stopifnot(ncol(mls) == 17L)
names(mls) <- c("Name", "GP", "G", "A", "SHT", "SOG", "PG", "PA", "FC",
                "FS", "YC", "RC", "SO", "SV", "SV%", "GAA", "Year")
# convert team names to factor and other stats to numeric
# lapply() always returns one element per column, whereas the shape of
# sapply()'s result depends on the number of rows.
mls[-1] <- lapply(mls[-1], as.numeric)
mls[[1]] <- as.factor(mls[[1]])
# save data frame as local csv file
write.csv(mls, "mls.csv", row.names = FALSE)
# EXPLORATORY ANALYSIS THROUGH PLOTS
# Plot1: Heatmap of Goals per Game across Years
# One tile per team (y axis) and year (x axis), shaded from white to red by
# G/GP; axis titles and the legend are suppressed.
plot1 <- ggplot(mls, aes(x = Year, y = Name)) +
  geom_tile(aes(fill = G / GP), colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  theme_bw() +
  labs(x = "", y = "") +
  # expand = c(0, 0) removes the padding around the tiles.
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  # theme() and element_blank() replace opts() and theme_blank(), which
  # current ggplot2 releases no longer provide.
  theme(legend.position = "none",
        axis.ticks = element_blank())
# Plot2: Goals per Game over the Years, one panel per team
# Built with ggplot() rather than qplot(), which is deprecated in current
# ggplot2; a one-sided facet formula in qplot() maps to facet_wrap().
plot2 <- ggplot(mls, aes(x = Year, y = G / GP)) +
  geom_line() +
  facet_wrap(~Name)
# Plot3: Average Statistics across Years
# Reshape to long form: one row per team, year and metric.
mls.m <- melt(mls, id.vars = c("Name", "Year"), variable_name = "metric")
# Mean of every metric within each year. na.rm = TRUE keeps a single value
# that failed numeric conversion from turning the whole year's mean into NA.
mls.1 <- ddply(mls.m, .(Year, metric), summarize,
               avg_val = mean(value, na.rm = TRUE))
# One panel per metric, each with its own y scale, showing the yearly means
# and a straight-line (lm) fit without a confidence band.
plot3 <- ggplot(mls.1, aes(Year, avg_val)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~metric, scales = "free_y", ncol = 5) +
  scale_x_continuous(breaks = c(1996, 2003, 2010))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment