# MLB runs per game (Lahman database)
# Load the Lahman package and its "Teams" data set.
# The package is installed only when it is not already available, so that
# re-running the script does not trigger a fresh download every time.
if (!requireNamespace("Lahman", quietly = TRUE)) {
  install.packages("Lahman")
}
library("Lahman")
data(Teams)
#
#
# CREATE LEAGUE SUMMARY TABLES
# ============================
#
# Keep team-seasons from 1901 (the establishment of the American League)
# onward. No upper bound is applied to yearID, so the series runs through the
# latest season contained in the loaded Teams table.
Teams_sub <- as.data.frame(subset(Teams, yearID > 1900))
#
# each team's average runs scored and runs allowed per game
Teams_sub$RPG <- Teams_sub$R / Teams_sub$G
Teams_sub$RAPG <- Teams_sub$RA / Teams_sub$G
#
# season totals of runs, runs allowed and games for each league
LG_RPG <- aggregate(
  cbind(R, RA, G) ~ yearID + lgID,
  data = Teams_sub,
  FUN = sum
)
# league + season runs and runs allowed per game
LG_RPG$LG_RPG <- LG_RPG$R / LG_RPG$G
LG_RPG$LG_RAPG <- LG_RPG$RA / LG_RPG$G
#
# Split into one table per league. LG_RPG is built from Teams_sub, so the
# year filter has already been applied; rows with any other lgID are left out.
ALseason <- subset(LG_RPG, lgID == "AL")
NLseason <- subset(LG_RPG, lgID == "NL")
# Later steps draw lines through these tables and subtract NL from AL row by
# row, so make the season ordering explicit rather than relying on the order
# in which aggregate() happens to return its rows.
ALseason <- ALseason[order(ALseason$yearID), ]
NLseason <- NLseason[order(NLseason$yearID), ]
#
# +++++++++++++++++++++++++++++++++++++++++++++++++++
#
# RUNS SCORED PER GAME
# ====================
#
# web references:
# http://princeofslides.blogspot.ca/2011/05/sab-r-metrics-basics-of-loess.html
# http://research.stowers-institute.org/efg/R/Statistics/loess.htm
#
# AMERICAN LEAGUE
# Fit loess smoothers of league runs per game against season: one with the
# default span and two with an explicit "span". predict() without new data
# returns the fitted value for each season in ALseason.
# (Inside the formula, LG_RPG is the column of ALseason, not the data frame
# of the same name.)
ALRunScore.LO <- loess(LG_RPG ~ yearID, data = ALseason)
ALRunScore.LO.predict <- predict(ALRunScore.LO)
#
ALRunScore.LO.25 <- loess(LG_RPG ~ yearID, data = ALseason, span = 0.25)
ALRunScore.LO.25.predict <- predict(ALRunScore.LO.25)
#
ALRunScore.LO.5 <- loess(LG_RPG ~ yearID, data = ALseason, span = 0.5)
ALRunScore.LO.5.predict <- predict(ALRunScore.LO.5)
#
# plot the season values, then overlay the three loess curves
ylim <- c(3, 6)
plot(ALseason$yearID, ALseason$LG_RPG,
     ylim = ylim,
     # the year range in the title follows the data actually plotted
     main = paste0("American League: runs per team per game, ",
                   min(ALseason$yearID), "-", max(ALseason$yearID)),
     xlab = "year", ylab = "runs per game")
# loess predicted value lines
lines(ALseason$yearID, ALRunScore.LO.predict,
      lty = "solid", col = "red", lwd = 2)
lines(ALseason$yearID, ALRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
lines(ALseason$yearID, ALRunScore.LO.5.predict,
      lty = "dotdash", col = "black", lwd = 2)
legend(1980, 3.5,
       c("default", "span=0.25", "span=0.50"),
       lty = c("solid", "dashed", "dotdash"),
       col = c("red", "blue", "black"),
       lwd = c(2, 2, 2))
grid()
#
# NATIONAL LEAGUE
# Fit loess smoothers of league runs per game against season: one with the
# default span and two with an explicit "span". predict() without new data
# returns the fitted value for each season in NLseason.
# (Inside the formula, LG_RPG is the column of NLseason, not the data frame
# of the same name.)
NLRunScore.LO <- loess(LG_RPG ~ yearID, data = NLseason)
NLRunScore.LO.predict <- predict(NLRunScore.LO)
#
# objects with span control in loess model
NLRunScore.LO.25 <- loess(LG_RPG ~ yearID, data = NLseason, span = 0.25)
NLRunScore.LO.25.predict <- predict(NLRunScore.LO.25)
NLRunScore.LO.5 <- loess(LG_RPG ~ yearID, data = NLseason, span = 0.5)
NLRunScore.LO.5.predict <- predict(NLRunScore.LO.5)
#
# plot the season values (triangles), then overlay the three loess curves
ylim <- c(3, 6)
plot(NLseason$yearID, NLseason$LG_RPG,
     pch = 2, col = "black",
     ylim = ylim,
     # the year range in the title follows the data actually plotted
     main = paste0("National League: runs per team per game, ",
                   min(NLseason$yearID), "-", max(NLseason$yearID)),
     xlab = "year", ylab = "runs per game")
# loess predicted value lines
lines(NLseason$yearID, NLRunScore.LO.predict,
      lty = "solid", col = "blue", lwd = 2)
lines(NLseason$yearID, NLRunScore.LO.25.predict,
      lty = "dashed", col = "red", lwd = 2)
lines(NLseason$yearID, NLRunScore.LO.5.predict,
      lty = "dotdash", col = "black", lwd = 2)
# chart tidying
legend(1980, 3.5,
       c("default", "span=0.25", "span=0.50"),
       lty = c("solid", "dashed", "dotdash"),
       col = c("blue", "red", "black"),
       lwd = c(2, 2, 2))
grid()
#
#
# MULTI-PLOT -- MERGING AL AND NL RESULTS
# plot the individual season values of both leagues as lines
ylim <- c(3, 6)
# start with AL line
plot(ALseason$yearID, ALseason$LG_RPG,
     type = "l", lty = "solid", col = "red", lwd = 2,
     # the year range in the title follows the data actually plotted
     main = paste0("Runs per team per game, ",
                   min(ALseason$yearID, NLseason$yearID), "-",
                   max(ALseason$yearID, NLseason$yearID)),
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add NL line
lines(NLseason$yearID, NLseason$LG_RPG, lty = "solid", col = "blue", lwd = 2)
# chart additions
grid()
legend(1900, 3.5,
       c("AL", "NL"),
       lty = c("solid", "solid"),
       col = c("red", "blue"),
       lwd = c(2, 2))
#
#
# plot loess curves (span=0.25) for both leagues on one chart
ylim <- c(3, 6)
# start with AL line
plot(ALseason$yearID, ALRunScore.LO.25.predict,
     type = "l", lty = "solid", col = "red", lwd = 2,
     # the year range in the title follows the data actually plotted
     main = paste0("Runs per team per game, ",
                   min(ALseason$yearID, NLseason$yearID), "-",
                   max(ALseason$yearID, NLseason$yearID)),
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add NL line
lines(NLseason$yearID, NLRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
# chart additions
legend(1900, 3.5,
       c("AL (span=0.25)", "NL (span=0.25)"),
       lty = c("solid", "dashed"),
       col = c("red", "blue"),
       lwd = c(2, 2))
grid()
#
#
# plot loess curves (span=0.50) for both leagues on one chart
ylim <- c(3, 6)
# start with AL line
plot(ALseason$yearID, ALRunScore.LO.5.predict,
     type = "l", lty = "solid", col = "red", lwd = 2,
     # the year range in the title follows the data actually plotted
     main = paste0("Runs per team per game, ",
                   min(ALseason$yearID, NLseason$yearID), "-",
                   max(ALseason$yearID, NLseason$yearID)),
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add NL line
lines(NLseason$yearID, NLRunScore.LO.5.predict,
      lty = "dashed", col = "blue", lwd = 2)
# chart additions
legend(1900, 3.5,
       c("AL (span=0.50)", "NL (span=0.50)"),
       lty = c("solid", "dashed"),
       col = c("red", "blue"),
       lwd = c(2, 2))
grid()
#
#
#
# plot multiple loess curves (span=0.50 and 0.25) for both leagues
# colour identifies the league, line type identifies the span
ylim <- c(3, 6)
# start with AL line (span=0.50)
plot(ALseason$yearID, ALRunScore.LO.5.predict,
     type = "l", lty = "solid", col = "red", lwd = 2,
     # the year range in the title follows the data actually plotted
     main = paste0("Runs per team per game, ",
                   min(ALseason$yearID, NLseason$yearID), "-",
                   max(ALseason$yearID, NLseason$yearID)),
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add NL line (span=0.50)
lines(NLseason$yearID, NLRunScore.LO.5.predict,
      lty = "solid", col = "blue", lwd = 2)
# add 0.25 lines
lines(ALseason$yearID, ALRunScore.LO.25.predict,
      lty = "dashed", col = "red", lwd = 2)
lines(NLseason$yearID, NLRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
# chart additions
legend(1900, 3.5,
       c("AL (span=0.50)", "NL (span=0.50)",
         "AL (span=0.25)", "NL (span=0.25)"),
       lty = c("solid", "solid", "dashed", "dashed"),
       col = c("red", "blue", "red", "blue"),
       lwd = c(2, 2, 2, 2))
grid()
#
# # # # # # # # # # # # # # # # # #
#
# calculate the difference between the two leagues (AL minus NL)
# Both differences are taken position by position, so the AL and NL tables
# must hold exactly the same seasons in the same order; stop here rather than
# subtract misaligned (or recycled) values.
stopifnot(identical(ALseason$yearID, NLseason$yearID))
# 1. raw season-by-season difference (signed, not an absolute value)
RunDiff <- ALseason$LG_RPG - NLseason$LG_RPG
# 2. difference between the LOESS (span=0.25) fitted values
RunDiffLO <- ALRunScore.LO.25.predict - NLRunScore.LO.25.predict
#
# plot the LOESS difference (AL minus NL) by season
ylim <- c(-1, 1)
plot(ALseason$yearID, RunDiffLO,
     type = "l", lty = "solid", col = "red", lwd = 2,
     # the year range in the title follows the data actually plotted
     main = paste0("Run scoring trend: AL difference from NL, ",
                   min(ALseason$yearID), "-", max(ALseason$yearID)),
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add line at zero (no difference between the leagues)
abline(h = 0, lty = "dotdash")
grid()
#
# plot each year difference as line, trend as line
ylim <- c(-1, 1.5)
# thick red line: LOESS trend of the difference
plot(ALseason$yearID, RunDiffLO,
     type = "l", lty = "solid", col = "red", lwd = 3,
     # the year range in the title follows the data actually plotted
     main = paste0("Run scoring trend: AL difference from NL, ",
                   min(ALseason$yearID), "-", max(ALseason$yearID)),
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# thin black line: raw season-by-season difference (RunDiff)
lines(ALseason$yearID, RunDiff, lty = "solid", col = "black", lwd = 1)
# add line at zero
abline(h = 0, lty = "dotdash")
grid()
#
#
# plot each year difference as bar, trend as line
ylim <- c(-1, 1.5)
# type = "h" draws one vertical bar per season for the raw difference
plot(ALseason$yearID, RunDiff,
     type = "h", lty = "solid", col = "blue", lwd = 2,
     # the year range in the title follows the data actually plotted
     main = paste0("Run scoring trend: AL difference from NL, ",
                   min(ALseason$yearID), "-", max(ALseason$yearID)),
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add the LOESS trend line (RunDiffLO)
lines(ALseason$yearID, RunDiffLO, lty = "solid", col = "black", lwd = 2)
# add line at zero
abline(h = 0, lty = "dotdash")
# chart additions
grid()
legend(1900, 1.5,
       c("AL difference from NL: absolute",
         "AL difference from NL, LOESS (span=0.25)"),
       lty = c("solid", "solid"),
       col = c("blue", "black"),
       lwd = c(2, 2))
#
#