# MLB runs per game - league trends
# MAJOR LEAGUE BASEBALL - RUNS PER GAME TREND
#
# discussion at
# 1. http://bayesball.blogspot.ca/2012/07/trends-in-al-run-scoring-using-r.html
# 2. http://bayesball.blogspot.ca/2012/07/trends-in-run-scoring-nl-edition-more-r.html
# 3. http://bayesball.blogspot.ca/2012/08/trends-in-run-scoring-comparing-leagues.html
#
# data source: Baseball Reference
# http://www.baseball-reference.com
# http://www.baseball-reference.com/leagues/AL/bat.shtml
# http://www.baseball-reference.com/leagues/NL/bat.shtml
#
# open with "csv" option (top right corner of the table),
# copy and paste into text editor or Excel, save as CSV file
#
# set working directory
# (left disabled: run the script from the folder that holds the data files
#  rather than hard-coding a machine-specific path)
# setwd("K:/data/R_the software/datatrials/baseball/RunsPerGame")
#
# read the data into a table
# Both files are read as comma-separated text with a header row; the rest of
# the script uses the columns `Year` and `R` from each table.
ALseason <- read.table(file = "ALseasons.csv", sep = ",", header = TRUE)
#
# NL season: read the .csv file when it is present, otherwise fall back to the
# alternate .txt file (presumably a copy of the same table -- confirm), so the
# script no longer needs both NL files to exist in order to run.
NLseason <- read.table(
  file = if (file.exists("NLseasons.csv")) "NLseasons.csv" else "NLseasons.txt",
  sep = ",", header = TRUE
)
#
# fail early if either table lacks the columns used by the code below
stopifnot(
  all(c("Year", "R") %in% names(ALseason)),
  all(c("Year", "R") %in% names(NLseason))
)
# | |
# | |
# RUNS SCORED PER GAME
# ====================
#
# start with American League
#
# very simple plot -- as (x, y)
plot(ALseason$Year, ALseason$R)
# as (y predicted by x)
plot(ALseason$R ~ ALseason$Year)
#
# create new object ALRunScore.LO for loess model (default span)
# - `data =` keeps the formula free of repeated `ALseason$` prefixes
# - na.exclude pads the fitted values with NA for any row that has a missing
#   value, so the fitted vector keeps one entry per row of ALseason and can be
#   drawn against ALseason$Year without a length mismatch
ALRunScore.LO <- loess(R ~ Year, data = ALseason, na.action = na.exclude)
# fitted (smoothed) values at the observed years
ALRunScore.LO.predict <- predict(ALRunScore.LO)
#
# plot the data, add loess curve
ylim <- c(3, 6)
plot(ALseason$R ~ ALseason$Year,
     ylim = ylim,
     main = "American League: runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# chart tidying
grid()
# loess predicted value line
lines(ALseason$Year, ALRunScore.LO.predict,
      lty = "solid", col = "red", lwd = 2)
# | |
# | |
# VERSION 2 -- add "span" control to adjust smoothing
#
# references:
# http://princeofslides.blogspot.ca/2011/05/sab-r-metrics-basics-of-loess.html
# http://research.stowers-institute.org/efg/R/Statistics/loess.htm
#
# create new objects ALRunScore.LO.25 / ALRunScore.LO.5 for loess models with
# span = 0.25 and span = 0.5
# (na.exclude keeps each fitted vector aligned row-for-row with ALseason even
#  when a row has a missing value)
ALRunScore.LO.25 <- loess(R ~ Year, data = ALseason, span = 0.25,
                          na.action = na.exclude)
ALRunScore.LO.25.predict <- predict(ALRunScore.LO.25)
#
ALRunScore.LO.5 <- loess(R ~ Year, data = ALseason, span = 0.5,
                         na.action = na.exclude)
ALRunScore.LO.5.predict <- predict(ALRunScore.LO.5)
#
# plot the data, add loess curves
ylim <- c(3, 6)
plot(ALseason$R ~ ALseason$Year,
     ylim = ylim,
     main = "American League: runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# loess predicted value lines: default span, then span 0.25 and span 0.5
# (ALRunScore.LO.predict is the default-span fit computed earlier in the script)
lines(ALseason$Year, ALRunScore.LO.predict,
      lty = "solid", col = "red", lwd = 2)
lines(ALseason$Year, ALRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
lines(ALseason$Year, ALRunScore.LO.5.predict,
      lty = "dotdash", col = "black", lwd = 2)
# chart tidying
legend(1980, 3.5,
       c("default", "span=0.25", "span=0.50"),
       lty = c("solid", "dashed", "dotdash"),
       col = c("red", "blue", "black"),
       lwd = c(2, 2, 2))
grid()
# | |
# | |
# | |
#-N-N-N-N-N-N-N-N-N-N-N-N-N-N | |
# | |
# NATIONAL LEAGUE
#
# create new object NLRunScore.LO for loess model (default span)
# (na.exclude pads the fitted values with NA for rows with a missing value, so
#  the fitted vector keeps one entry per row of NLseason)
NLRunScore.LO <- loess(R ~ Year, data = NLseason, na.action = na.exclude)
# fitted (smoothed) values at the observed years
NLRunScore.LO.predict <- predict(NLRunScore.LO)
#
# plot the data, add loess curve
ylim <- c(3, 6)
plot(NLseason$R ~ NLseason$Year,
     pch = 2, col = "black",
     ylim = ylim,
     main = "National League: runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# loess predicted value line
lines(NLseason$Year, NLRunScore.LO.predict,
      lty = "solid", col = "blue", lwd = 2)
# chart tidying
grid()
# | |
# | |
# VERSION 2 -- add "span" control to adjust smoothing
#
# reference: http://research.stowers-institute.org/efg/R/Statistics/loess.htm
#
# create new objects NLRunScore.LO.25 / NLRunScore.LO.5 for loess models with
# span = 0.25 and span = 0.5
# (na.exclude keeps each fitted vector aligned row-for-row with NLseason even
#  when a row has a missing value)
NLRunScore.LO.25 <- loess(R ~ Year, data = NLseason, span = 0.25,
                          na.action = na.exclude)
NLRunScore.LO.25.predict <- predict(NLRunScore.LO.25)
NLRunScore.LO.5 <- loess(R ~ Year, data = NLseason, span = 0.5,
                         na.action = na.exclude)
NLRunScore.LO.5.predict <- predict(NLRunScore.LO.5)
#
# plot the data, add loess curves
ylim <- c(3, 6)
plot(NLseason$R ~ NLseason$Year,
     pch = 2, col = "black",
     ylim = ylim,
     main = "National League: runs per team per game, 1901-2012",
     xlab = "year", ylab = "runs per game")
# loess predicted value lines: default span, then span 0.25 and span 0.5
# (NLRunScore.LO.predict is the default-span fit computed earlier in the script)
lines(NLseason$Year, NLRunScore.LO.predict,
      lty = "solid", col = "blue", lwd = 2)
lines(NLseason$Year, NLRunScore.LO.25.predict,
      lty = "dashed", col = "red", lwd = 2)
lines(NLseason$Year, NLRunScore.LO.5.predict,
      lty = "dotdash", col = "black", lwd = 2)
# chart tidying
legend(1980, 3.5,
       c("default", "span=0.25", "span=0.50"),
       lty = c("solid", "dashed", "dotdash"),
       col = c("blue", "red", "black"),
       lwd = c(2, 2, 2))
grid()
# | |
# | |
# | |
# | |
# MULTI-PLOT -- MERGING AL AND NL RESULTS
#
# Two overlays of the raw yearly values: first as points, then as lines.
# In both, the AL series creates the plot (and therefore sets the x-axis
# range) and the NL series is drawn on top of it.
#
# plot individual years as points
ylim <- c(3, 6)
# start with AL
plot(ALseason$R ~ ALseason$Year,
     type = "p", pch = 1, col = "black",
     main = "Runs per team per game, 1901-2012",
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add NL points
points(NLseason$Year, NLseason$R, pch = 2, col = "blue")
# chart additions
grid()
legend(1900, 6, c("AL", "NL"), pch = c(1, 2), col = c("black", "blue"))
#
# plot individual years as lines
ylim <- c(3, 6)
# start with AL line
plot(ALseason$R ~ ALseason$Year,
     type = "l", lty = "solid", col = "red", lwd = 2,
     main = "Runs per team per game, 1901-2012",
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add NL line
lines(NLseason$Year, NLseason$R, lty = "solid", col = "blue", lwd = 2)
# chart additions
grid()
legend(1900, 3.5,
       c("AL", "NL"),
       lty = c("solid", "solid"),
       col = c("red", "blue"),
       lwd = c(2, 2))
# | |
# | |
# plot loess curves (span=0.25)
# Uses the fitted vectors ALRunScore.LO.25.predict / NLRunScore.LO.25.predict
# (and the .5 equivalents further down) created earlier in the script.
ylim <- c(3, 6)
# start with AL line
plot(ALRunScore.LO.25.predict ~ ALseason$Year,
     type = "l", lty = "solid", col = "red", lwd = 2,
     main = "Runs per team per game, 1901-2012",
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add NL line
lines(NLseason$Year, NLRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
# chart additions
legend(1900, 3.5,
       c("AL (span=0.25)", "NL (span=0.25)"),
       lty = c("solid", "dashed"),
       col = c("red", "blue"),
       lwd = c(2, 2))
grid()
#
#
# plot loess curves (span=0.50)
ylim <- c(3, 6)
# start with AL line
plot(ALRunScore.LO.5.predict ~ ALseason$Year,
     type = "l", lty = "solid", col = "red", lwd = 2,
     main = "Runs per team per game, 1901-2012",
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add NL line
lines(NLseason$Year, NLRunScore.LO.5.predict,
      lty = "dashed", col = "blue", lwd = 2)
# chart additions
legend(1900, 3.5,
       c("AL (span=0.50)", "NL (span=0.50)"),
       lty = c("solid", "dashed"),
       col = c("red", "blue"),
       lwd = c(2, 2))
grid()
#
#
#
# plot multiple loess curves (span=0.50 and 0.25)
# colour identifies the league (red = AL, blue = NL);
# line type identifies the span (solid = 0.50, dashed = 0.25)
ylim <- c(3, 6)
# start with AL line
plot(ALRunScore.LO.5.predict ~ ALseason$Year,
     type = "l", lty = "solid", col = "red", lwd = 2,
     main = "Runs per team per game, 1901-2012",
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add NL line
lines(NLseason$Year, NLRunScore.LO.5.predict,
      lty = "solid", col = "blue", lwd = 2)
# add 0.25 lines
lines(ALseason$Year, ALRunScore.LO.25.predict,
      lty = "dashed", col = "red", lwd = 2)
lines(NLseason$Year, NLRunScore.LO.25.predict,
      lty = "dashed", col = "blue", lwd = 2)
# chart additions
legend(1900, 3.5,
       c("AL (span=0.50)", "NL (span=0.50)", "AL (span=0.25)", "NL (span=0.25)"),
       lty = c("solid", "solid", "dashed", "dashed"),
       col = c("red", "blue", "red", "blue"),
       lwd = c(2, 2, 2, 2))
grid()
# | |
# # # # # # # # # # # # # # # # # # | |
# | |
# calculate the difference between the two leagues
#
# The two tables are matched on Year rather than subtracted row by row, so the
# result stays correct if the NL table has extra seasons or a different row
# order. When both tables already hold the same years in the same order the
# values are identical to a plain row-wise subtraction. An AL season with no
# NL row for that year gets NA.
#
# the fitted vectors must line up with the rows of their own table for the
# year matching below to be meaningful
stopifnot(
  length(ALRunScore.LO.25.predict) == nrow(ALseason),
  length(NLRunScore.LO.25.predict) == nrow(NLseason)
)
# for each AL season, the row of NLseason holding the same year
NLrow <- match(ALseason$Year, NLseason$Year)
# 1. absolute
RunDiff <- ALseason$R - NLseason$R[NLrow]
# 2. LOESS span=0.25
RunDiffLO <- ALRunScore.LO.25.predict - NLRunScore.LO.25.predict[NLrow]
#
# plot the LOESS difference
ylim <- c(-1, 1)
plot(RunDiffLO ~ ALseason$Year,
     type = "l", lty = "solid", col = "red", lwd = 2,
     main = "Run scoring trend: AL difference from NL, 1901-2012",
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add line at zero
abline(h = 0, lty = "dotdash")
grid()
#
# plot each year difference as line, trend as line
ylim <- c(-1, 1.5)
plot(RunDiffLO ~ ALseason$Year,
     type = "l", lty = "solid", col = "red", lwd = 3,
     main = "Run scoring trend: AL difference from NL, 1901-2012",
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add RunDiff line (year-by-year difference)
lines(ALseason$Year, RunDiff, lty = "solid", col = "black", lwd = 1)
# add line at zero
abline(h = 0, lty = "dotdash")
grid()
#
#
# plot each year difference as bar, trend as line
ylim <- c(-1, 1.5)
plot(RunDiff ~ ALseason$Year,
     type = "h", lty = "solid", col = "blue", lwd = 2,
     main = "Run scoring trend: AL difference from NL, 1901-2012",
     ylim = ylim,
     xlab = "year", ylab = "runs per game")
# add RunDiffLO line (LOESS trend of the difference)
lines(ALseason$Year, RunDiffLO, lty = "solid", col = "black", lwd = 2)
# add line at zero
abline(h = 0, lty = "dotdash")
# chart additions
grid()
legend(1900, 1.5,
       c("AL difference from NL: absolute", "AL difference from NL, LOESS (span=0.25)"),
       lty = c("solid", "solid"),
       col = c("blue", "black"),
       lwd = c(2, 2))
# | |
# |
# end of script