# MonkmanMH/gist:4970480

## gistfile1.r
# MAJOR LEAGUE BASEBALL - RUNS PER GAME TREND
#
# discussion at
# 1.  http://bayesball.blogspot.ca/2012/07/trends-in-al-run-scoring-using-r.html
# 2.  http://bayesball.blogspot.ca/2012/07/trends-in-run-scoring-nl-edition-more-r.html
# 3.  http://bayesball.blogspot.ca/2012/08/trends-in-run-scoring-comparing-leagues.html
#
# data source: Baseball Reference
# http://www.baseball-reference.com
# http://www.baseball-reference.com/leagues/AL/bat.shtml
# http://www.baseball-reference.com/leagues/NL/bat.shtml
#
# open with "csv" option (top right corner of the table),
# copy and paste into text editor or Excel, save as CSV file
#
# set working directory
# setwd("K:/data/R_the software/datatrials/baseball/RunsPerGame")
# Read the league-by-season tables: comma-separated text with a header row.
ALseason <- read.table(file = "ALseasons.csv", sep = ",", header = TRUE)
#
# The National League table may have been saved as either .csv or .txt.
# Read whichever exists (the .csv is preferred) instead of reading the .csv and
# then always re-reading from the .txt, which stopped the script unless both
# files were present.
NLseason <- local({
  candidates <- c("NLseasons.csv", "NLseasons.txt")
  available <- candidates[file.exists(candidates)]
  if (length(available) == 0) {
    stop("found neither NLseasons.csv nor NLseasons.txt in ", getwd(),
      call. = FALSE)
  }
  read.table(file = available[1], sep = ",", header = TRUE)
})
#
#
# RUNS SCORED PER GAME
# ====================
#
# start with American League
#
# First look at the American League series. These two calls are kept exactly as
# written: with no xlab/ylab supplied, the axis labels come from the argument
# expressions, so rewording them would change the plots.
plot(ALseason$Year, ALseason$R)
plot(ALseason$R ~ ALseason$Year)
#
# Fit a loess smoother (default span) of runs per game (R) on Year.
# The formula names the columns and the table is passed through data=, rather
# than writing ALseason$R ~ ALseason$Year, so the fitted object can also be
# used with predict(..., newdata = ).
ALRunScore.LO <- loess(R ~ Year, data = ALseason)
# no newdata: fitted values for the rows used in the fit
ALRunScore.LO.predict <- predict(ALRunScore.LO)
#
# Scatter of the yearly values with the smoothed trend on top.
ylim <- c(3, 6)
plot(ALseason$Year, ALseason$R,
  ylim = ylim,
  main = "American League: runs per team per game, 1901-2012",
  xlab = "year", ylab = "runs per game")
# grid first, so the trend line is drawn over it
grid()
lines(ALseason$Year, ALRunScore.LO.predict,
  lty = "solid", col = "red", lwd = 2)
#
#
# VERSION 2 -- add "span" control to adjust smoothing
#
# references:
#  http://princeofslides.blogspot.ca/2011/05/sab-r-metrics-basics-of-loess.html
#  http://research.stowers-institute.org/efg/R/Statistics/loess.htm
#
# Refit the American League smoother with spans of 0.25 and 0.5 to compare
# against the default-span fit (ALRunScore.LO.predict).
# The formula names the columns and the table is passed through data=, so the
# fitted objects can also be used with predict(..., newdata = ).
ALRunScore.LO.25 <- loess(R ~ Year, data = ALseason, span = 0.25)
ALRunScore.LO.25.predict <- predict(ALRunScore.LO.25)
#
ALRunScore.LO.5 <- loess(R ~ Year, data = ALseason, span = 0.5)
ALRunScore.LO.5.predict <- predict(ALRunScore.LO.5)
#
# Yearly values plus the three smoothed trends.
ylim <- c(3, 6)
plot(ALseason$Year, ALseason$R,
  ylim = ylim,
  main = "American League: runs per team per game, 1901-2012",
  xlab = "year", ylab = "runs per game")
# grid goes on before the curves and the legend so it does not overprint them
grid()
lines(ALseason$Year, ALRunScore.LO.predict,
  lty = "solid", col = "red", lwd = 2)
lines(ALseason$Year, ALRunScore.LO.25.predict,
  lty = "dashed", col = "blue", lwd = 2)
lines(ALseason$Year, ALRunScore.LO.5.predict,
  lty = "dotdash", col = "black", lwd = 2)
legend(1980, 3.5,
  c("default", "span=0.25", "span=0.50"),
  lty = c("solid", "dashed", "dotdash"),
  col = c("red", "blue", "black"),
  lwd = c(2, 2, 2))
#
#
#
#-N-N-N-N-N-N-N-N-N-N-N-N-N-N
#
# NATIONAL LEAGUE
#
# Fit a loess smoother (default span) of National League runs per game (R) on
# Year. The formula names the columns and the table is passed through data=,
# rather than writing NLseason$R ~ NLseason$Year, so the fitted object can also
# be used with predict(..., newdata = ).
NLRunScore.LO <- loess(R ~ Year, data = NLseason)
# no newdata: fitted values for the rows used in the fit
NLRunScore.LO.predict <- predict(NLRunScore.LO)
#
# Yearly values (triangles) with the smoothed trend on top.
ylim <- c(3, 6)
plot(NLseason$Year, NLseason$R,
  pch = 2, col = "black",
  ylim = ylim,
  main = "National League: runs per team per game, 1901-2012",
  xlab = "year", ylab = "runs per game")
# grid first, so the trend line is drawn over it
grid()
lines(NLseason$Year, NLRunScore.LO.predict,
  lty = "solid", col = "blue", lwd = 2)
#
#
# VERSION 2 -- add "span" control to adjust smoothing
#
# reference: http://research.stowers-institute.org/efg/R/Statistics/loess.htm
#
# Refit the National League smoother with spans of 0.25 and 0.5 to compare
# against the default-span fit (NLRunScore.LO.predict).
# The formula names the columns and the table is passed through data=, so the
# fitted objects can also be used with predict(..., newdata = ).
NLRunScore.LO.25 <- loess(R ~ Year, data = NLseason, span = 0.25)
NLRunScore.LO.25.predict <- predict(NLRunScore.LO.25)
NLRunScore.LO.5 <- loess(R ~ Year, data = NLseason, span = 0.5)
NLRunScore.LO.5.predict <- predict(NLRunScore.LO.5)
#
# Yearly values (triangles) plus the three smoothed trends.
ylim <- c(3, 6)
plot(NLseason$Year, NLseason$R,
  pch = 2, col = "black",
  ylim = ylim,
  main = "National League: runs per team per game, 1901-2012",
  xlab = "year", ylab = "runs per game")
# grid goes on before the curves and the legend so it does not overprint them
grid()
lines(NLseason$Year, NLRunScore.LO.predict,
  lty = "solid", col = "blue", lwd = 2)
lines(NLseason$Year, NLRunScore.LO.25.predict,
  lty = "dashed", col = "red", lwd = 2)
lines(NLseason$Year, NLRunScore.LO.5.predict,
  lty = "dotdash", col = "black", lwd = 2)
legend(1980, 3.5,
  c("default", "span=0.25", "span=0.50"),
  lty = c("solid", "dashed", "dotdash"),
  col = c("blue", "red", "black"),
  lwd = c(2, 2, 2))
#
#
#
#
# MULTI-PLOT -- MERGING AL AND NL RESULTS
#
# Both leagues on one chart, one symbol per season:
# AL as black pch = 1 symbols, NL as blue pch = 2 symbols.
ylim <- c(3, 6)
# the AL series sets up the axes
plot(x = ALseason$Year, y = ALseason$R,
  type = "p", pch = 1, col = "black",
  ylim = ylim,
  main = "Runs per team per game, 1901-2012",
  xlab = "year", ylab = "runs per game")
# overlay the NL points
points(x = NLseason$Year, y = NLseason$R, pch = 2, col = "blue")
# reference grid, then the key in the top-left corner
grid()
legend(x = 1900, y = 6, legend = c("AL", "NL"),
  pch = c(1, 2), col = c("black", "blue"))
#

# Both leagues on one chart as year-to-year lines (AL red, NL blue).
ylim <- c(3, 6)
# the AL series sets up the axes
plot(x = ALseason$Year, y = ALseason$R,
  type = "l", lty = "solid", col = "red", lwd = 2,
  ylim = ylim,
  main = "Runs per team per game, 1901-2012",
  xlab = "year", ylab = "runs per game")
# overlay the NL series
lines(x = NLseason$Year, y = NLseason$R, lty = "solid", col = "blue", lwd = 2)
# reference grid, then the key in the bottom-left corner
grid()
legend(x = 1900, y = 3.5, legend = c("AL", "NL"),
  lty = rep("solid", 2), col = c("red", "blue"), lwd = rep(2, 2))
#
#
# Compare the two leagues' span = 0.25 loess trends on one chart.
ylim <- c(3, 6)
# the AL trend sets up the axes
plot(ALseason$Year, ALRunScore.LO.25.predict,
  type = "l", lty = "solid", col = "red", lwd = 2,
  main = "Runs per team per game, 1901-2012",
  ylim = ylim,
  xlab = "year", ylab = "runs per game")
# grid goes on before the NL curve and the legend so it does not overprint them
grid()
lines(NLseason$Year, NLRunScore.LO.25.predict,
  lty = "dashed", col = "blue", lwd = 2)
legend(1900, 3.5,
  c("AL (span=0.25)", "NL (span=0.25)"),
  lty = c("solid", "dashed"),
  col = c("red", "blue"),
  lwd = c(2, 2))
#
#
# Compare the two leagues' span = 0.50 loess trends on one chart.
ylim <- c(3, 6)
# the AL trend sets up the axes
plot(ALseason$Year, ALRunScore.LO.5.predict,
  type = "l", lty = "solid", col = "red", lwd = 2,
  main = "Runs per team per game, 1901-2012",
  ylim = ylim,
  xlab = "year", ylab = "runs per game")
# grid goes on before the NL curve and the legend so it does not overprint them
grid()
lines(NLseason$Year, NLRunScore.LO.5.predict,
  lty = "dashed", col = "blue", lwd = 2)
legend(1900, 3.5,
  c("AL (span=0.50)", "NL (span=0.50)"),
  lty = c("solid", "dashed"),
  col = c("red", "blue"),
  lwd = c(2, 2))
#
#
#
# All four loess trends on one chart: colour marks the league (AL red, NL blue)
# and line type marks the span (0.50 solid, 0.25 dashed).
ylim <- c(3, 6)
# the AL span = 0.50 trend sets up the axes
plot(ALseason$Year, ALRunScore.LO.5.predict,
  type = "l", lty = "solid", col = "red", lwd = 2,
  main = "Runs per team per game, 1901-2012",
  ylim = ylim,
  xlab = "year", ylab = "runs per game")
# grid goes on before the other curves and the legend so it does not
# overprint them
grid()
# NL, span = 0.50
lines(NLseason$Year, NLRunScore.LO.5.predict,
  lty = "solid", col = "blue", lwd = 2)
# both leagues, span = 0.25
lines(ALseason$Year, ALRunScore.LO.25.predict,
  lty = "dashed", col = "red", lwd = 2)
lines(NLseason$Year, NLRunScore.LO.25.predict,
  lty = "dashed", col = "blue", lwd = 2)
legend(1900, 3.5,
  c("AL (span=0.50)", "NL (span=0.50)", "AL (span=0.25)", "NL (span=0.25)"),
  lty = c("solid", "solid", "dashed", "dashed"),
  col = c("red", "blue", "red", "blue"),
  lwd = c(2, 2, 2, 2))
#
# # # # # # # # # # # # # # # # # #
#
# Difference in scoring between the leagues (AL minus NL), one value per AL
# season. Each AL row is paired with the NL row of the same Year via match(),
# instead of subtracting by row position, so the result stays correct if the
# two tables differ in length or row order. A Year with no NL row gives NA.
# 1. raw yearly difference
RunDiff <- ALseason$R - NLseason$R[match(ALseason$Year, NLseason$Year)]
# 2. difference of the span = 0.25 loess fits
# NOTE(review): assumes the NL fitted vector has one value per NLseason row
# (i.e. no rows were dropped from the fit) -- confirm if the data can hold NAs.
RunDiffLO <- ALRunScore.LO.25.predict -
  NLRunScore.LO.25.predict[match(ALseason$Year, NLseason$Year)]
#
# Smoothed (loess, span = 0.25) AL-minus-NL gap on its own.
ylim <- c(-1, 1)
plot(x = ALseason$Year, y = RunDiffLO,
  type = "l", lty = "solid", col = "red", lwd = 2,
  ylim = ylim,
  main = "Run scoring trend: AL difference from NL, 1901-2012",
  xlab = "year", ylab = "runs per game")
# horizontal reference at zero (no difference between the leagues)
abline(h = 0, lty = "dotdash")
grid()
#
# Smoothed gap (thick red) together with the raw yearly gap (thin black).
ylim <- c(-1, 1.5)
plot(x = ALseason$Year, y = RunDiffLO,
  type = "l", lty = "solid", col = "red", lwd = 3,
  ylim = ylim,
  main = "Run scoring trend: AL difference from NL, 1901-2012",
  xlab = "year", ylab = "runs per game")
# overlay the unsmoothed yearly difference
lines(x = ALseason$Year, y = RunDiff, lty = "solid", col = "black", lwd = 1)
# horizontal reference at zero (no difference between the leagues)
abline(h = 0, lty = "dotdash")
grid()
#
#
# Raw yearly gap as vertical bars (type = "h") with the smoothed gap as a line.
ylim <- c(-1, 1.5)
plot(x = ALseason$Year, y = RunDiff,
  type = "h", lty = "solid", col = "blue", lwd = 2,
  ylim = ylim,
  main = "Run scoring trend: AL difference from NL, 1901-2012",
  xlab = "year", ylab = "runs per game")
# overlay the loess (span = 0.25) difference
lines(x = ALseason$Year, y = RunDiffLO, lty = "solid", col = "black", lwd = 2)
# horizontal reference at zero (no difference between the leagues)
abline(h = 0, lty = "dotdash")
# reference grid, then the key in the top-left corner
grid()
legend(x = 1900, y = 1.5,
  legend = c("AL difference from NL: absolute",
    "AL difference from NL, LOESS (span=0.25)"),
  lty = rep("solid", 2),
  col = c("blue", "black"),
  lwd = rep(2, 2))
#
#
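	# MAJOR LEAGUE BASEBALL - RUNS PER GAME TREND
	# NOTE(review): from here to the end of the file the script above is repeated
	# (tab-indented), so sourcing the file runs the whole analysis twice.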
	#
	# discussion at
	# 1. http://bayesball.blogspot.ca/2012/07/trends-in-al-run-scoring-using-r.html
	# 2. http://bayesball.blogspot.ca/2012/07/trends-in-run-scoring-nl-edition-more-r.html
	# 3. http://bayesball.blogspot.ca/2012/08/trends-in-run-scoring-comparing-leagues.html
	#
	# data source: Baseball Reference
	# http://www.baseball-reference.com
	# http://www.baseball-reference.com/leagues/AL/bat.shtml
	# http://www.baseball-reference.com/leagues/NL/bat.shtml
	#
	# open with "csv" option (top right corner of the table),
	# copy and paste into text editor or Excel, save as CSV file
	#
	# set working directory
	# setwd("K:/data/R_the software/datatrials/baseball/RunsPerGame")
	# Read the league-by-season tables: comma-separated text with a header row.
	ALseason <- read.table(file = "ALseasons.csv", sep = ",", header = TRUE)
	#
	# The National League table may have been saved as either .csv or .txt.
	# Read whichever exists (the .csv is preferred) instead of reading the .csv
	# and then always re-reading from the .txt, which stopped the script unless
	# both files were present.
	NLseason <- local({
	  candidates <- c("NLseasons.csv", "NLseasons.txt")
	  available <- candidates[file.exists(candidates)]
	  if (length(available) == 0) {
	    stop("found neither NLseasons.csv nor NLseasons.txt in ", getwd(),
	      call. = FALSE)
	  }
	  read.table(file = available[1], sep = ",", header = TRUE)
	})
	#
	#
	# RUNS SCORED PER GAME
	# ====================
	#
	# start with American League
	#
	# First look at the American League series. These two calls are kept exactly
	# as written: with no xlab/ylab supplied, the axis labels come from the
	# argument expressions, so rewording them would change the plots.
	plot(ALseason$Year, ALseason$R)
	plot(ALseason$R ~ ALseason$Year)
	#
	# Fit a loess smoother (default span) of runs per game (R) on Year.
	# The formula names the columns and the table is passed through data=, rather
	# than writing ALseason$R ~ ALseason$Year, so the fitted object can also be
	# used with predict(..., newdata = ).
	ALRunScore.LO <- loess(R ~ Year, data = ALseason)
	# no newdata: fitted values for the rows used in the fit
	ALRunScore.LO.predict <- predict(ALRunScore.LO)
	#
	# Scatter of the yearly values with the smoothed trend on top.
	ylim <- c(3, 6)
	plot(ALseason$Year, ALseason$R,
	  ylim = ylim,
	  main = "American League: runs per team per game, 1901-2012",
	  xlab = "year", ylab = "runs per game")
	# grid first, so the trend line is drawn over it
	grid()
	lines(ALseason$Year, ALRunScore.LO.predict,
	  lty = "solid", col = "red", lwd = 2)
	#
	#
	# VERSION 2 -- add "span" control to adjust smoothing
	#
	# references:
	# http://princeofslides.blogspot.ca/2011/05/sab-r-metrics-basics-of-loess.html
	# http://research.stowers-institute.org/efg/R/Statistics/loess.htm
	#
	# Refit the American League smoother with spans of 0.25 and 0.5 to compare
	# against the default-span fit (ALRunScore.LO.predict).
	# The formula names the columns and the table is passed through data=, so the
	# fitted objects can also be used with predict(..., newdata = ).
	ALRunScore.LO.25 <- loess(R ~ Year, data = ALseason, span = 0.25)
	ALRunScore.LO.25.predict <- predict(ALRunScore.LO.25)
	#
	ALRunScore.LO.5 <- loess(R ~ Year, data = ALseason, span = 0.5)
	ALRunScore.LO.5.predict <- predict(ALRunScore.LO.5)
	#
	# Yearly values plus the three smoothed trends.
	ylim <- c(3, 6)
	plot(ALseason$Year, ALseason$R,
	  ylim = ylim,
	  main = "American League: runs per team per game, 1901-2012",
	  xlab = "year", ylab = "runs per game")
	# grid goes on before the curves and the legend so it does not overprint them
	grid()
	lines(ALseason$Year, ALRunScore.LO.predict,
	  lty = "solid", col = "red", lwd = 2)
	lines(ALseason$Year, ALRunScore.LO.25.predict,
	  lty = "dashed", col = "blue", lwd = 2)
	lines(ALseason$Year, ALRunScore.LO.5.predict,
	  lty = "dotdash", col = "black", lwd = 2)
	legend(1980, 3.5,
	  c("default", "span=0.25", "span=0.50"),
	  lty = c("solid", "dashed", "dotdash"),
	  col = c("red", "blue", "black"),
	  lwd = c(2, 2, 2))
	#
	#
	#
	#-N-N-N-N-N-N-N-N-N-N-N-N-N-N
	#
	# NATIONAL LEAGUE
	#
	# Fit a loess smoother (default span) of National League runs per game (R) on
	# Year. The formula names the columns and the table is passed through data=,
	# rather than writing NLseason$R ~ NLseason$Year, so the fitted object can
	# also be used with predict(..., newdata = ).
	NLRunScore.LO <- loess(R ~ Year, data = NLseason)
	# no newdata: fitted values for the rows used in the fit
	NLRunScore.LO.predict <- predict(NLRunScore.LO)
	#
	# Yearly values (triangles) with the smoothed trend on top.
	ylim <- c(3, 6)
	plot(NLseason$Year, NLseason$R,
	  pch = 2, col = "black",
	  ylim = ylim,
	  main = "National League: runs per team per game, 1901-2012",
	  xlab = "year", ylab = "runs per game")
	# grid first, so the trend line is drawn over it
	grid()
	lines(NLseason$Year, NLRunScore.LO.predict,
	  lty = "solid", col = "blue", lwd = 2)
	#
	#
	# VERSION 2 -- add "span" control to adjust smoothing
	#
	# reference: http://research.stowers-institute.org/efg/R/Statistics/loess.htm
	#
	# Refit the National League smoother with spans of 0.25 and 0.5 to compare
	# against the default-span fit (NLRunScore.LO.predict).
	# The formula names the columns and the table is passed through data=, so the
	# fitted objects can also be used with predict(..., newdata = ).
	NLRunScore.LO.25 <- loess(R ~ Year, data = NLseason, span = 0.25)
	NLRunScore.LO.25.predict <- predict(NLRunScore.LO.25)
	NLRunScore.LO.5 <- loess(R ~ Year, data = NLseason, span = 0.5)
	NLRunScore.LO.5.predict <- predict(NLRunScore.LO.5)
	#
	# Yearly values (triangles) plus the three smoothed trends.
	ylim <- c(3, 6)
	plot(NLseason$Year, NLseason$R,
	  pch = 2, col = "black",
	  ylim = ylim,
	  main = "National League: runs per team per game, 1901-2012",
	  xlab = "year", ylab = "runs per game")
	# grid goes on before the curves and the legend so it does not overprint them
	grid()
	lines(NLseason$Year, NLRunScore.LO.predict,
	  lty = "solid", col = "blue", lwd = 2)
	lines(NLseason$Year, NLRunScore.LO.25.predict,
	  lty = "dashed", col = "red", lwd = 2)
	lines(NLseason$Year, NLRunScore.LO.5.predict,
	  lty = "dotdash", col = "black", lwd = 2)
	legend(1980, 3.5,
	  c("default", "span=0.25", "span=0.50"),
	  lty = c("solid", "dashed", "dotdash"),
	  col = c("blue", "red", "black"),
	  lwd = c(2, 2, 2))
	#
	#
	#
	#
	# MULTI-PLOT -- MERGING AL AND NL RESULTS
	#
	# Both leagues on one chart, one symbol per season:
	# AL as black pch = 1 symbols, NL as blue pch = 2 symbols.
	ylim <- c(3, 6)
	# the AL series sets up the axes
	plot(x = ALseason$Year, y = ALseason$R,
	  type = "p", pch = 1, col = "black",
	  ylim = ylim,
	  main = "Runs per team per game, 1901-2012",
	  xlab = "year", ylab = "runs per game")
	# overlay the NL points
	points(x = NLseason$Year, y = NLseason$R, pch = 2, col = "blue")
	# reference grid, then the key in the top-left corner
	grid()
	legend(x = 1900, y = 6, legend = c("AL", "NL"),
	  pch = c(1, 2), col = c("black", "blue"))
	#

	# Both leagues on one chart as year-to-year lines (AL red, NL blue).
	ylim <- c(3, 6)
	# the AL series sets up the axes
	plot(x = ALseason$Year, y = ALseason$R,
	  type = "l", lty = "solid", col = "red", lwd = 2,
	  ylim = ylim,
	  main = "Runs per team per game, 1901-2012",
	  xlab = "year", ylab = "runs per game")
	# overlay the NL series
	lines(x = NLseason$Year, y = NLseason$R, lty = "solid", col = "blue", lwd = 2)
	# reference grid, then the key in the bottom-left corner
	grid()
	legend(x = 1900, y = 3.5, legend = c("AL", "NL"),
	  lty = rep("solid", 2), col = c("red", "blue"), lwd = rep(2, 2))
	#
	#
	# Compare the two leagues' span = 0.25 loess trends on one chart.
	ylim <- c(3, 6)
	# the AL trend sets up the axes
	plot(ALseason$Year, ALRunScore.LO.25.predict,
	  type = "l", lty = "solid", col = "red", lwd = 2,
	  main = "Runs per team per game, 1901-2012",
	  ylim = ylim,
	  xlab = "year", ylab = "runs per game")
	# grid goes on before the NL curve and the legend so it does not overprint
	# them
	grid()
	lines(NLseason$Year, NLRunScore.LO.25.predict,
	  lty = "dashed", col = "blue", lwd = 2)
	legend(1900, 3.5,
	  c("AL (span=0.25)", "NL (span=0.25)"),
	  lty = c("solid", "dashed"),
	  col = c("red", "blue"),
	  lwd = c(2, 2))
	#
	#
	# Compare the two leagues' span = 0.50 loess trends on one chart.
	ylim <- c(3, 6)
	# the AL trend sets up the axes
	plot(ALseason$Year, ALRunScore.LO.5.predict,
	  type = "l", lty = "solid", col = "red", lwd = 2,
	  main = "Runs per team per game, 1901-2012",
	  ylim = ylim,
	  xlab = "year", ylab = "runs per game")
	# grid goes on before the NL curve and the legend so it does not overprint
	# them
	grid()
	lines(NLseason$Year, NLRunScore.LO.5.predict,
	  lty = "dashed", col = "blue", lwd = 2)
	legend(1900, 3.5,
	  c("AL (span=0.50)", "NL (span=0.50)"),
	  lty = c("solid", "dashed"),
	  col = c("red", "blue"),
	  lwd = c(2, 2))
	#
	#
	#
	# All four loess trends on one chart: colour marks the league (AL red, NL
	# blue) and line type marks the span (0.50 solid, 0.25 dashed).
	ylim <- c(3, 6)
	# the AL span = 0.50 trend sets up the axes
	plot(ALseason$Year, ALRunScore.LO.5.predict,
	  type = "l", lty = "solid", col = "red", lwd = 2,
	  main = "Runs per team per game, 1901-2012",
	  ylim = ylim,
	  xlab = "year", ylab = "runs per game")
	# grid goes on before the other curves and the legend so it does not
	# overprint them
	grid()
	# NL, span = 0.50
	lines(NLseason$Year, NLRunScore.LO.5.predict,
	  lty = "solid", col = "blue", lwd = 2)
	# both leagues, span = 0.25
	lines(ALseason$Year, ALRunScore.LO.25.predict,
	  lty = "dashed", col = "red", lwd = 2)
	lines(NLseason$Year, NLRunScore.LO.25.predict,
	  lty = "dashed", col = "blue", lwd = 2)
	legend(1900, 3.5,
	  c("AL (span=0.50)", "NL (span=0.50)", "AL (span=0.25)", "NL (span=0.25)"),
	  lty = c("solid", "solid", "dashed", "dashed"),
	  col = c("red", "blue", "red", "blue"),
	  lwd = c(2, 2, 2, 2))
	#
	# # # # # # # # # # # # # # # # # #
	#
	# Difference in scoring between the leagues (AL minus NL), one value per AL
	# season. Each AL row is paired with the NL row of the same Year via match(),
	# instead of subtracting by row position, so the result stays correct if the
	# two tables differ in length or row order. A Year with no NL row gives NA.
	# 1. raw yearly difference
	RunDiff <- ALseason$R - NLseason$R[match(ALseason$Year, NLseason$Year)]
	# 2. difference of the span = 0.25 loess fits
	# NOTE(review): assumes the NL fitted vector has one value per NLseason row
	# (i.e. no rows were dropped from the fit) -- confirm if the data can hold NAs.
	RunDiffLO <- ALRunScore.LO.25.predict -
	  NLRunScore.LO.25.predict[match(ALseason$Year, NLseason$Year)]
	#
	# Smoothed (loess, span = 0.25) AL-minus-NL gap on its own.
	ylim <- c(-1, 1)
	plot(x = ALseason$Year, y = RunDiffLO,
	  type = "l", lty = "solid", col = "red", lwd = 2,
	  ylim = ylim,
	  main = "Run scoring trend: AL difference from NL, 1901-2012",
	  xlab = "year", ylab = "runs per game")
	# horizontal reference at zero (no difference between the leagues)
	abline(h = 0, lty = "dotdash")
	grid()
	#
	# Smoothed gap (thick red) together with the raw yearly gap (thin black).
	ylim <- c(-1, 1.5)
	plot(x = ALseason$Year, y = RunDiffLO,
	  type = "l", lty = "solid", col = "red", lwd = 3,
	  ylim = ylim,
	  main = "Run scoring trend: AL difference from NL, 1901-2012",
	  xlab = "year", ylab = "runs per game")
	# overlay the unsmoothed yearly difference
	lines(x = ALseason$Year, y = RunDiff, lty = "solid", col = "black", lwd = 1)
	# horizontal reference at zero (no difference between the leagues)
	abline(h = 0, lty = "dotdash")
	grid()
	#
	#
	# Raw yearly gap as vertical bars (type = "h") with the smoothed gap as a
	# line.
	ylim <- c(-1, 1.5)
	plot(x = ALseason$Year, y = RunDiff,
	  type = "h", lty = "solid", col = "blue", lwd = 2,
	  ylim = ylim,
	  main = "Run scoring trend: AL difference from NL, 1901-2012",
	  xlab = "year", ylab = "runs per game")
	# overlay the loess (span = 0.25) difference
	lines(x = ALseason$Year, y = RunDiffLO, lty = "solid", col = "black", lwd = 2)
	# horizontal reference at zero (no difference between the leagues)
	abline(h = 0, lty = "dotdash")
	# reference grid, then the key in the top-left corner
	grid()
	legend(x = 1900, y = 1.5,
	  legend = c("AL difference from NL: absolute",
	    "AL difference from NL, LOESS (span=0.25)"),
	  lty = rep("solid", 2),
	  col = c("blue", "black"),
	  lwd = rep(2, 2))
	#
	#