# MonkmanMH/gist:5027789

## gistfile1.r
# THE HISTORICAL RECORD - RUNS PER GAME
#
# discussion and output can be found at
# http://bayesball.blogspot.ca/2013/02/comparing-individual-team-run-production.html
#
# data source: Lahman Database
# http://www.seanlahman.com/baseball-archive/statistics/
# 2012 version (1871-2012)
# table: "Teams"
#
# Load the Lahman "Teams" table.
# note: the original author found that R doesn't like something in line 141
# of the stadium name fields, so the parsed file is explicitly coerced to a
# data frame before it is used.
Teams1 <- as.data.frame(read.csv("Teams.csv", header = TRUE))
#
# keep only the seasons after 1946 (1947 forward)
# see Bill James http://www.billjamesonline.com/dividing_baseball_history_into_eras/
Teams <- Teams1[which(Teams1$yearID > 1946), , drop = FALSE]
#
# per-team season averages:
#   RPG  = runs scored per game
#   RAPG = runs allowed per game
Teams$RPG <- with(Teams, R / G)
Teams$RAPG <- with(Teams, RA / G)
#
# calculate RPG and RAPG season averages for each league
#
# Sum runs (R), runs allowed (RA) and games (G) over every team in each
# league-season with a single aggregate() call. This replaces the earlier
# approach of three separate aggregates stitched together with two merges,
# whose result was overwritten by this call anyway.
#   [NOTE: since the introduction of interleague play, league runs allowed
#    is not the same as league runs scored!]
LG_RPG <- aggregate(cbind(R, RA, G) ~ yearID + lgID, data = Teams, FUN = sum)
#
# Rename the league totals. aggregate() returns them as R, RA and G, which
# (a) are not the names the per-game calculation reads, and (b) collide with
# the team-level R, RA and G columns in Teams, so an un-keyed merge() would
# join on them as well.
names(LG_RPG) <- c("yearID", "lgID", "Teams.R", "Teams.RA", "Teams.G")
#
# calculate league runs and runs allowed per game
LG_RPG$LG_RPG <- LG_RPG$Teams.R / LG_RPG$Teams.G
LG_RPG$LG_RAPG <- LG_RPG$Teams.RA / LG_RPG$Teams.G
##
# Use the "merge" command to append the league values to the correct rows.
# Creates a single data frame Teams.merge that contains the team runs etc.
# as well as the league run values for that season. The join keys are
# spelled out so the match is always on season and league only.
#
Teams.merge <- merge(Teams, LG_RPG, by = c("yearID", "lgID"))
#
#
# CREATE INDEX VALUES FOR EACH TEAM
#
# A. Runs per game
#
# Express each team's runs per game relative to its league's average
# for that season.
# 1. index where 100 = the league average for that season
Teams.merge$R_index <- with(Teams.merge, RPG / LG_RPG * 100)
# 2. Z score of the index: distance from 100 in units of the index's
#    standard deviation
R_index.sd <- sd(Teams.merge[["R_index"]])
Teams.merge$R_Z <- with(Teams.merge, (R_index - 100) / R_index.sd)
#
# show the minimum, maximum, and standard deviation of the index
min(Teams.merge[["R_index"]])
max(Teams.merge[["R_index"]])
R_index.sd
#
# B. Runs allowed per game
#
# Express each team's runs allowed per game relative to its league's average
# for that season.
# 1. use an index where 100 = the league average for that season
Teams.merge$RA_index <- Teams.merge$RAPG / Teams.merge$LG_RAPG * 100
# 2. and Z scores of the index scores; these are scaled by the standard
#    deviation of the runs-allowed index itself (RA_index.sd), not by the
#    runs-scored one
RA_index.sd <- sd(Teams.merge$RA_index)
Teams.merge$RA_Z <- (Teams.merge$RA_index - 100) / RA_index.sd
#
# calculate minimum, maximum, and standard deviation
min(Teams.merge$RA_index)
max(Teams.merge$RA_index)
sd(Teams.merge$RA_index)
#
#
# RANK AND SORT BY R_INDEX
# 1. low to high (default)
#   a. rank (ascending: smaller index values get smaller ranks)
Teams.merge$R_index_rank <- rank(Teams.merge$R_index)
#   b. sort: keep the identifying columns plus the index and its rank,
#      ordered from the lowest index to the highest
Teams.merge.sort <- Teams.merge[
  order(Teams.merge$R_index),
  c("yearID", "lgID", "franchID", "R_index", "R_index_rank")
]
#   c. the weakest offences: index below 80
Teams.low_off <- Teams.merge.sort[
  which(Teams.merge.sort$R_index < 80), ,
  drop = FALSE
]
Teams.low_off
write.csv(Teams.low_off, file = "Teams.low_off.csv")
#
# 2. high to low
#   a. rank (note use of "-" in front of variable name, so the highest
#      index gets the smallest rank)
Teams.merge$R_index_rank <- rank(-Teams.merge$R_index)
#   b. sort (note use of "decreasing = TRUE" in "order" command)
#      The sorted table is rebuilt from Teams.merge so that it carries the
#      high-to-low rank computed just above; merely re-sorting an existing
#      table would keep whatever rank values it already contained.
Teams.merge.sort <- Teams.merge[
  order(Teams.merge$R_index, decreasing = TRUE),
  c("yearID", "lgID", "franchID", "R_index", "R_index_rank")
]
#   c. the strongest offences: index above 120
Teams.hi_off <- subset(Teams.merge.sort, R_index > 120)
Teams.hi_off
write.csv(Teams.hi_off, file = "Teams.hi_off.csv")
#
#
# PLOT!
# Argument names are written out in full (xlab, freq) rather than relying on
# partial matching, and FALSE is used instead of the reassignable T/F.
#
# index (basic): histogram of counts
hist(Teams.merge$R_index,
  main = "MLB teams 1947-2012: Distribution of scoring",
  xlab = "Index value (100=league average)")
#
# index with density curve: histogram on the density scale so the kernel
# density estimate can be overlaid on the same axes
hist(Teams.merge$R_index,
  freq = FALSE,
  main = "MLB teams 1947-2012: Distribution & density curve of scoring",
  xlab = "Index value (100=league average)")
lines(density(Teams.merge$R_index))
#
# Z scores, again on the density scale with the density estimate overlaid
hist(Teams.merge$R_Z, freq = FALSE)
lines(density(Teams.merge$R_Z))
#
#
# finally, save the complete Teams.merge data frame as a CSV file
write.csv(x = Teams.merge, file = "Teams.merge.csv")
#
	# THE HISTORICAL RECORD - RUNS PER GAME
	#
	# discussion and output can be found at
	# http://bayesball.blogspot.ca/2013/02/comparing-individual-team-run-production.html
	#
	# data source: Lahman Database
	# http://www.seanlahman.com/baseball-archive/statistics/
	# 2012 version (1871-2012)
	# table: "Teams"
	#
	# Load the Lahman "Teams" table.
	# note: the original author found that R doesn't like something in line 141
	# of the stadium name fields, so the parsed file is explicitly coerced to a
	# data frame before it is used.
	Teams1 <- as.data.frame(read.csv("Teams.csv", header = TRUE))
	#
	# keep only the seasons after 1946 (1947 forward)
	# see Bill James http://www.billjamesonline.com/dividing_baseball_history_into_eras/
	Teams <- Teams1[which(Teams1$yearID > 1946), , drop = FALSE]
	#
	# per-team season averages:
	#   RPG  = runs scored per game
	#   RAPG = runs allowed per game
	Teams$RPG <- with(Teams, R / G)
	Teams$RAPG <- with(Teams, RA / G)
	#
	# calculate RPG and RAPG season averages for each league
	#
	# Sum runs (R), runs allowed (RA) and games (G) over every team in each
	# league-season with a single aggregate() call. This replaces the earlier
	# approach of three separate aggregates stitched together with two merges,
	# whose result was overwritten by this call anyway.
	#   [NOTE: since the introduction of interleague play, league runs allowed
	#    is not the same as league runs scored!]
	LG_RPG <- aggregate(cbind(R, RA, G) ~ yearID + lgID, data = Teams, FUN = sum)
	#
	# Rename the league totals. aggregate() returns them as R, RA and G, which
	# (a) are not the names the per-game calculation reads, and (b) collide with
	# the team-level R, RA and G columns in Teams, so an un-keyed merge() would
	# join on them as well.
	names(LG_RPG) <- c("yearID", "lgID", "Teams.R", "Teams.RA", "Teams.G")
	#
	# calculate league runs and runs allowed per game
	LG_RPG$LG_RPG <- LG_RPG$Teams.R / LG_RPG$Teams.G
	LG_RPG$LG_RAPG <- LG_RPG$Teams.RA / LG_RPG$Teams.G
	##
	# Use the "merge" command to append the league values to the correct rows.
	# Creates a single data frame Teams.merge that contains the team runs etc.
	# as well as the league run values for that season. The join keys are
	# spelled out so the match is always on season and league only.
	#
	Teams.merge <- merge(Teams, LG_RPG, by = c("yearID", "lgID"))
	#
	#
	# CREATE INDEX VALUES FOR EACH TEAM
	#
	# A. Runs per game
	#
	# Express each team's runs per game relative to its league's average
	# for that season.
	# 1. index where 100 = the league average for that season
	Teams.merge$R_index <- with(Teams.merge, RPG / LG_RPG * 100)
	# 2. Z score of the index: distance from 100 in units of the index's
	#    standard deviation
	R_index.sd <- sd(Teams.merge[["R_index"]])
	Teams.merge$R_Z <- with(Teams.merge, (R_index - 100) / R_index.sd)
	#
	# show the minimum, maximum, and standard deviation of the index
	min(Teams.merge[["R_index"]])
	max(Teams.merge[["R_index"]])
	R_index.sd
	#
	# B. Runs allowed per game
	#
	# Express each team's runs allowed per game relative to its league's average
	# for that season.
	# 1. use an index where 100 = the league average for that season
	Teams.merge$RA_index <- Teams.merge$RAPG / Teams.merge$LG_RAPG * 100
	# 2. and Z scores of the index scores; these are scaled by the standard
	#    deviation of the runs-allowed index itself (RA_index.sd), not by the
	#    runs-scored one
	RA_index.sd <- sd(Teams.merge$RA_index)
	Teams.merge$RA_Z <- (Teams.merge$RA_index - 100) / RA_index.sd
	#
	# calculate minimum, maximum, and standard deviation
	min(Teams.merge$RA_index)
	max(Teams.merge$RA_index)
	sd(Teams.merge$RA_index)
	#
	#
	# RANK AND SORT BY R_INDEX
	# 1. low to high (default)
	#   a. rank (ascending: smaller index values get smaller ranks)
	Teams.merge$R_index_rank <- rank(Teams.merge$R_index)
	#   b. sort: keep the identifying columns plus the index and its rank,
	#      ordered from the lowest index to the highest
	Teams.merge.sort <- Teams.merge[
	  order(Teams.merge$R_index),
	  c("yearID", "lgID", "franchID", "R_index", "R_index_rank")
	]
	#   c. the weakest offences: index below 80
	Teams.low_off <- Teams.merge.sort[
	  which(Teams.merge.sort$R_index < 80), ,
	  drop = FALSE
	]
	Teams.low_off
	write.csv(Teams.low_off, file = "Teams.low_off.csv")
	#
	# 2. high to low
	#   a. rank (note use of "-" in front of variable name, so the highest
	#      index gets the smallest rank)
	Teams.merge$R_index_rank <- rank(-Teams.merge$R_index)
	#   b. sort (note use of "decreasing = TRUE" in "order" command)
	#      The sorted table is rebuilt from Teams.merge so that it carries the
	#      high-to-low rank computed just above; merely re-sorting an existing
	#      table would keep whatever rank values it already contained.
	Teams.merge.sort <- Teams.merge[
	  order(Teams.merge$R_index, decreasing = TRUE),
	  c("yearID", "lgID", "franchID", "R_index", "R_index_rank")
	]
	#   c. the strongest offences: index above 120
	Teams.hi_off <- subset(Teams.merge.sort, R_index > 120)
	Teams.hi_off
	write.csv(Teams.hi_off, file = "Teams.hi_off.csv")
	#
	#
	# PLOT!
	# Argument names are written out in full (xlab, freq) rather than relying on
	# partial matching, and FALSE is used instead of the reassignable T/F.
	#
	# index (basic): histogram of counts
	hist(Teams.merge$R_index,
	  main = "MLB teams 1947-2012: Distribution of scoring",
	  xlab = "Index value (100=league average)")
	#
	# index with density curve: histogram on the density scale so the kernel
	# density estimate can be overlaid on the same axes
	hist(Teams.merge$R_index,
	  freq = FALSE,
	  main = "MLB teams 1947-2012: Distribution & density curve of scoring",
	  xlab = "Index value (100=league average)")
	lines(density(Teams.merge$R_index))
	#
	# Z scores, again on the density scale with the density estimate overlaid
	hist(Teams.merge$R_Z, freq = FALSE)
	lines(density(Teams.merge$R_Z))
	#
	#
	# finally, save the complete Teams.merge data frame as a CSV file
	write.csv(x = Teams.merge, file = "Teams.merge.csv")
	#