Skip to content

Instantly share code, notes, and snippets.

@MonkmanMH
Created February 25, 2013 04:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MonkmanMH/5027789 to your computer and use it in GitHub Desktop.
Save MonkmanMH/5027789 to your computer and use it in GitHub Desktop.
MLB runs per game - league average
# THE HISTORICAL RECORD - RUNS PER GAME
#
# discussion and output can be found at
# http://bayesball.blogspot.ca/2013/02/comparing-individual-team-run-production.html
#
# data source: Lahman Database
# http://www.seanlahman.com/baseball-archive/statistics/
# 2012 version (1871-2012)
# table: "Teams"
#
# note: R doesn't like something in line 141 of the stadium name fields,
# so reading the data becomes a two-step process
#
# read the file and make sure the result is a data frame
Teams1 <- as.data.frame(read.csv("Teams.csv", header = TRUE))
#
# keep only the seasons after 1946 (i.e. 1947 onward)
# see Bill James http://www.billjamesonline.com/dividing_baseball_history_into_eras/
# which() discards rows where the comparison is NA, just as subset() does
Teams <- Teams1[which(Teams1[["yearID"]] > 1946), , drop = FALSE]
#
# team-level rates: runs scored per game ...
Teams[["RPG"]] <- Teams[["R"]] / Teams[["G"]]
# ... and runs allowed per game
Teams[["RAPG"]] <- Teams[["RA"]] / Teams[["G"]]
#
# calculate RPG and RAPG season averages for each league
#
# step 1: total runs (R), runs allowed (RA) and games (G) for every
# season / league combination, in a single aggregate() call
# [NOTE: since the introduction of interleague play, league runs allowed
#  is not the same as league runs scored!]
LG_RPG <- aggregate(cbind(R, RA, G) ~ yearID + lgID, data = Teams, FUN = sum)
#
# step 2: prefix the league totals so they are distinct from the team-level
# R / RA / G columns in Teams. The rate calculations below read these
# prefixed names, and distinct names keep the totals from colliding with
# the team columns when the two data frames are merged.
names(LG_RPG) <- c("yearID", "lgID", "Teams.R", "Teams.RA", "Teams.G")
#
# step 3: league runs and runs allowed per game
LG_RPG$LG_RPG <- LG_RPG$Teams.R / LG_RPG$Teams.G
LG_RPG$LG_RAPG <- LG_RPG$Teams.RA / LG_RPG$Teams.G
#
# step 4: append the league values to the matching team rows.
# Teams.merge holds the team runs etc. as well as the league run values
# for that season; the join keys are stated explicitly so that only
# season and league are used for matching.
Teams.merge <- merge(Teams, LG_RPG, by = c("yearID", "lgID"))
#
#
# CREATE INDEX VALUES FOR EACH TEAM
#
# A. Runs per game
#
# compare each team's runs per game with the league average for that season
# 1. index: 100 = the league average for that season
Teams.merge[["R_index"]] <- Teams.merge[["RPG"]] / Teams.merge[["LG_RPG"]] * 100
# 2. Z score of the index: distance from 100 in standard deviations
R_index.sd <- sd(Teams.merge[["R_index"]])
Teams.merge[["R_Z"]] <- (Teams.merge[["R_index"]] - 100) / R_index.sd
#
# show the minimum, maximum, and standard deviation of the index
min(Teams.merge[["R_index"]])
max(Teams.merge[["R_index"]])
R_index.sd
#
# B. Runs allowed per game
#
# compare each team's runs allowed per game with the league average
# for that season
# 1. use an index where 100 = the league average for that season
Teams.merge$RA_index <- Teams.merge$RAPG / Teams.merge$LG_RAPG * 100
# 2. and Z scores of the index scores
# (scaled by the standard deviation of RA_index itself, not of R_index)
RA_index.sd <- sd(Teams.merge$RA_index)
Teams.merge$RA_Z <- (Teams.merge$RA_index - 100) / RA_index.sd
#
# calculate minimum, maximum, and standard deviation
min(Teams.merge$RA_index)
max(Teams.merge$RA_index)
sd(Teams.merge$RA_index)
#
#
# RANK AND SORT BY R_INDEX
#
# columns carried into the sorted extracts
sort_cols <- c("yearID", "lgID", "franchID", "R_index", "R_index_rank")
#
# 1. low to high (default)
# a. rank (1 = lowest R_index)
Teams.merge$R_index_rank <- rank(Teams.merge$R_index)
# b. sort, then keep the teams whose index is below 80
Teams.merge.sort <- Teams.merge[order(Teams.merge$R_index), sort_cols]
Teams.low_off <- subset(Teams.merge.sort, R_index < 80)
Teams.low_off
write.csv(Teams.low_off, file = "Teams.low_off.csv")
#
# 2. high to low
# a. rank (note use of "-" in front of variable name; 1 = highest R_index)
Teams.merge$R_index_rank <- rank(-Teams.merge$R_index)
# b. sort (note use of "decreasing = TRUE" in "order" command)
# The sorted frame is rebuilt from Teams.merge so that it carries the
# descending rank just computed rather than the earlier ascending one.
Teams.merge.sort <- Teams.merge[order(Teams.merge$R_index, decreasing = TRUE),
                                sort_cols]
Teams.hi_off <- subset(Teams.merge.sort, R_index > 120)
Teams.hi_off
write.csv(Teams.hi_off, file = "Teams.hi_off.csv")
#
#
# PLOT!
# Argument names are written out in full (xlab, probability) instead of
# relying on partial matching, and TRUE is used rather than the
# reassignable shorthand T.
#
# index (basic)
hist(Teams.merge$R_index,
     main = "MLB teams 1947-2012: Distribution of scoring",
     xlab = "Index value (100=league average)")
#
# index with density curve
# (probability = TRUE puts the bars on the density scale so the curve overlays)
hist(Teams.merge$R_index,
     probability = TRUE,
     main = "MLB teams 1947-2012: Distribution & density curve of scoring",
     xlab = "Index value (100=league average)")
lines(density(Teams.merge$R_index))
#
# Z scores
hist(Teams.merge$R_Z, probability = TRUE)
lines(density(Teams.merge$R_Z))
#
#
# and write a new file with the Teams.merge data
write.csv(Teams.merge, file = "Teams.merge.csv")
#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment