Skip to content

Instantly share code, notes, and snippets.

@bayesball
Last active February 5, 2016 14:50
Show Gist options
  • Save bayesball/7ec0782f12b9ccf3f745 to your computer and use it in GitHub Desktop.
Save bayesball/7ec0782f12b9ccf3f745 to your computer and use it in GitHub Desktop.
R functions for Improved Component Predictions of Batting and Pitching Measures
#######################################################################################
# R functions for Paper "Improved Component Predictions of Batting and Pitching Measures"
# Journal of Quantitative Analysis of Sports (2016)
# Jim Albert, albert@bgsu.edu
# functions fit_component_average, plot_avg_results
# fit_component_obp, plot_obp_results
# fit_component_fip, plot_fip_results
# require installation of packages Lahman, dplyr, ggplot2, and LearnBayes
########################################################################################
fit_component_average <- function(season){
# implements component method for estimating group of batting averages for nonpitchers
# for a particular baseball season
#
# input: season
# output: a list with components
# component -- gives values of K and eta for all component fits
# shrinkage -- gives value of K and eta for shrinkage fit
# S -- gives raw stats and component and shrinkage estimates for all players
require(Lahman)
require(dplyr)
require(LearnBayes)
##---------------------------------------------------------------------
get.data <- function(season){
B <- summarize(group_by(filter(Batting, yearID==season), playerID),
AB=sum(AB), SO=sum(SO), H=sum(H), HR=sum(HR))
B <- filter(B, AB > 0)
NonPitchers <- unique(select(filter(Fielding,
yearID==season, POS!="P"), playerID))
merge(B, NonPitchers, by="playerID")
}
## ------------------------------------------------------------------------
fit.model <- function(data){
mode <- laplace(betabinexch, c(1, 1),
cbind(data$y, data$n))$mode
eta <- exp(mode[1]) / (1 + exp(mode[1]))
K <- exp(mode[2])
list(eta=eta, K=K,
d=data.frame(data, est=(data$y + K * eta) / (data$n + K)))
}
## ------------------------------------------------------------------------
d <- get.data(season)
## ------------------------------------------------------------------------
S.SO <- fit.model(data.frame(playerID=d$playerID,
y=d$SO, n=d$AB))
S.HR <- fit.model(data.frame(playerID=d$playerID,
y=d$HR, n=d$AB - d$SO))
S.H <- fit.model(data.frame(playerID=d$playerID,
y=d$H - d$HR,
n=d$AB - d$HR - d$SO))
S <- merge(S.SO$d, S.HR$d, by="playerID")
S <- merge(S, S.H$d, by="playerID")
names(S) <- c("playerID", "SO", "AB", "SO.Rate",
"HR", "AB.SO", "HR.Rate",
"H.HR", "AB.SO.HR", "H.Rate")
component.fit <- data.frame(eta=c(S.SO$eta, S.HR$eta, S.H$eta),
K=c(S.SO$K, S.HR$K, S.H$K))
row.names(component.fit) <- c("SO", "HR", "H")
## ------------------------------------------------------------------------
S$Est <- with(S,
(1 - SO.Rate) * (HR.Rate + (1 - HR.Rate) * H.Rate))
## ------------------------------------------------------------------------
S2 <- fit.model(data.frame(playerID=d$playerID,
y=d$H, n=d$AB))
shrinkage.fit <- c(eta=S2$eta, K=S2$K)
S <- merge(S, S2$d, by="playerID")
names(S)[c(11:14)] <- c("Comp.Est", "H", "AB1", "Shrinkage.Est")
S$Season <- season
list(S=S, component=component.fit, shrinkage=shrinkage.fit)
}
plot_avg_results <- function(fitwork){
# function constructs plot of observed and predicted one.minus.strikeout.rates and his.in.non.so.ab rates
# from output of fit_component_average function
require(ggplot2)
require(dplyr)
S <- fitwork$S
myf <- function(x, p) p / x
AVG1 <- data.frame(P=.25, x=seq(.6, 1, .01), y=myf(seq(.6, 1, .01), .25))
AVG2 <- data.frame(P=.3, x=seq(.6, 1, .01), y=myf(seq(.6, 1, .01), .3))
AVG3 <- data.frame(P=.2, x=seq(.6, 1, .01), y=myf(seq(.6, 1, .01), .2))
AVG4 <- data.frame(P=.35, x=seq(.6, 1, .01), y=myf(seq(.6, 1, .01), .35))
AVG5 <- data.frame(P=.4, x=seq(.6, 1, .01), y=myf(seq(.6, 1, .01), .4))
AVG <- rbind(AVG1, AVG2, AVG3, AVG4, AVG5)
AVG <- mutate(AVG, BA=factor(P))
d1 <- data.frame(Type="Observed BA",
One.Minus.SO.Rate=1 - S$SO / S$AB,
H.Rate=S$H / S$AB.SO,
AB=S$AB)
d2 <- data.frame(Type="Predicted BA",
One.Minus.SO.Rate=1 - S$SO.Rate,
H.Rate=S$HR.Rate + (1 - S$HR.Rate) * S$H.Rate,
AB=S$AB)
dd <- rbind(d1, d2)
dd1 <- filter(dd, AB >= 300)
Xlow <- range(dd1$One.Minus.SO.Rate)
Ylow <- range(dd1$H.Rate)
p <- ggplot(dd1, aes(One.Minus.SO.Rate, H.Rate)) +
geom_point() +
facet_wrap(~ Type, ncol=1) +
geom_line(data=AVG, aes(x, y, color=BA)) +
ylim(Ylow[1], Ylow[2]) + xlim(Xlow[1], Xlow[2]) +
ggtitle(paste("Hitters with at least 300 AB in",S$Season,"Season" )) +
labs(y = "Hits in Non-SO-AB Rate")
print(p)
}
fit_component_obp <- function(season){
# implement component method for estimating group of on-base percentages for nonpitchers
# for a particular baseball season
#
# input: season
# output: a list with components
# component -- gives values of K and eta for all component fits
# shrinkage -- gives value of K and eta for shrinkage fit
# ST -- gives raw stats and component and shrinkage estimates for all players
require(Lahman)
require(dplyr)
require(LearnBayes)
get.data <- function(season){
B <- summarize(group_by(filter(Batting,
yearID==season), playerID),
PA=sum(AB + BB + HBP),
AB=sum(AB), BB=sum(BB + HBP), H=sum(H),
SO=sum(SO), HR=sum(HR))
NonPitchers <- unique(select(filter(Fielding,
yearID==season, POS!="P"),
playerID))
merge(B, NonPitchers, by="playerID")
}
## ------------------------------------------------------------------------
fit.model <- function(data){
mode <- laplace(betabinexch, c(1, 1),
cbind(data$y, data$n))$mode
eta <- exp(mode[1]) / (1 + exp(mode[1]))
K <- exp(mode[2])
list(eta=eta, K=K,
d=data.frame(data, est=(data$y + K * eta) / (data$n + K)))
}
## ------------------------------------------------------------------------
d <- get.data(season)
## ------------------------------------------------------------------------
S.BB <- fit.model(data.frame(playerID=d$playerID,
y=d$BB, n=d$PA))
S.SO <- fit.model(data.frame(playerID=d$playerID,
y=d$SO, n=d$AB))
S.HR <- fit.model(data.frame(playerID=d$playerID,
y=d$HR, n=d$AB - d$SO))
S.H <- fit.model(data.frame(playerID=d$playerID,
y=d$H - d$HR,
n=d$AB - d$HR - d$SO))
S <- merge(S.BB$d, S.SO$d, by="playerID")
S <- merge(S, S.HR$d, by="playerID")
names(S) <- c("playerID", "BB", "PA", "BB.Rate",
"SO", "AB", "SO.Rate",
"HR", "AB.SO", "HR.Rate")
S <- merge(S, S.H$d, by="playerID")
names(S)[11:13] <- c("HIP", "IP", "HIP.Rate")
S$BAEst <- with(S,
(1 - SO.Rate) * (HR.Rate + (1 - HR.Rate) * HIP.Rate))
S$Est <- with(S,
BB.Rate + (1 - BB.Rate) * BAEst)
component.fit <- data.frame(eta=c(S.BB$eta, S.SO$eta,
S.HR$eta, S.H$eta),
K=c(S.BB$K, S.SO$K, S.HR$K, S.H$K))
row.names(component.fit) <- c("BB", "SO", "HR", "H")
## ------------------------------------------------------------------------
S2 <- fit.model(data.frame(playerID=d$playerID,
y=(d$H + d$BB),
n=d$PA))
shrinkage.fit <- c(eta=S2$eta, K=S2$K)
ST <- merge(S, S2$d, by="playerID")
names(ST)[c(15, 18)] <- c("Combo.Est", "Shrinkage.Est")
ST$Season <- season
list(ST=ST, component=component.fit, shrinkage=shrinkage.fit)
}
plot_obp_results <- function(fitwork){
# function constructs plot of observed and predicted walk and hit rates
# from output of fit_component_obp function
require(ggplot2)
require(dplyr)
ST <- fitwork$ST
myf <- function(x, p) (p - x) / (1 - x)
AVG1 <- data.frame(P=.25, x=seq(.02, .3, .01), y=myf(seq(.02, .3, .01), .25))
AVG2 <- data.frame(P=.3, x=seq(.02, .3, .01), y=myf(seq(.02, .3, .01), .3))
AVG3 <- data.frame(P=.35, x=seq(.02, .3, .01), y=myf(seq(.02, .3, .01), .35))
AVG4 <- data.frame(P=.4, x=seq(.02, .3, .01), y=myf(seq(.02, .3, .01), .4))
AVG5 <- data.frame(P=.45, x=seq(.02, .3, .01), y=myf(seq(.02, .3, .01), .45))
AVG <- rbind(AVG1, AVG2, AVG3, AVG4, AVG5)
AVG <- mutate(AVG, OBP=factor(P))
d1 <- data.frame(Type="Observed OBP",
Walk.Rate=ST$BB / ST$PA,
H.Rate=(ST$HIP + ST$HR) / ST$AB,
PA=ST$PA)
d2 <- data.frame(Type="Predicted OBP",
Walk.Rate=ST$BB.Rate,
H.Rate=ST$BAEst,
PA=ST$PA)
dd <- rbind(d1, d2)
dd1 <- filter(dd, PA >= 300)
Xlim <- range(dd1$Walk.Rate); Ylim <- range(dd1$H.Rate)
p <- ggplot(dd1, aes(Walk.Rate, H.Rate)) +
geom_point() +
facet_wrap(~ Type, ncol=1) +
geom_line(data=AVG, aes(x, y, color=OBP)) +
ylim(Ylim[1], Ylim[2]) + xlim(Xlim[1], Xlim[2]) +
ggtitle(paste("Hitters with at least 300 PA in",ST$Season,"Season")) +
labs(x = "Walk Rate", y = "Hit Rate")
print(p)
}
fit_component_fip <- function(season){
# implement component method for estimating group of FIPs for pitchers in a particular season
#
# input: season
# output:
# S -- gives raw stats and component and shrinkage estimates for all pitches who face at least 50 batters
require(Lahman)
require(dplyr)
require(LearnBayes)
get.data <- function(season){
summarize(group_by(filter(Pitching, yearID==season), playerID),
PA=sum(BFP),
AB=sum(BFP - BB - HBP, na.rm=TRUE),
SO=sum(SO, na.rm=TRUE),
BB=sum(BB + HBP, na.rm=TRUE),
HR=sum(HR),
H=sum(H),
IP=sum(IPouts) / 3)
}
fit.model <- function(data){
mode <- laplace(betabinexch, c(1, 1),
cbind(data$y, data$n))$mode
eta <- exp(mode[1]) / (1 + exp(mode[1]))
K <- exp(mode[2])
list(eta=eta, K=K,
d=data.frame(data, est=(data$y + K * eta) / (data$n + K)))
}
d <- filter(get.data(season), PA >= 50)
S.BB <- fit.model(data.frame(playerID=d$playerID,
y=d$BB, n=d$PA))
names(S.BB$d) <- c("playerID", "BB", "PA", "BB.Rate")
S.SO <- fit.model(data.frame(playerID=d$playerID,
y=d$SO, n=d$AB))
names(S.SO$d) <- c("playerID", "SO", "AB", "SO.Rate")
S.HR <- fit.model(data.frame(playerID=d$playerID,
y=d$HR, n=d$AB - d$SO))
names(S.HR$d) <- c("playerID", "HR", "AB.minus.SO", "HR.Rate")
S.H <- fit.model(data.frame(playerID=d$playerID,
y=d$H - d$HR,
n=d$AB - d$SO - d$HR))
names(S.H$d) <- c("playerID", "HIP", "OIP", "HIP.Rate")
S <- merge(S.BB$d, S.SO$d, by="playerID")
S <- merge(S, S.HR$d, by="playerID")
S <- merge(S, S.H$d, by="playerID")
S$Est <- with(S,
(39 * (1 - BB.Rate) * (1 - SO.Rate) * HR.Rate +
9 * BB.Rate - 6 * (1 - BB.Rate) * SO.Rate) /
((1 - BB.Rate) * (SO.Rate +
(1 - SO.Rate) * (1 - HR.Rate) * (1 - HIP.Rate))))
S$Obs <- with(S,
((13 * HR) + (3 * BB) - (2 * SO))) / d$IP
vary <- 5.1 ^ 2 / d$IP
dd <- cbind(S$Obs, vary)
dd <- dd[complete.cases(dd), ]
fit <- laplace(normnormexch, c(0, 0), dd)
mn <- fit$mode[1]
tau <- exp(fit$mode[2])
S$shrink <- (S$Obs / vary + mn / tau ^ 2) /
(1 / vary + 1 / tau ^ 2)
S
}
plot_fip_results <- function(S){
# function constructs plot of observed and predicted FIP measures
# from output of fit_component_fip function
library(ggplot2)
d1 <- data.frame(PA=S$PA, FIP=S$Obs, Type="Observed")
d2 <- data.frame(PA=S$PA, FIP=S$Est, Type="Estimated")
d3 <- data.frame(PA=S$PA, FIP=S$shrink, Type="Shrinkage")
d <- rbind(d1, d2, d3)
p1 <- ggplot(filter(d, Type == "Observed" | Type == "Shrinkage"),
aes(PA, FIP, color=Type)) + geom_point()
p2 <- ggplot(filter(d, Type == "Estimated" | Type == "Shrinkage"),
aes(PA, FIP, color=Type)) + geom_point()
print(p1)
print(p2)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment