Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created January 1, 2018 18:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save bayesball/bacb10e199b827c2de3b64e00c7d02ee to your computer and use it in GitHub Desktop.
Save bayesball/bacb10e199b827c2de3b64e00c7d02ee to your computer and use it in GitHub Desktop.
R script for Using Statcast to Measure Hitters post
# Packages: tidyverse for data wrangling and plotting, mgcv for the GAM fit.
library(tidyverse)
library(mgcv)

# Shared theme for plot titles (blue, size 18, horizontally centered).
TH <- theme(
  plot.title = element_text(
    colour = "blue", size = 18, hjust = 0.5, vjust = 0.8, angle = 0
  )
)

# Read in the 2017 Statcast data.
sc2017 <- read_csv("statcast2017.csv")

# Keep only balls in play (type "X") and define the hit indicator:
# 1 for a single, double, triple, or home run, and 0 otherwise.
sc2017_ip <- sc2017 %>%
  filter(type == "X") %>%
  mutate(
    hit = as.numeric(
      events %in% c("single", "double", "triple", "home_run")
    )
  )

# Fit a GAM with a binomial family (logistic link): hit as a joint smooth
# function of launch speed and launch angle.
fit <- gam(
  hit ~ s(launch_speed, launch_angle),
  family = binomial,
  data = sc2017_ip
)
# Summarize one batter's balls in play against the fitted model.
#
# Arguments:
#   id  value matched against the `batter` column of sc2017_ip.
#
# Returns a numeric vector of length 3:
#   (number of batted balls, number of hits, number of predicted hits),
# where the predicted hits are the sum of the fitted hit probabilities.
#
# Relies on the script-level objects `sc2017_ip` (balls in play) and
# `fit` (the binomial GAM).
one_player <- function(id) {
  d <- sc2017_ip %>%
    filter(batter == id) %>%
    select(player_name, events, launch_speed, launch_angle, hit)
  # type = "response" returns probabilities directly. This replaces the
  # hand-rolled exp(x) / (1 + exp(x)), which gives NaN once exp(x) overflows.
  # NOTE(review): rows with a missing launch_speed or launch_angle presumably
  # get an NA prediction that propagates into the sum -- confirm whether the
  # data contains such rows.
  d$Predict <- as.numeric(predict(fit, d, type = "response"))
  c(nrow(d), sum(d$hit), sum(d$Predict))
}
# Apply one_player() to every batter, then compute Z scores that contrast
# the observed and expected numbers of hits.
unique_ids <- unique(sc2017_ip$batter)

# vapply() guarantees a 3 x (number of batters) numeric matrix; sapply()
# would silently change shape if a call returned something unexpected or
# if there were no batters at all.
S <- vapply(unique_ids, one_player, numeric(3))
S1 <- data.frame(
  Batter = unique_ids,
  N = S[1, ],
  H = S[2, ],
  Expected = S[3, ]
)

# One name per batter id (the first player_name seen for that id).
SC <- sc2017_ip %>%
  group_by(batter) %>%
  summarize(Player = first(player_name))

# Attach names and standardize the residual: Z = (H - Expected) / sqrt(Expected).
S2 <- inner_join(S1, SC, by = c("Batter" = "batter"))
S2$Z <- with(S2, (H - Expected) / sqrt(Expected))

# Three graphs comparing observed and expected hits.
library(ggrepel)

# 1. Observed against expected hits, with a smoothing curve.
ggplot(S2, aes(Expected, H, label = Player)) +
  geom_point() +
  geom_smooth(color = "red") +
  TH +
  ggtitle("Observed and Expected Hits for All Players")

# 2. Residuals (observed minus expected) against number of batted balls.
ggplot(S2, aes(N, H - Expected, label = Player)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") +
  TH +
  ggtitle("Graph of Residuals")

# 3. Standardized scores against number of batted balls; players with
#    Z > 2 or Z < -2 are labeled. ylim() restricts the plotted range to
#    [-2.5, 4].
ggplot(S2, aes(N, Z, label = Player)) +
  geom_point() +
  geom_label_repel(data = filter(S2, Z > 2)) +
  geom_label_repel(data = filter(S2, Z < -2)) +
  ylim(-2.5, 4) +
  geom_hline(yintercept = 0, color = "red") +
  TH +
  ggtitle("Graph of Standardized Scores")
# Look at one player's hits more carefully.
#
# Prints a two-panel scatterplot of launch speed against launch angle for
# the batted balls of the player named `pname`: the "Actual" panel colors
# points by the observed hit indicator, the "Predicted" panel by whether the
# fitted hit probability exceeds 0.5.
#
# Arguments:
#   pname  value matched against the `player_name` column of sc2017_ip.
#
# Returns the player's data with columns launch_angle, launch_speed, hit,
# P (fitted hit probability), and Predict (1 if P > 0.5, else 0).
#
# Relies on the script-level objects `sc2017_ip` and `fit`.
one_player_graph <- function(pname) {
  title_theme <- theme(
    plot.title = element_text(
      colour = "blue", size = 18, hjust = 0.5, vjust = 0.8, angle = 0
    )
  )

  dg <- sc2017_ip %>%
    filter(player_name == pname) %>%
    select(launch_angle, launch_speed, hit)

  # type = "response" returns probabilities directly. This replaces the
  # hand-rolled exp(x) / (1 + exp(x)), which gives NaN once exp(x) overflows.
  dg$P <- as.numeric(predict(fit, dg, type = "response"))
  dg$Predict <- ifelse(dg$P > 0.5, 1, 0)

  # Stack the same coordinates twice: once with the actual outcome and once
  # with the predicted outcome, so the two can be shown in separate facets.
  coords <- select(dg, launch_angle, launch_speed)
  plot_data <- bind_rows(
    mutate(coords, Outcome = dg$hit, Type = "Actual"),
    mutate(coords, Outcome = dg$Predict, Type = "Predicted")
  )
  plot_data$Outcome <- as.factor(plot_data$Outcome)

  # Explicit print() so the plot is drawn even though the function returns
  # the data rather than the plot object.
  print(
    ggplot(plot_data, aes(launch_angle, launch_speed, color = Outcome)) +
      geom_jitter() +
      facet_wrap(~ Type, ncol = 1) +
      ggtitle(pname) +
      title_theme
  )

  dg
}
# Draw the actual-versus-predicted graphs for Dee Gordon and Miguel Cabrera,
# keeping the data returned for each player.
dg <- one_player_graph("Dee Gordon")
mc <- one_player_graph("Miguel Cabrera")

# For each player, total the actual hits (H) and the predicted hit
# probabilities (P).
dg %>% summarize(H = sum(hit), P = sum(P))
mc %>% summarize(H = sum(hit), P = sum(P))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment