Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created October 21, 2019 22:01
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/5387d8eef308b3a772da7bdfcfd27c90 to your computer and use it in GitHub Desktop.
Save bayesball/5387d8eef308b3a772da7bdfcfd27c90 to your computer and use it in GitHub Desktop.
Exploration of streakiness of 2019 sluggers
library(tidyverse)
# read in PA data for all plays in the 2019 season
sc <- read_csv("http://www-math.bgsu.edu/~albert/statcast/statcast2019.csv")
# function to detect streakiness using a specific player and a specific streaky stat
streaky_detect <- function(name,
stat = sd){
##### streakiness of observed data
mt <- filter(sc, player_name == name)
d <- ifelse(mt$events =="home_run", 1, 0)
out <- rle(1 - d)
ofers <- out$lengths[out$values == 1]
mt_streak <- stat(ofers)
##### simulating streakiness of similar coins
p <- mean(d)
n <- length(d)
simulating <- function(){
coinflips <- rbernoulli(n, p)
out <- rle(1 - coinflips)
stat(out$lengths[out$values == 1])
}
SD <- replicate(1000, simulating())
###### compute a tail-probability
c(N = length(d), HR = sum(d), P = p,
S = mt_streak,
P_Value = mean(SD >= mt_streak))
}
# top home run hitters
HR_min <- 30
sc %>%
group_by(player_name) %>%
summarize(HR = sum(events == "home_run")) %>%
filter(HR >= HR_min) %>%
arrange(desc(HR)) %>%
head()
# find all players with at least HR_min home runs in 2019
sc %>%
group_by(player_name) %>%
summarize(HR = sum(events == "home_run")) %>%
filter(HR >= HR_min) %>%
pull(player_name) -> P_Names
# find all associated p-values
sapply(P_Names, streaky_detect) -> theout
# plot output
Results <- data.frame(Player = P_Names,
PA = theout[1, ],
HR = theout[2, ],
P = theout[3, ],
S = theout[4, ],
P_Value = theout[5, ])
row.names(Results) <- NULL
ggplot(Results, aes(Player, P_Value)) +
geom_point() +
coord_flip()
# plot from least streaky to most streaky
Results$Player <- factor(Results$Player,
levels = Results$Player[order(- Results$P_Value)])
ggplot(Results, aes(Player, P_Value)) +
geom_point(color = "red") +
coord_flip() +
ggtitle("P Values of Maximum Ofers for 58 Sluggers in 2019 Season")
# try another streaky stat
sapply(P_Names, streaky_detect,
stat = max) -> theout
Results <- data.frame(Player = P_Names,
PA = theout[1, ],
HR = theout[2, ],
P = theout[3, ],
S = theout[4, ],
P_Value = theout[5, ])
row.names(Results) <- NULL
Results$Player <- factor(Results$Player,
levels = Results$Player[order(- Results$P_Value)])
ggplot(Results, aes(Player, P_Value)) +
geom_point() +
coord_flip()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment