Created
October 21, 2019 22:01
-
-
Save bayesball/5387d8eef308b3a772da7bdfcfd27c90 to your computer and use it in GitHub Desktop.
Exploration of streakiness of 2019 sluggers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
# read in PA data for all plays in the 2019 season | |
sc <- read_csv("http://www-math.bgsu.edu/~albert/statcast/statcast2019.csv") | |
# function to detect streakiness using a specific player and a specific streaky stat | |
streaky_detect <- function(name, | |
stat = sd){ | |
##### streakiness of observed data | |
mt <- filter(sc, player_name == name) | |
d <- ifelse(mt$events =="home_run", 1, 0) | |
out <- rle(1 - d) | |
ofers <- out$lengths[out$values == 1] | |
mt_streak <- stat(ofers) | |
##### simulating streakiness of similar coins | |
p <- mean(d) | |
n <- length(d) | |
simulating <- function(){ | |
coinflips <- rbernoulli(n, p) | |
out <- rle(1 - coinflips) | |
stat(out$lengths[out$values == 1]) | |
} | |
SD <- replicate(1000, simulating()) | |
###### compute a tail-probability | |
c(N = length(d), HR = sum(d), P = p, | |
S = mt_streak, | |
P_Value = mean(SD >= mt_streak)) | |
} | |
# top home run hitters | |
HR_min <- 30 | |
sc %>% | |
group_by(player_name) %>% | |
summarize(HR = sum(events == "home_run")) %>% | |
filter(HR >= HR_min) %>% | |
arrange(desc(HR)) %>% | |
head() | |
# find all players with at least HR_min home runs in 2019 | |
sc %>% | |
group_by(player_name) %>% | |
summarize(HR = sum(events == "home_run")) %>% | |
filter(HR >= HR_min) %>% | |
pull(player_name) -> P_Names | |
# find all associated p-values | |
sapply(P_Names, streaky_detect) -> theout | |
# plot output | |
Results <- data.frame(Player = P_Names, | |
PA = theout[1, ], | |
HR = theout[2, ], | |
P = theout[3, ], | |
S = theout[4, ], | |
P_Value = theout[5, ]) | |
row.names(Results) <- NULL | |
ggplot(Results, aes(Player, P_Value)) + | |
geom_point() + | |
coord_flip() | |
# plot from least streaky to most streaky | |
Results$Player <- factor(Results$Player, | |
levels = Results$Player[order(- Results$P_Value)]) | |
ggplot(Results, aes(Player, P_Value)) + | |
geom_point(color = "red") + | |
coord_flip() + | |
ggtitle("P Values of Maximum Ofers for 58 Sluggers in 2019 Season") | |
# try another streaky stat | |
sapply(P_Names, streaky_detect, | |
stat = max) -> theout | |
Results <- data.frame(Player = P_Names, | |
PA = theout[1, ], | |
HR = theout[2, ], | |
P = theout[3, ], | |
S = theout[4, ], | |
P_Value = theout[5, ]) | |
row.names(Results) <- NULL | |
Results$Player <- factor(Results$Player, | |
levels = Results$Player[order(- Results$P_Value)]) | |
ggplot(Results, aes(Player, P_Value)) + | |
geom_point() + | |
coord_flip() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment