Created
October 10, 2020 21:23
-
-
Save bayesball/b4b88f649003e2cd74ff1e3a0fb4e95e to your computer and use it in GitHub Desktop.
Streaky measures of hitting from Statcast data.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Statcast Streaky Analysis" | |
author: "Jim Albert" | |
date: "10/5/2020" | |
output: html_document | |
--- | |
```{r setup, include=FALSE} | |
knitr::opts_chunk$set(echo = TRUE, | |
message = FALSE, | |
warning = FALSE) | |
``` | |
#### Read in Data | |
Read in statcast data for 2020 season; | |
```{r} | |
library(tidyverse) | |
sc <- read_csv("~/Dropbox/2020 WORK/statcast2020/sc2020final.csv") | |
``` | |
#### Main Functions | |
Here is the main function to compute the streaky statistics described in the post. | |
The inputs to ```streak_data3()``` are the statcast dataset and the player of interest. | |
The output is a list with four components: | |
- Swing - data frame with the swing and miss data | |
- InPlay - data frame with the BIP data | |
- pv - p-value of the permutation test | |
- r2 - value of the R^2 statistic | |
```{r} | |
streak_data3 <- function(sc, player){ | |
require(tidyverse) | |
require(BayesTestStreak) | |
require(lubridate) | |
pdata <- filter(sc, player_name == player) | |
swing_situations <- c("bunt_foul_tip", | |
"foul", "foul_bunt", "foul_tip", | |
"hit_into_play", "hit_into_play_no_out", | |
"hit_into_play_score", "missed_bunt", | |
"swinging_strike", "swinging_strike_blocked") | |
pdata %>% | |
filter(description %in% swing_situations) %>% | |
mutate(Miss = ifelse(description %in% | |
c("swinging_strike", | |
"swinging_strike_blocked"), 1, 0), | |
Contact = 1 - Miss, | |
Count = paste(balls, strikes, sep = "-")) -> | |
pdata_swing | |
pdata_swing %>% | |
arrange(game_date, | |
game_pk, | |
at_bat_number, | |
pitch_number) -> pdata_swing | |
pdata_inplay <- filter(pdata_swing, type == "X") | |
pdata_inplay$prob <- | |
as.numeric(pdata_inplay$estimated_ba_using_speedangle) | |
pdata_inplay$Week <- | |
factor(week(ymd(pdata_inplay$game_date))) | |
fit <- lm(prob ~ Week, | |
data = pdata_inplay) | |
r2 <- summary(fit)$r.squared | |
df1 <- data.frame(Swing = 1:dim(pdata_swing)[1], | |
Miss = pdata_swing$Miss) | |
df2 <- data.frame(Inplay = 1:dim(pdata_inplay)[1], | |
Week = pdata_inplay$Week, | |
Prob = pdata_inplay$prob) | |
pv <- permutation.test(df1$Miss) | |
list(Player = player, | |
Swing = df1, | |
InPlay = df2, | |
pv = pv, | |
r2 = r2) | |
} | |
``` | |
The function ```make_plots()``` will create associated graphs from the output of the ```streak_data3()``` function. | |
```{r} | |
make_plots <- function(sd){ | |
require(BayesTestStreak) | |
require(gridExtra) | |
require(ProbBayes) | |
p1 <- plot_streak_data(sd$Swing$Miss) + | |
ggtitle(paste(sd$Player, | |
"Misses on Swings: P-Value =", | |
sd$pv)) + | |
centertitle() | |
p2 <- ggplot(sd$InPlay, | |
aes(Week, Prob)) + | |
xlab("Week") + | |
geom_boxplot() + | |
ggtitle(paste(sd$Player, | |
"P(H) on BIP: R2 =", | |
round(sd$r2, 3))) + | |
centertitle() | |
grid.arrange(p1, p2, ncol = 1) | |
} | |
``` | |
#### Example | |
Here is an example of using the functions for Bryce Harper. | |
```{r} | |
streak_data3(sc, "Bryce Harper") %>% make_plots() | |
``` | |
#### Many Players | |
Want to collect streaky stats for many players. Write function that will collect the p-value and R squared statistics for a given player | |
```{r} | |
collect_pvs <- function(player){ | |
out <- streak_data3(sc, player) | |
c(out$pv, out$r2) | |
} | |
``` | |
Collect number of pitches for all players. | |
```{r} | |
sc %>% | |
group_by(player_name) %>% | |
count() -> S | |
``` | |
We focus on hitters who faced at least 600 pitches during the 2020 season. | |
```{r} | |
player_list <- filter(S, n >= 600) %>% | |
pull(player_name) | |
``` | |
Collect stats for many hitters by use of the ```sapply()``` function. | |
```{r} | |
out <- sapply(player_list, collect_pvs) | |
out_df <- data.frame(Player = player_list, | |
Swing_PV = out[1, ], | |
XBA_R2 = out[2, ]) | |
row.names(out_df) <- NULL | |
``` | |
Construct some graphs of these streaky statistics: | |
```{r} | |
ggplot(out_df, aes(Swing_PV, XBA_R2)) + | |
geom_point() | |
ggplot(out_df, aes(Swing_PV)) + | |
geom_density() | |
ggplot(out_df, aes(XBA_R2)) + | |
geom_density() | |
``` | |
Look at several interesting streaky hitters with respect to R2. | |
Output the players whose R^2 value exceeeds 0.2. | |
```{r} | |
filter(out_df, XBA_R2 > .2) | |
``` | |
Here are the plots for these streaky players. | |
```{r} | |
streak_data3(sc, "Luis Robert") %>% | |
make_plots() | |
streak_data3(sc, "Daniel Vogelbach") %>% | |
make_plots() | |
streak_data3(sc, "Gleyber Torres") %>% | |
make_plots() | |
``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment