Created
June 26, 2019 12:45
-
-
Save bayesball/88a372ed29714de7915e07e2ca2b8102 to your computer and use it in GitHub Desktop.
R code for "cleaning" Statcast data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the 2019 Statcast data (games through June 23, 2019).
library(tidyverse)
library(CalledStrike)
sc2019 <- read_csv(
  "~/Dropbox/2016 WORK/BLOG Baseball R/OTHER/StatcastData/statcast2019.csv"
)
# Keep only the rows whose type is "X"; these are the in-play rows that
# the rest of the script works with.
sc2019_ip <- sc2019 %>%
  filter(type == "X")
# Working with the 2019 in-play data, look for odd values of launch speed
# and launch angle.
# First: a bar chart of how many times each distinct launch speed occurs.
sc2019_ip %>%
  count(launch_speed, name = "N") %>%
  ggplot(mapping = aes(x = launch_speed, y = N)) +
  geom_col() +
  increasefont() +
  ggtitle("Distribution of 2019 Launch Speeds") +
  centertitle()
# Same bar chart for launch angle: one bar per distinct value, with the
# bar height giving the number of in-play rows that have that value.
sc2019_ip %>%
  count(launch_angle, name = "N") %>%
  ggplot(mapping = aes(x = launch_angle, y = N)) +
  geom_col() +
  increasefont() +
  ggtitle("Distribution of 2019 Launch Angles") +
  centertitle()
# Pick out the over-represented (launch_speed, launch_angle) pairs.
#
# Counts the rows of `d` that share each distinct pair of values and keeps
# the pairs whose count is at least `n_min`.
#
# Arguments:
#   d      data frame with `launch_speed` and `launch_angle` columns
#   n_min  smallest count for a pair to be kept (default 80)
#
# Returns a data frame with columns `launch_speed`, `launch_angle` and
# `N` (the count), one row per retained pair.
find_extreme <- function(d, n_min = 80) {
  # count() tallies each pair and leaves no grouping behind; the earlier
  # group_by() + summarize() returned a result still grouped by
  # launch_speed.
  pair_counts <- dplyr::count(d, launch_speed, launch_angle, name = "N")
  dplyr::filter(pair_counts, N >= n_min)
}
# Use a frequency of 100 as the cutoff, and list the flagged pairs from
# the most frequent to the least frequent.
o19 <- sc2019_ip %>%
  find_extreme(n_min = 100) %>%
  arrange(desc(N))
# Remove the flagged (launch_speed, launch_angle) pairs from the in-play
# data.

# Tag every in-play row with its position so that flagged rows can be
# identified by number.
sc2019_ip <- sc2019_ip %>%
  mutate(Row = row_number())

# Row numbers of the in-play rows whose pair appears in o19. One filtering
# join replaces the per-pair loop, and it also copes with an o19 that has
# no rows (nothing is flagged). na_matches = "never" mirrors the `==`
# comparison used before, under which a missing value never matches.
row_numbers <- sc2019_ip %>%
  semi_join(
    o19,
    by = c("launch_speed", "launch_angle"),
    na_matches = "never"
  ) %>%
  pull(Row)

# Everything that was not flagged is the cleaned data.
sc2019_clean <- sc2019_ip %>%
  filter(!Row %in% row_numbers)
# Redraw the graphs using the cleaned data.
# Bar chart of how many times each distinct launch speed occurs.
sc2019_clean %>%
  count(launch_speed, name = "N") %>%
  ggplot(mapping = aes(x = launch_speed, y = N)) +
  geom_col() +
  increasefont() +
  ggtitle("Distribution of 2019 Launch Speeds") +
  centertitle()
# Bar chart of how many times each distinct launch angle occurs in the
# cleaned data.
sc2019_clean %>%
  count(launch_angle, name = "N") %>%
  ggplot(mapping = aes(x = launch_angle, y = N)) +
  geom_col() +
  increasefont() +
  ggtitle("Distribution of 2019 Launch Angles") +
  centertitle()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment