# R code for "cleaning" Statcast data
# Read in Statcast (SC) data from the 2019 season,
# through games of June 23, 2019.
library(tidyverse) | |
library(CalledStrike) | |
# Read the Statcast CSV from its local path.
sc2019 <- read_csv(
  "~/Dropbox/2016 WORK/BLOG Baseball R/OTHER/StatcastData/statcast2019.csv"
)

# Keep only the in-play records, i.e. the rows whose `type` is "X".
sc2019_ip <- sc2019 %>%
  filter(type == "X")
# Look for unusual values of launch speed and launch angle in the
# in-play data.

# Bar chart: number of rows (N) observed at each distinct launch speed.
sc2019_ip %>%
  count(launch_speed, name = "N") %>%
  ggplot(mapping = aes(x = launch_speed, y = N)) +
  geom_col() +
  increasefont() +
  labs(title = "Distribution of 2019 Launch Speeds") +
  centertitle()

# Bar chart: number of rows (N) observed at each distinct launch angle.
sc2019_ip %>%
  count(launch_angle, name = "N") %>%
  ggplot(mapping = aes(x = launch_angle, y = N)) +
  geom_col() +
  increasefont() +
  labs(title = "Distribution of 2019 Launch Angles") +
  centertitle()
# Pick out the over-represented (launch_speed, launch_angle) pairs.
#
# Counts the rows of `d` for every distinct combination of launch_speed and
# launch_angle and keeps the combinations that occur at least `n_min` times.
#
# Arguments:
#   d      data frame with `launch_speed` and `launch_angle` columns
#   n_min  minimum frequency for a pair to be reported (default 80); the
#          comparison is inclusive, i.e. pairs with N >= n_min are kept
#
# Returns an ungrouped data frame with one row per retained pair and the
# columns launch_speed, launch_angle and N (the frequency of the pair).
find_extreme <- function(d, n_min = 80) {
  d %>%
    # Drop any grouping already on `d` so the counts depend only on the pair.
    ungroup() %>%
    # count() returns an ungrouped result, unlike group_by() + summarize(),
    # which would leave the output grouped by launch_speed.
    count(launch_speed, launch_angle, name = "N") %>%
    filter(N >= n_min)
}
# Use a frequency of 100 as the cutoff, and list the flagged
# (launch_speed, launch_angle) pairs from most to least frequent.
o19 <- arrange(
  find_extreme(sc2019_ip, n_min = 100),
  desc(N)
)
# Remove from the in-play data every row whose (launch_speed, launch_angle)
# pair is listed in o19.

# Tag each in-play row with its position so flagged rows can be referred to
# by number.
sc2019_ip <- sc2019_ip %>%
  mutate(Row = row_number())

# Row numbers of the in-play records that match the j-th pair in o19.
# Reads the global objects sc2019_ip and o19.
# Note: a pair containing NA matches nothing, because `==` against NA
# yields NA and filter() drops rows whose condition is NA.
find_rows_one_pair <- function(j) {
  sc2019_ip %>%
    filter(
      launch_speed == o19$launch_speed[j],
      launch_angle == o19$launch_angle[j]
    ) %>%
    pull(Row)
}

# Collect the row numbers for all flagged pairs.
# seq_len() gives zero iterations when o19 is empty (1:nrow(o19) would visit
# 1 and then 0), and lapply() + unlist() avoids growing a vector in a loop.
row_numbers <- unlist(lapply(seq_len(nrow(o19)), find_rows_one_pair))

# Keep only the rows that were not flagged.
sc2019_clean <- sc2019_ip %>%
  filter(!(Row %in% row_numbers))
# Repeat the two bar charts using the cleaned data.

# Bar chart: number of rows (N) at each distinct launch speed after cleaning.
sc2019_clean %>%
  count(launch_speed, name = "N") %>%
  ggplot(mapping = aes(x = launch_speed, y = N)) +
  geom_col() +
  increasefont() +
  labs(title = "Distribution of 2019 Launch Speeds") +
  centertitle()

# Bar chart: number of rows (N) at each distinct launch angle after cleaning.
sc2019_clean %>%
  count(launch_angle, name = "N") %>%
  ggplot(mapping = aes(x = launch_angle, y = N)) +
  geom_col() +
  increasefont() +
  labs(title = "Distribution of 2019 Launch Angles") +
  centertitle()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment