Last active
July 5, 2020 18:07
-
-
Save bayesball/cdf7f15be6d116d7035b8ba36957d82f to your computer and use it in GitHub Desktop.
R function to find the longest stretches of .400 hitting among all batters in a single season
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Inputs to function best_streaks() are | |
# - retro_data - Retrosheet play-by-play data for a single season | |
# (this page describes how to download Retrosheet data for a single season) | |
# https://bayesball.github.io/VB/Getting_Retrosheet_Files.html | |
# - minAB - consider all hitters who have at least minAB at-bats that season | |
# - numtop - the number of top performers to display | |
# Note: the only special function I am using is rowapply() from the zoo package | |
best_streaks <- function(retro_data, | |
minAB = 300, | |
numtop = 5){ | |
# load several packages | |
library(tidyverse) | |
library(zoo) | |
library(dplyr) | |
# limit to official AB, define Date, HIT variables | |
retro_data %>% | |
filter(AB_FL == TRUE) %>% | |
mutate(Date = substr(GAME_ID, 4, 12)) %>% | |
mutate(HIT = ifelse(H_FL > 0, 1, 0)) -> d_AB | |
# sort data by game date and EVENT_ID | |
d_AB %>% | |
arrange(Date, EVENT_ID) -> d_AB | |
# collect the players with at least minAB AB | |
d_AB %>% group_by(BAT_ID) %>% | |
summarize(N = n(), .groups = "drop") %>% | |
dplyr::filter(N >= minAB) %>% | |
select(BAT_ID) %>% pull() %>% | |
as.character() -> p300 | |
d_AB_300 <- dplyr::filter(d_AB, BAT_ID %in% p300) | |
# main function -- finds the maximum number of AB | |
# where rolling AVG is at least .400 for one player | |
# outputs N and the AVG | |
myfunction <- function(playerid, df){ | |
max_mavg <- function(N){ | |
max(rollapply(y, N, mean, fill = NA), | |
na.rm = TRUE) | |
} | |
df %>% | |
dplyr::filter(BAT_ID == playerid) %>% | |
select(HIT) %>% pull() -> y | |
Nvalues <- 10:length(y) | |
AVG <- sapply(Nvalues, max_mavg) | |
Nmax <- max(Nvalues[AVG >= .400], na.rm = TRUE) | |
AVGmax <- max_mavg(Nmax) | |
c(Nmax, AVGmax) | |
} | |
# does this for all players (probably faster way | |
# of programming this) | |
out <- sapply(p300, myfunction, d_AB_300) | |
# organize results as a data frame | |
results <- data.frame(Player = p300, | |
N = out[1, ], | |
AVG = out[2, ]) | |
row.names(results) <- NULL | |
# outputs the top numtop | |
arrange(results, desc(N)) %>% head(numtop) | |
} | |
# Illustration of using function to find top stretches of .400 hitting | |
# for 2019 season where d2019 is the Retrosheet dataset for 2019 season | |
out6 <- best_streaks(d2019) | |
# Here is the output: | |
# Player N AVG | |
# 1 blacc001 170 0.4000000 | |
# 2 martk001 170 0.4000000 | |
# 3 bellc002 167 0.4011976 | |
# 4 arenn001 152 0.4013158 | |
# 5 branm003 150 0.4000000 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment