Skip to content

Instantly share code, notes, and snippets.

@bayesball
Last active July 5, 2020 18:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/cdf7f15be6d116d7035b8ba36957d82f to your computer and use it in GitHub Desktop.
Save bayesball/cdf7f15be6d116d7035b8ba36957d82f to your computer and use it in GitHub Desktop.
R function to find the longest stretches of .400 hitting among all batters in a single season
# Inputs to function best_streaks() are
# - retro_data - Retrosheet play-by-play data for a single season
# (this page describes how to download Retrosheet data for a single season)
# https://bayesball.github.io/VB/Getting_Retrosheet_Files.html
# - minAB - consider all hitters who have at least minAB at-bats that season
# - numtop - the number of top performers to display
# Note: the only special function I am using is rowapply() from the zoo package
best_streaks <- function(retro_data,
minAB = 300,
numtop = 5){
# load several packages
library(tidyverse)
library(zoo)
library(dplyr)
# limit to official AB, define Date, HIT variables
retro_data %>%
filter(AB_FL == TRUE) %>%
mutate(Date = substr(GAME_ID, 4, 12)) %>%
mutate(HIT = ifelse(H_FL > 0, 1, 0)) -> d_AB
# sort data by game date and EVENT_ID
d_AB %>%
arrange(Date, EVENT_ID) -> d_AB
# collect the players with at least minAB AB
d_AB %>% group_by(BAT_ID) %>%
summarize(N = n(), .groups = "drop") %>%
dplyr::filter(N >= minAB) %>%
select(BAT_ID) %>% pull() %>%
as.character() -> p300
d_AB_300 <- dplyr::filter(d_AB, BAT_ID %in% p300)
# main function -- finds the maximum number of AB
# where rolling AVG is at least .400 for one player
# outputs N and the AVG
myfunction <- function(playerid, df){
max_mavg <- function(N){
max(rollapply(y, N, mean, fill = NA),
na.rm = TRUE)
}
df %>%
dplyr::filter(BAT_ID == playerid) %>%
select(HIT) %>% pull() -> y
Nvalues <- 10:length(y)
AVG <- sapply(Nvalues, max_mavg)
Nmax <- max(Nvalues[AVG >= .400], na.rm = TRUE)
AVGmax <- max_mavg(Nmax)
c(Nmax, AVGmax)
}
# does this for all players (probably faster way
# of programming this)
out <- sapply(p300, myfunction, d_AB_300)
# organize results as a data frame
results <- data.frame(Player = p300,
N = out[1, ],
AVG = out[2, ])
row.names(results) <- NULL
# outputs the top numtop
arrange(results, desc(N)) %>% head(numtop)
}
# Illustration of using function to find top stretches of .400 hitting
# for 2019 season where d2019 is the Retrosheet dataset for 2019 season
out6 <- best_streaks(d2019)
# Here is the output:
# Player N AVG
# 1 blacc001 170 0.4000000
# 2 martk001 170 0.4000000
# 3 bellc002 167 0.4011976
# 4 arenn001 152 0.4013158
# 5 branm003 150 0.4000000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment