Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created January 16, 2022 15:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/bbe54bfb5b1b56cd2024cd58da64e3ef to your computer and use it in GitHub Desktop.
Save bayesball/bbe54bfb5b1b56cd2024cd58da64e3ef to your computer and use it in GitHub Desktop.
R script to add variables to FanGraphs leaderboard data
# Preparation work for FanGraphs leaderboard data.
# Collect FanGraphs Leaderboard batting data:
#   choose Multiple Seasons (all), Split Seasons,
#   minimum 100 PA in each season.
# Download the data and save it as "fgbatting.csv".
library(dplyr)
library(readr)
library(Lahman)
library(purrr)
# Load the FanGraphs export and tidy it up.
# Columns 10, 11 and 19 are renamed by position to syntactic names
# (NOTE(review): presumably the "BB%", "K%" and "wRC+" columns --
# confirm against the downloaded file).  The two percentage columns
# then have percent signs and commas removed and are made numeric.
fg <- read_csv("fgbatting.csv")
names(fg)[c(10, 11, 19)] <- c("BB_Pct", "K_Pct", "wRC_plus")
fg <- fg %>%
  mutate(across(c(BB_Pct, K_Pct), ~ as.numeric(gsub("[\\%,]", "", .x))))
# Attach Baseball Reference ids (used further down as Lahman playerIDs)
# from the Chadwick Bureau register,
# see https://github.com/chadwickbureau/register
# Only FanGraphs rows whose playerid is found in the register are kept.
people <- read_csv("people.csv")
fg <- fg %>%
  inner_join(
    people %>% select(key_bbref, key_fangraphs),
    by = c("playerid" = "key_fangraphs")
  )
# Career summary per player from Lahman's Batting table:
#   midYear - midpoint of the first and last season with a Batting row
#   cPA     - career plate appearances (AB + BB + HBP + SH + SF)
# Lahman records NA for counting stats that were not tracked in a given
# era (not only SF, but also SH and HBP in early seasons).  A single NA
# would turn PA, and then the whole career sum, into NA, so every
# component is treated as 0 when it is missing.
S <- Batting %>%
  mutate(
    PA = coalesce(AB, 0L) + coalesce(BB, 0L) + coalesce(HBP, 0L) +
      coalesce(SH, 0L) + coalesce(SF, 0L)
  ) %>%
  group_by(playerID) %>%
  summarize(
    minYear = min(yearID),
    maxYear = max(yearID),
    midYear = (minYear + maxYear) / 2,
    cPA = sum(PA),
    .groups = "drop"
  ) %>%
  select(playerID, midYear, cPA)
# collect birthyears
# Look up the adjusted birth year of one or more players.
#   playerid: character vector (a single id works too) of Lahman
#             playerID values
#   returns:  data frame with columns playerID and birthyear; ids not
#             found in Lahman contribute no rows
# Players born in July or later are assigned the following year, so
# Season - birthyear is the age reached by June 30 of that season.
# birthyear is NA when birthMonth is missing.
# Reads Lahman's People table; `Master` is the legacy name of the same
# table and is not shipped by newer Lahman releases.
get_birthyear <- function(playerid) {
  People %>%
    filter(playerID %in% playerid) %>%
    mutate(birthyear = ifelse(birthMonth >= 7,
                              birthYear + 1, birthYear)) %>%
    select(playerID, birthyear)
}
# one birth-year lookup per distinct player, stacked into one data frame
player_ids <- unique(fg[["key_bbref"]])
S1 <- bind_rows(map(player_ids, get_birthyear))
# combine birth years with the career summary (players present in both)
S2 <- S %>%
  inner_join(S1, by = "playerID")
# attach the combined summary to the FanGraphs rows
fg <- fg %>%
  inner_join(S2, by = c("key_bbref" = "playerID"))
# season age = Season minus the adjusted birth year joined in earlier
fg <- mutate(fg, Age = Season - birthyear)
# read in the Baseball Reference Hall of Fame list, previously
# downloaded and saved as "bref_hof.csv" from
# https://www.baseball-reference.com/awards/hof.shtml
# NOTE(review): the code that follows splits hof$Name on a backslash
# and keeps the second piece as the player id -- confirm the export
# still uses that "name\id" form
hof <- read_csv("bref_hof.csv")
# Pull the player id out of a "name\id" style string.
#   name:    character vector; each element is split on the backslash
#   returns: character vector the same length as `name` holding the
#            second piece of each element (NA where there is none)
# The split is applied per element, so a whole column can be passed in
# one call; a single string gives the same result as before.
extract_name <- function(name){
  pieces <- strsplit(name, "\\\\")
  vapply(pieces, function(p) p[2], character(1), USE.NAMES = FALSE)
}
# player id for every HOF entry; vapply() always yields a plain,
# unnamed character vector (sapply() would return a list for an empty
# table and would carry the names along as vector names)
hof$playerID <- vapply(hof$Name, extract_name, character(1),
                       USE.NAMES = FALSE)
# constant flag, so that after a left join non-members show up as NA
hof$inducted <- "Y"
# Flag Hall of Famers in the main data: rows whose key_bbref matches a
# HOF playerID get inducted = "Y"; all other rows are kept with
# inducted = NA
fg <- fg %>%
  left_join(
    hof %>% select(playerID, inducted),
    by = c("key_bbref" = "playerID")
  )
# write csv and space-delimited versions of the data
# (write_delim() separates fields with a single space unless `delim`
# is given; the delimiter is spelled out here so the file format is
# explicit -- use delim = "\t" instead if a tab-separated file is
# wanted)
write_csv(fg, "fgbatting_complete.csv")
write_delim(fg, "fg_batting_leaderboard.txt", delim = " ")
######################################
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment