Created
January 16, 2022 15:52
-
-
Save bayesball/bbe54bfb5b1b56cd2024cd58da64e3ef to your computer and use it in GitHub Desktop.
R script to add variables to FanGraphs leaderboard data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Preparation work for FanGraphs leaderboard data.
# Collect the FanGraphs Leaderboard batting data:
#   choose Multiple Seasons (all), Split Seasons,
#   with a minimum of 100 PA in each season.
# Download the data and save it as "fgbatting.csv".
library(dplyr) | |
library(readr) | |
library(Lahman) | |
library(purrr) | |
# Load the FanGraphs leaderboard export and give three columns syntactic names.
# The columns are renamed by position (10, 11, 19); presumably these are the
# "BB%", "K%" and "wRC+" columns -- verify against the downloaded file.
fg <- read_csv("fgbatting.csv")
names(fg)[c(10, 11, 19)] <- c("BB_Pct", "K_Pct", "wRC_plus")

# Drop percent signs / commas from the two rate columns and store as numbers.
fg <- fg %>%
  mutate(BB_Pct = as.numeric(gsub("[\\%,]", "", BB_Pct)),
         K_Pct = as.numeric(gsub("[\\%,]", "", K_Pct)))

# Attach the key_bbref id from the Chadwick Bureau register
# (see https://github.com/chadwickbureau/register), matching the FanGraphs
# playerid against key_fangraphs.  Rows without a match are dropped.
people <- read_csv("people.csv")
fg <- fg %>%
  inner_join(people %>% select(key_bbref, key_fangraphs),
             by = c("playerid" = "key_fangraphs"))
# Career summary per player from Lahman's Batting table: the midpoint of the
# first and last seasons (midYear) and career plate appearances (cPA).
# PA = AB + BB + SH + SF + HBP.  Any component that is NA for a season is
# counted as zero, so a single missing SH / SF / HBP value no longer turns the
# player's whole career total into NA.
S <- Batting %>%
  mutate(PA = rowSums(cbind(AB, BB, SH, SF, HBP), na.rm = TRUE)) %>%
  group_by(playerID) %>%
  summarize(minYear = min(yearID),
            maxYear = max(yearID),
            midYear = (minYear + maxYear) / 2,
            cPA = sum(PA),
            .groups = "drop") %>%
  select(playerID, midYear, cPA)
# Look up the "baseball birth year" for one player.
#
# playerid: a single player id, matched against People$playerID.
# Returns a data frame with columns playerID and birthyear (zero rows when
# the id is not found).  Players born in July or later get birthYear + 1, so
# that a later `Season - birthyear` gives the age the player had reached by
# the end of June of that season.  birthyear is NA when the birth month or
# birth year is missing.
get_birthyear <- function(playerid) {
  # People is the current name of the Lahman table formerly called Master.
  # .env$ makes explicit that playerid is the function argument, not a column.
  People %>%
    filter(playerID == .env$playerid) %>%
    mutate(birthyear = ifelse(birthMonth >= 7,
                              birthYear + 1, birthYear)) %>%
    select(playerID, birthyear)
}
# Birth years for every distinct key_bbref id present in the FanGraphs data.
player_ids <- fg %>%
  pull(key_bbref) %>%
  unique()
S1 <- map_df(player_ids, get_birthyear)

# Combine the career summary (S) with the birth years (S1).
S2 <- S %>%
  inner_join(S1, by = "playerID")

# Bring the player-level summary onto the FanGraphs rows, then compute
# each row's age as season minus adjusted birth year.
fg <- fg %>%
  inner_join(S2, by = c("key_bbref" = "playerID")) %>%
  mutate(Age = Season - birthyear)
# Load the Baseball-Reference Hall of Fame list, saved locally from
# https://www.baseball-reference.com/awards/hof.shtml
hof <- read_csv(file = "bref_hof.csv")
# Pull the id out of a name field of the form "<name>\<id>".
# Splits on backslashes and returns the second piece, or NA when the input
# contains no backslash.
extract_name <- function(name) {
  pieces <- unlist(strsplit(name, "\\\\"))
  pieces[2]
}
# Derive a playerID for each Hall of Fame entry and flag it as inducted.
# vapply() always yields a plain, unnamed character vector with one id per
# name (including when hof has zero rows), unlike sapply().
hof$playerID <- vapply(hof$Name, extract_name, character(1),
                       USE.NAMES = FALSE)
hof$inducted <- "Y"

# Merge with the original dataset.  A left join keeps every FanGraphs row;
# inducted is "Y" for rows matching a Hall of Fame id and NA otherwise.
fg <- left_join(fg,
                select(hof, playerID, inducted),
                by = c("key_bbref" = "playerID"))

# Write csv and tab-delimited versions of the data.
# write_delim() separates fields with a single space unless told otherwise,
# so the tab delimiter has to be requested explicitly.
write_csv(fg, "fgbatting_complete.csv")
write_delim(fg, "fg_batting_leaderboard.txt", delim = "\t")
###################################### | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment