Skip to content

Instantly share code, notes, and snippets.

@bayesball

bayesball/HOFwork.R

Created Feb 25, 2018
Embed
What would you like to do?
R script for Hall of Fame voting study
# add a theme element that will be used for titles
TH <- theme(
plot.title = element_text(
colour = "blue",
size = 18,
hjust = 0.5,
vjust = 0.8,
angle = 0
)
)
# read in datasets
library(tidyverse)
library(Lahman)
# focus on players who get votes by BBWAA
# and who received votes in 1960 or later
HOF1960 <- filter(HallOfFame, yearID >= 1960,
votedBy == "BBWAA")
# list of players voted for since 1960 (894 of these)
player_list <- unique(HOF1960$playerID)
# how many years on the ballot?
HallOfFame %>% filter(playerID %in% player_list,
votedBy == "BBWAA") %>%
group_by(playerID) %>% count() -> S
# collect list of players who have been on ballot
# for 5 or more years (183 in list)
player_list_5 <- filter(S, n >= 5)$playerID
HallOfFame %>% filter(votedBy == "BBWAA",
playerID %in% player_list_5) ->
HOF1
# restrict attention to the players who get at
# least 20% of the vote the first year
HOF1 %>% group_by(playerID) %>%
summarize(V1 = first(votes) / first(ballots)) -> F
inner_join(HOF1, F) %>% filter(V1 >= 0.20) ->
HOF2
# add the player's last name to data frame
inner_join(HOF2,
select(Master, playerID, nameLast)) -> HOF2
# first graph
ggplot(HOF2, aes(yearID, votes / ballots,
color=nameLast)) +
geom_line() +
geom_hline(yintercept = .75,
size=1.5) + TH +
ggtitle("Voting Trajectories")
# replace points by best fit lines
ggplot(HOF2, aes(yearID, votes / ballots,
color=nameLast)) +
geom_smooth(method="lm") +
geom_hline(yintercept = .75,
size=1.5) + TH +
ggtitle("Least-Squares Fits")
# add Initial Year of voting variable to data frame
HOF2 %>% group_by(playerID) %>%
mutate(InitialYear = first(yearID)) %>%
select(playerID, InitialYear) -> SH
inner_join(HOF2, SH, by="playerID") -> HOF3
# collect regression fits for players
library(broom)
regressions <- HOF3 %>% group_by(playerID) %>%
do(tidy(lm(I(votes / ballots) ~
I(yearID - InitialYear),
data=.)))
# reformat data so I can plot intercepts and slopes
R <- spread(regressions[, 1:3], term, estimate)
names(R)[2:3] <- c("Intercept", "Slope")
inner_join(R, select(Master, playerID, nameLast)) -> R
# add variable that indicates if player is currently in
# HOF
InHOF <- filter(HOF1, inducted == "Y")
R %>%
mutate(HOF = ifelse(playerID %in% InHOF$playerID,
"YES", "NO")) -> R
# plot intercepts and slopes
library(ggrepel)
ggplot(R, aes(Intercept, Slope,
label=nameLast,
color=HOF)) +
geom_label_repel() + TH +
ggtitle("Plot of Initial Percentage and Slope")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment