Created
February 25, 2018 19:03
-
-
Save bayesball/117169c69685a041251d153d271279b9 to your computer and use it in GitHub Desktop.
R script for Hall of Fame voting study
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# add a theme element that will be used for titles | |
TH <- theme( | |
plot.title = element_text( | |
colour = "blue", | |
size = 18, | |
hjust = 0.5, | |
vjust = 0.8, | |
angle = 0 | |
) | |
) | |
# read in datasets | |
library(tidyverse) | |
library(Lahman) | |
# focus on players who get votes by BBWAA | |
# and who received votes in 1960 or later | |
HOF1960 <- filter(HallOfFame, yearID >= 1960, | |
votedBy == "BBWAA") | |
# list of players voted for since 1960 (894 of these) | |
player_list <- unique(HOF1960$playerID) | |
# how many years on the ballot? | |
HallOfFame %>% filter(playerID %in% player_list, | |
votedBy == "BBWAA") %>% | |
group_by(playerID) %>% count() -> S | |
# collect list of players who have been on ballot | |
# for 5 or more years (183 in list) | |
player_list_5 <- filter(S, n >= 5)$playerID | |
HallOfFame %>% filter(votedBy == "BBWAA", | |
playerID %in% player_list_5) -> | |
HOF1 | |
# restrict attention to the players who get at | |
# least 20% of the vote the first year | |
HOF1 %>% group_by(playerID) %>% | |
summarize(V1 = first(votes) / first(ballots)) -> F | |
inner_join(HOF1, F) %>% filter(V1 >= 0.20) -> | |
HOF2 | |
# add the player's last name to data frame | |
inner_join(HOF2, | |
select(Master, playerID, nameLast)) -> HOF2 | |
# first graph | |
ggplot(HOF2, aes(yearID, votes / ballots, | |
color=nameLast)) + | |
geom_line() + | |
geom_hline(yintercept = .75, | |
size=1.5) + TH + | |
ggtitle("Voting Trajectories") | |
# replace points by best fit lines | |
ggplot(HOF2, aes(yearID, votes / ballots, | |
color=nameLast)) + | |
geom_smooth(method="lm") + | |
geom_hline(yintercept = .75, | |
size=1.5) + TH + | |
ggtitle("Least-Squares Fits") | |
# add Initial Year of voting variable to data frame | |
HOF2 %>% group_by(playerID) %>% | |
mutate(InitialYear = first(yearID)) %>% | |
select(playerID, InitialYear) -> SH | |
inner_join(HOF2, SH, by="playerID") -> HOF3 | |
# collect regression fits for players | |
library(broom) | |
regressions <- HOF3 %>% group_by(playerID) %>% | |
do(tidy(lm(I(votes / ballots) ~ | |
I(yearID - InitialYear), | |
data=.))) | |
# reformat data so I can plot intercepts and slopes | |
R <- spread(regressions[, 1:3], term, estimate) | |
names(R)[2:3] <- c("Intercept", "Slope") | |
inner_join(R, select(Master, playerID, nameLast)) -> R | |
# add variable that indicates if player is currently in | |
# HOF | |
InHOF <- filter(HOF1, inducted == "Y") | |
R %>% | |
mutate(HOF = ifelse(playerID %in% InHOF$playerID, | |
"YES", "NO")) -> R | |
# plot intercepts and slopes | |
library(ggrepel) | |
ggplot(R, aes(Intercept, Slope, | |
label=nameLast, | |
color=HOF)) + | |
geom_label_repel() + TH + | |
ggtitle("Plot of Initial Percentage and Slope") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment