Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created July 1, 2016 00:40
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/d555466d9de52d57437787d0b3bfa036 to your computer and use it in GitHub Desktop.
Save bayesball/d555466d9de52d57437787d0b3bfa036 to your computer and use it in GitHub Desktop.
Illustrating broom package using career trajectory of home run rates
# read in Lahman batting and master files
# can also use Lahman package -- data is only through 2014 season
Batting <- read.csv("~/OneDriveBusiness/lahman-csv_2015-01-24/Batting.csv")
Master <- read.csv("~/OneDriveBusiness/lahman-csv_2015-01-24/Master.csv")
# find players with at least 500 career homes (through 2015)
library(dplyr)
S <- summarize(group_by(Batting, playerID),
HR=sum(HR))
Names <- unique(filter(S, HR >= 500)$playerID)
Batting500 <- filter(Batting, playerID %in% Names)
S500 <- summarize(group_by(Batting500, playerID, yearID),
HR=sum(HR), AB=sum(AB))
NAMES <- select(filter(Master, playerID %in% Names),
playerID, nameFirst, nameLast)
mutate(NAMES, Name=paste(nameFirst, nameLast)) %>%
select(playerID, Name) -> NAMES
S500 <- inner_join(S500, NAMES, by="playerID")
# add ages
get.birthyear <- function(player.id){
playerline <- subset(Master, playerID == player.id)
birthyear <- playerline$birthYear
birthmonth <- playerline$birthMonth
ifelse(birthmonth >= 7, birthyear + 1, birthyear)
}
Birthyears <- data.frame(Player=Names,
Birthyear=sapply(as.character(Names), get.birthyear))
S500 <- inner_join(S500, Birthyears,
by=c("playerID"="Player"))
S500 <- mutate(S500, Age=yearID - Birthyear)
# illustrate broom package
library(broom)
library(ggplot2)
library(ggthemes)
# tidy function
regressions <- S500 %>% group_by(Name) %>%
do(tidy(lm(I(HR / AB) ~ Age + I(Age ^ 2), data=.)))
S <- summarize(group_by(regressions, Name),
peak_age=- estimate[2] / 2 / estimate[3])
ggplot(S, aes(Name, peak_age)) + geom_point() +
coord_flip() + ylim(20, 40) +
ggtitle("Age of Peak Performance of Home Run Rate")
# augment function
individual <- S500 %>% group_by(Name) %>%
do(augment(lm(I(HR / AB) ~ Age + I(Age ^ 2), data=.)))
ggplot(individual, aes(Age, .fitted)) +
geom_line(color="red", size=1.5) +
facet_wrap(~ Name) +
geom_point(data=S500, aes(Age, HR / AB), color="blue") +
ggtitle("Career Trajectories of the 500 Home Run Club") +
ylim(.03, .13) +
theme_fivethirtyeight() +
xlab("AGE") + ylab("HOME RUN RATE")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment