Created
July 1, 2016 00:40
-
-
Save bayesball/d555466d9de52d57437787d0b3bfa036 to your computer and use it in GitHub Desktop.
Illustrating broom package using career trajectory of home run rates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# read in Lahman batting and master files | |
# can also use Lahman package -- data is only through 2014 season | |
Batting <- read.csv("~/OneDriveBusiness/lahman-csv_2015-01-24/Batting.csv") | |
Master <- read.csv("~/OneDriveBusiness/lahman-csv_2015-01-24/Master.csv") | |
# find players with at least 500 career homes (through 2015) | |
library(dplyr) | |
S <- summarize(group_by(Batting, playerID), | |
HR=sum(HR)) | |
Names <- unique(filter(S, HR >= 500)$playerID) | |
Batting500 <- filter(Batting, playerID %in% Names) | |
S500 <- summarize(group_by(Batting500, playerID, yearID), | |
HR=sum(HR), AB=sum(AB)) | |
NAMES <- select(filter(Master, playerID %in% Names), | |
playerID, nameFirst, nameLast) | |
mutate(NAMES, Name=paste(nameFirst, nameLast)) %>% | |
select(playerID, Name) -> NAMES | |
S500 <- inner_join(S500, NAMES, by="playerID") | |
# add ages | |
get.birthyear <- function(player.id){ | |
playerline <- subset(Master, playerID == player.id) | |
birthyear <- playerline$birthYear | |
birthmonth <- playerline$birthMonth | |
ifelse(birthmonth >= 7, birthyear + 1, birthyear) | |
} | |
Birthyears <- data.frame(Player=Names, | |
Birthyear=sapply(as.character(Names), get.birthyear)) | |
S500 <- inner_join(S500, Birthyears, | |
by=c("playerID"="Player")) | |
S500 <- mutate(S500, Age=yearID - Birthyear) | |
# illustrate broom package | |
library(broom) | |
library(ggplot2) | |
library(ggthemes) | |
# tidy function | |
regressions <- S500 %>% group_by(Name) %>% | |
do(tidy(lm(I(HR / AB) ~ Age + I(Age ^ 2), data=.))) | |
S <- summarize(group_by(regressions, Name), | |
peak_age=- estimate[2] / 2 / estimate[3]) | |
ggplot(S, aes(Name, peak_age)) + geom_point() + | |
coord_flip() + ylim(20, 40) + | |
ggtitle("Age of Peak Performance of Home Run Rate") | |
# augment function | |
individual <- S500 %>% group_by(Name) %>% | |
do(augment(lm(I(HR / AB) ~ Age + I(Age ^ 2), data=.))) | |
ggplot(individual, aes(Age, .fitted)) + | |
geom_line(color="red", size=1.5) + | |
facet_wrap(~ Name) + | |
geom_point(data=S500, aes(Age, HR / AB), color="blue") + | |
ggtitle("Career Trajectories of the 500 Home Run Club") + | |
ylim(.03, .13) + | |
theme_fivethirtyeight() + | |
xlab("AGE") + ylab("HOME RUN RATE") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment