Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Plots trajectories of strikeout rates, home run rates, and hit-in-play rates
# for players with similar batting averages
# requires packages
# dplyr, Lahman, ggplot2
# some preliminary work
library(dplyr)
library(Lahman)
# Look up the adjusted birth year for one or more players in Master.
#
# A player born in July or later is assigned birthYear + 1, so that
# yearID - (adjusted birth year) gives the age the player has reached by the
# end of June of that season.
#
# Args:
#   player.id: vector of playerID values (a single id or many).
#
# Returns:
#   A vector with one element per entry of player.id, in the same order.
#   The element is NA when the id is not found in Master or when the birth
#   month or birth year is missing.
get.birthyear <- function(player.id){
  # match() keeps the result aligned with player.id; comparing with `==`
  # is only correct when player.id is a scalar or is exactly the
  # Master$playerID column in its original order.
  rows <- match(player.id, Master$playerID)
  birth.month <- Master$birthMonth[rows]
  birth.year <- Master$birthYear[rows]
  ifelse(birth.month >= 7, birth.year + 1, birth.year)
}
# Sum AB, H, SO and HR within each playerID / yearID pair.
Batting1 <- Batting %>%
  group_by(playerID, yearID) %>%
  summarize(AB = sum(AB),
            H = sum(H),
            SO = sum(SO),
            HR = sum(HR))

# Attach the adjusted birth year to every row of Master.
Master1 <- Master %>%
  mutate(BirthYear = get.birthyear(playerID))

# Bring BirthYear onto the season totals and derive the age for each season.
Batting1 <- Batting1 %>%
  inner_join(select(Master1, playerID, BirthYear),
             by = "playerID") %>%
  mutate(Age = yearID - BirthYear)

# One row per player: first and last season, their midpoint, and the career
# AB, H and batting average (rounded to three decimals).
S <- Batting1 %>%
  group_by(playerID) %>%
  summarize(MinYear = min(yearID),
            MaxYear = max(yearID),
            MidCareer = (MinYear + MaxYear) / 2,
            CareerAB = sum(AB),
            CareerH = sum(H),
            CareerAVG = round(CareerH / CareerAB, 3))

# Copy the career-level columns back onto every season row.
Batting1 <- Batting1 %>%
  inner_join(select(S, playerID, MidCareer, CareerAB, CareerAVG),
             by = "playerID")
# Look at players with similar career batting averages and a minimum
# number of career AB in a particular era, and plot the age trajectories of
#   SO Rate  = SO / AB,
#   HR Rate  = HR / (AB - SO), and
#   HIP Rate = (H - HR) / (AB - SO - HR).
# The idea is that the SO rates should be more stable over time than
# the HIP rates.
# By default, find players with at least 3000 career AB and a career AVG of
# .300 whose midcareer year is close to the target year that is input.
# Allow errors of 4 years for midcareer and .002 for the target career AVG.
# Plot rate trajectories by age for players with similar career profiles.
#
# Selects the players in S whose MidCareer is within Career_eps of
# target.year, whose CareerAVG is within AVG_eps of AVG_target, and whose
# CareerAB is at least Career_target. For those players it prints three
# plots against Age, faceted by player name:
#   Strikeout Rates    SO / AB
#   Home Run Rates     HR / (AB - SO)
#   Hit-In-Play Rates  (H - HR) / (AB - SO - HR)
#
# Args:
#   target.year:   target midcareer year.
#   Career_eps:    allowed distance between MidCareer and target.year.
#   AVG_target:    target career batting average.
#   AVG_eps:       allowed distance between CareerAVG and AVG_target.
#   Career_target: minimum number of career AB.
#
# Returns:
#   Invisibly NULL when no player matches (a message is written with cat());
#   otherwise the invisible result of printing the last plot.
#
# Relies on the file-level objects S, Batting1 and Master.
compare_rates <- function(target.year, Career_eps = 4,
                          AVG_target = .300,
                          AVG_eps = .002,
                          Career_target = 3000){
  # library() stops with a clear error if ggplot2 is not installed, whereas
  # require() would only return FALSE and let the ggplot() calls fail later.
  library(ggplot2)

  S_select <- filter(S,
                     abs(MidCareer - target.year) <= Career_eps,
                     abs(CareerAVG - AVG_target) <= AVG_eps,
                     CareerAB >= Career_target)
  Batting_select <- filter(Batting1,
                           playerID %in% S_select$playerID)
  Batting_select <- inner_join(Batting_select,
                               select(Master, playerID, nameFirst, nameLast),
                               by = "playerID")
  Batting_select <- mutate(Batting_select,
                           Name = paste(nameFirst, nameLast))

  # Nothing to plot: report it on its own line and stop here.
  if (nrow(Batting_select) == 0) {
    cat("No players matched the criteria.\n")
    return(invisible(NULL))
  }

  # All three plots share the same layers; only the y mapping and the title
  # differ.
  rate_plot <- function(mapping, title) {
    ggplot(Batting_select, mapping) +
      geom_point(color = "red") + geom_smooth(se = FALSE) +
      ggtitle(title) +
      facet_wrap(~ Name, ncol = 2)
  }

  plot1 <- rate_plot(aes(Age, SO / AB), "Strikeout Rates")
  plot2 <- rate_plot(aes(Age, HR / (AB - SO)), "Home Run Rates")
  plot3 <- rate_plot(aes(Age, (H - HR) / (AB - SO - HR)),
                     "Hit-In-Play Rates")

  print(plot1)
  print(plot2)
  print(plot3)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment