Skip to content

Instantly share code, notes, and snippets.

@bayesball
Created April 24, 2022 17:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bayesball/acd412f8cd4f2e70aa85207a8ae84eae to your computer and use it in GitHub Desktop.
Save bayesball/acd412f8cd4f2e70aa85207a8ae84eae to your computer and use it in GitHub Desktop.
R script for 3000 hitting club exploration
# read in three packages
library(readr)
library(dplyr)
library(ggplot2)
# helper ggplot2 function
helper_ggplot <- function(){
theme(text=element_text(size=18)) +
theme(plot.title = element_text(colour = "blue",
size = 18,
hjust = 0.5,
vjust = 0.8,
angle = 0))
}
# read in Lahman datasets (can also use datasets in Lahman package)
Batting <- read_csv("~/Dropbox/Google Drive/Lahman/core/Batting.csv")
People <- read_csv("~/Dropbox/Google Drive/Lahman/core/People.csv")
# identify players with at least 3000 career hits
Batting %>%
group_by(playerID) %>%
summarize(H = sum(H),
.groups = "drop") %>%
filter(H >= 3000) %>%
pull(playerID) -> players_3000
# add Cabrera to list
People %>%
filter(nameLast == "Cabrera",
nameFirst == "Miguel") %>%
pull(playerID) -> mc_id
players_3000 <- c(players_3000, mc_id)
# collect season data including ages
Batting %>%
filter(playerID %in% players_3000) %>%
group_by(playerID, yearID) %>%
summarize(H = sum(H),
AB = sum(AB),
BA = H / AB,
.groups = "drop") -> S3000
# create dataset of players in 3000 hit club
People %>%
select(playerID, nameFirst, nameLast,
birthYear, birthMonth) %>%
inner_join(S3000, by = "playerID") -> S3000
# add ages
S3000 %>%
mutate(birthyear = ifelse(birthMonth >= 7,
birthYear + 1, birthYear),
Age = yearID - birthyear) -> S3000
# initial graph -- too much plotting!
ggplot(S3000, aes(Age, BA, color = playerID)) +
geom_point()
# try plotting smooths
ggplot(S3000, aes(Age, BA, color = playerID)) +
geom_smooth(se = FALSE)
# separate into two data sets by midseason
S3000 %>%
group_by(playerID) %>%
mutate(MidYear = (max(yearID) +
min(yearID)) / 2) %>%
select(playerID, MidYear) -> S_midyear
inner_join(S3000, S_midyear,
by = "playerID") -> S3000
# smooths over two groups of players
ggplot(filter(S3000, MidYear <= 1976,
AB >= 300),
aes(Age, BA, color = playerID)) +
geom_smooth(se = FALSE,
method = "loess",
formula = "y ~ x")
ggplot(filter(S3000, MidYear > 1976,
AB >= 300),
aes(Age, BA, color = playerID)) +
geom_smooth(se = FALSE,
method = "loess",
formula = "y ~ x")
# look at each trajectory and categorize
# by the shape of the smooth
# add this information to my main dataset
plot_single_trajectory <- function(j){
ggplot(filter(S3000,
playerID == players_3000[j] ,
AB >= 300),
aes(Age, BA)) +
geom_smooth(se = FALSE,
method = "loess",
formula = "y ~ x") +
ggtitle(players_3000[j]) +
helper_ggplot()
}
plot_single_trajectory(1)
# I estimated the peak age and the type of
# trajectory for each player in this group
peak_ages <- c(25, 27, 33, 30, 27, 27, 33,
30, 32, 25, 25, 35, 25,
25, 23, 27,
26, 35, 27, 27, 27, 25,
35, 21, 30, 35, 30, 30, 25,
29, 25, 30, 30)
desc <- c("E", "E", "L", "M", "E", "E", "L", "M",
"L", "E", "E", "L", "L", "M", "E", "E",
"E", "L", "E", "E", "E", "E", "L", "E",
"M", "L", "E", "M", "E", "M", "E", "M",
"M")
Players <- data.frame(playerID = players_3000,
Peak_Age = peak_ages,
Type = desc)
inner_join(S3000, Players,
by = "playerID") -> S3000
############### six graphs
ggplot(filter(S3000,
Type == "E",
MidYear <= 1976,
AB >= 300),
aes(Age, BA, color = nameLast)) +
geom_smooth(se = FALSE,
method = "loess",
formula = "y ~ x") +
ggtitle("Early Bloomers - MidCareer <= 1976") +
helper_ggplot()
ggplot(filter(S3000,
Type == "E",
MidYear > 1976,
AB >= 300),
aes(Age, BA, color = nameLast)) +
geom_smooth(se = FALSE,
method = "loess",
formula = "y ~ x") +
ggtitle("Early Bloomers - MidCareer > 1976") +
helper_ggplot()
ggplot(filter(S3000,
Type == "L",
MidYear <= 1976,
AB >= 300),
aes(Age, BA, color = nameLast)) +
geom_smooth(se = FALSE,
method = "loess",
formula = "y ~ x") +
ggtitle("Late Bloomers - MidCareer <= 1976") +
helper_ggplot()
ggplot(filter(S3000,
Type == "L",
MidYear > 1976,
AB >= 300),
aes(Age, BA, color = nameLast)) +
geom_smooth(se = FALSE,
method = "loess",
formula = "y ~ x") +
ggtitle("Late Bloomers - MidCareer > 1976") +
helper_ggplot()
ggplot(filter(S3000,
Type == "M",
MidYear <= 1976,
AB >= 300),
aes(Age, BA, color = nameLast)) +
geom_smooth(se = FALSE,
method = "loess",
formula = "y ~ x") +
ggtitle("Common Trajectory - MidCareer <= 1976") +
helper_ggplot()
ggplot(filter(S3000,
Type == "M",
MidYear > 1976,
AB >= 300),
aes(Age, BA, color = nameLast)) +
geom_smooth(se = FALSE,
method = "loess",
formula = "y ~ x") +
ggtitle("Common Trajectory - MidCareer > 1976") +
helper_ggplot()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment