Created
April 24, 2022 17:44
-
-
Save bayesball/acd412f8cd4f2e70aa85207a8ae84eae to your computer and use it in GitHub Desktop.
R script for 3000 hitting club exploration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# read in three packages | |
library(readr) | |
library(dplyr) | |
library(ggplot2) | |
# helper ggplot2 function | |
helper_ggplot <- function(){ | |
theme(text=element_text(size=18)) + | |
theme(plot.title = element_text(colour = "blue", | |
size = 18, | |
hjust = 0.5, | |
vjust = 0.8, | |
angle = 0)) | |
} | |
# read in Lahman datasets (can also use datasets in Lahman package) | |
Batting <- read_csv("~/Dropbox/Google Drive/Lahman/core/Batting.csv") | |
People <- read_csv("~/Dropbox/Google Drive/Lahman/core/People.csv") | |
# identify players with at least 3000 career hits | |
Batting %>% | |
group_by(playerID) %>% | |
summarize(H = sum(H), | |
.groups = "drop") %>% | |
filter(H >= 3000) %>% | |
pull(playerID) -> players_3000 | |
# add Cabrera to list | |
People %>% | |
filter(nameLast == "Cabrera", | |
nameFirst == "Miguel") %>% | |
pull(playerID) -> mc_id | |
players_3000 <- c(players_3000, mc_id) | |
# collect season data including ages | |
Batting %>% | |
filter(playerID %in% players_3000) %>% | |
group_by(playerID, yearID) %>% | |
summarize(H = sum(H), | |
AB = sum(AB), | |
BA = H / AB, | |
.groups = "drop") -> S3000 | |
# create dataset of players in 3000 hit club | |
People %>% | |
select(playerID, nameFirst, nameLast, | |
birthYear, birthMonth) %>% | |
inner_join(S3000, by = "playerID") -> S3000 | |
# add ages | |
S3000 %>% | |
mutate(birthyear = ifelse(birthMonth >= 7, | |
birthYear + 1, birthYear), | |
Age = yearID - birthyear) -> S3000 | |
# initial graph -- too much plotting! | |
ggplot(S3000, aes(Age, BA, color = playerID)) + | |
geom_point() | |
# try plotting smooths | |
ggplot(S3000, aes(Age, BA, color = playerID)) + | |
geom_smooth(se = FALSE) | |
# separate into two data sets by midseason | |
S3000 %>% | |
group_by(playerID) %>% | |
mutate(MidYear = (max(yearID) + | |
min(yearID)) / 2) %>% | |
select(playerID, MidYear) -> S_midyear | |
inner_join(S3000, S_midyear, | |
by = "playerID") -> S3000 | |
# smooths over two groups of players | |
ggplot(filter(S3000, MidYear <= 1976, | |
AB >= 300), | |
aes(Age, BA, color = playerID)) + | |
geom_smooth(se = FALSE, | |
method = "loess", | |
formula = "y ~ x") | |
ggplot(filter(S3000, MidYear > 1976, | |
AB >= 300), | |
aes(Age, BA, color = playerID)) + | |
geom_smooth(se = FALSE, | |
method = "loess", | |
formula = "y ~ x") | |
# look at each trajectory and categorize | |
# by the shape of the smooth | |
# add this information to my main dataset | |
plot_single_trajectory <- function(j){ | |
ggplot(filter(S3000, | |
playerID == players_3000[j] , | |
AB >= 300), | |
aes(Age, BA)) + | |
geom_smooth(se = FALSE, | |
method = "loess", | |
formula = "y ~ x") + | |
ggtitle(players_3000[j]) + | |
helper_ggplot() | |
} | |
plot_single_trajectory(1) | |
# I estimated the peak age and the type of | |
# trajectory for each player in this group | |
peak_ages <- c(25, 27, 33, 30, 27, 27, 33, | |
30, 32, 25, 25, 35, 25, | |
25, 23, 27, | |
26, 35, 27, 27, 27, 25, | |
35, 21, 30, 35, 30, 30, 25, | |
29, 25, 30, 30) | |
desc <- c("E", "E", "L", "M", "E", "E", "L", "M", | |
"L", "E", "E", "L", "L", "M", "E", "E", | |
"E", "L", "E", "E", "E", "E", "L", "E", | |
"M", "L", "E", "M", "E", "M", "E", "M", | |
"M") | |
Players <- data.frame(playerID = players_3000, | |
Peak_Age = peak_ages, | |
Type = desc) | |
inner_join(S3000, Players, | |
by = "playerID") -> S3000 | |
############### six graphs | |
ggplot(filter(S3000, | |
Type == "E", | |
MidYear <= 1976, | |
AB >= 300), | |
aes(Age, BA, color = nameLast)) + | |
geom_smooth(se = FALSE, | |
method = "loess", | |
formula = "y ~ x") + | |
ggtitle("Early Bloomers - MidCareer <= 1976") + | |
helper_ggplot() | |
ggplot(filter(S3000, | |
Type == "E", | |
MidYear > 1976, | |
AB >= 300), | |
aes(Age, BA, color = nameLast)) + | |
geom_smooth(se = FALSE, | |
method = "loess", | |
formula = "y ~ x") + | |
ggtitle("Early Bloomers - MidCareer > 1976") + | |
helper_ggplot() | |
ggplot(filter(S3000, | |
Type == "L", | |
MidYear <= 1976, | |
AB >= 300), | |
aes(Age, BA, color = nameLast)) + | |
geom_smooth(se = FALSE, | |
method = "loess", | |
formula = "y ~ x") + | |
ggtitle("Late Bloomers - MidCareer <= 1976") + | |
helper_ggplot() | |
ggplot(filter(S3000, | |
Type == "L", | |
MidYear > 1976, | |
AB >= 300), | |
aes(Age, BA, color = nameLast)) + | |
geom_smooth(se = FALSE, | |
method = "loess", | |
formula = "y ~ x") + | |
ggtitle("Late Bloomers - MidCareer > 1976") + | |
helper_ggplot() | |
ggplot(filter(S3000, | |
Type == "M", | |
MidYear <= 1976, | |
AB >= 300), | |
aes(Age, BA, color = nameLast)) + | |
geom_smooth(se = FALSE, | |
method = "loess", | |
formula = "y ~ x") + | |
ggtitle("Common Trajectory - MidCareer <= 1976") + | |
helper_ggplot() | |
ggplot(filter(S3000, | |
Type == "M", | |
MidYear > 1976, | |
AB >= 300), | |
aes(Age, BA, color = nameLast)) + | |
geom_smooth(se = FALSE, | |
method = "loess", | |
formula = "y ~ x") + | |
ggtitle("Common Trajectory - MidCareer > 1976") + | |
helper_ggplot() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment