bayesball/player_hr_improvement.R

## player_hr_improvement.R
# data frame scip reads all Statcast play by play for the 2015 through 2019 seasons

scip <- read_csv("~/Dropbox/2016 WORK/BLOG Baseball R/OTHER/StatcastData/5_seasons_inplay_b.csv")

# function all_work() does all the computation

# inputs are season1 - first season, season2 - second season, minBIP - min balls in play for both seasons

# two outputs:  p is the ggplot2 representation of the graph of the changes in HR production
#               d contains the data frame with the calculations for all players

all_work <- function(season1, season2,
                     minBIP = 300){
  require(tidyverse)
  require(mgcv)
  require(CalledStrike)
  # function to implement the GAM random
  # effects model
  two_var_fit_re <- function(df){
    gam(HR ~ s(launch_angle, launch_speed) +
          s(Batter, bs = "re"),
        family = binomial,
        data = df) -> fit
  }
  # find players with at least 300 BIP
  # for both seasons
  scip %>%
    filter(Season == season1) %>%
    group_by(batter) %>%
    summarize(BIP = n()) %>%
    filter(BIP >= minBIP) %>%
    select(batter) %>% pull() -> players300a

  scip %>%
    filter(Season == season2) %>%
    group_by(batter) %>%
    summarize(BIP = n()) %>%
    filter(BIP >= minBIP) %>%
    select(batter) %>% pull() -> players300b

  players300 <- intersect(players300a,
                         players300b)
  N <- length(players300)

  # create data frame of data for those players
  # for both seasons
  df300_twoyears <- filter(scip,
                     batter %in% players300,
          Season %in% c(season1, season2)) %>%
          mutate(Batter = factor(batter))

  df300_season1 <- filter(df300_twoyears,
                     Season == season1)

  # run GAM fit to first season data
  fit1 <- two_var_fit_re(df300_season1)
  # extract the random effects
  beta <- coef(fit1)
  p <- length(beta)
  REFF <- data.frame(
    Batter = levels(df300_season1$Batter),
    reff = coef(fit1)[(p - N + 1):p])
  row.names(REFF) <- NULL

  # merge these season1 random effects
  # with the season 2 dataset
  df300_season2 <- filter(df300_twoyears,
                     Season == season2) %>%
  select(Batter, HR, launch_angle,
         launch_speed) %>%
  inner_join(REFF, by = c("Batter")) ->
    df300_season2

  # use season1 fit to predict HR probability
  # for each BIP in season2
  df300_season2$Prob <- predict(fit1,
        newdata = df300_season2,
        type = "response")

  # for each player, compute expected HR
  # for single BIP in second season
  df300_season2 %>%
    group_by(Batter) %>%
    summarize(RE = first(reff),
              BIP2 = n(),
            HR2 = sum(HR),
            E1 = sum(Prob, na.rm = TRUE) /
                 BIP2) -> S2
  # for each player, find BIP and HR in
  # first season
  df300_season1 %>%
    group_by(Batter) %>%
    summarize(BIP1 = n(),
            HR1 = sum(HR)) -> S1
  # merge the two datasets
  S12 <- inner_join(S1, S2,
                  by = "Batter")

  # graph the first season HR against the
  # increase in HR due to the change in
  # launch conditions in 2nd season
  the_graph <- ggplot(S12, aes(HR1, BIP1 * E1 - HR1)) +
    geom_point() +
    geom_hline(yintercept = 0, color = "red") +
    increasefont() +
    ggtitle(paste(season1, "/", season2)) +
    centertitle() +
    xlab(paste(season1, "HR")) +
      ylab(paste(season2, "Increase Due to LC"))

  list(p = the_graph, d = S12)
}

out1 <- all_work(2015, 2016, 200)
out2 <- all_work(2016, 2017, 200)
out3 <- all_work(2017, 2018, 200)
out4 <- all_work(2018, 2019, 200)

save(out1, out2, out3, out4,
     file = "twoyears2.Rdata")

#load("twoyears.Rdata")
#load("twoyears2.Rdata")

# check Chris Davis 448801

out1$d %>% filter(as.character(Batter) ==
                    "448801")

# proportion of positive increases for each
# pair of years

prop_pos <- function(out){
  out$d %>%
    summarize(N = n(),
              P = mean(BIP1 * E1 - HR1 > 0))
}

prop_pos(out1) # 210, 0.643
prop_pos(out2) # 210, 0.443
prop_pos(out3) # 216, 0.569
prop_pos(out4) # 193, 0.611

dis_df <- data.frame(
  Transition = c("2015/2016", "2016/2017",
                 "2017/2018", "2018/2019"),
  Percentage_Positive = c(64.3, 44.3,
                          56.9, 61.1))
#########

out1$p + increasefont(12) +
  ylab("2016 Increase") -> p1
out2$p + increasefont(12) +
  ylab("2017 Increase") -> p2
out3$p + increasefont(12) +
  ylab("2018 Increase") -> p3
out4$p + increasefont(12) +
  ylab("2019 Increase") -> p4

grid.arrange(p1, p2, p3, p4)

# put all data frames together

out1$d %>%
  mutate(Transition = "2015/2016") -> df1
out2$d %>%
  mutate(Transition = "2016/2017") -> df2
out3$d %>%
  mutate(Transition = "2017/2018") -> df3
out4$d %>%
  mutate(Transition = "2018/2019") -> df4
df_all <- rbind(df1, df2, df3, df4)

df_all %>%
  mutate(Expected = BIP1 * E1,
         Change = Expected - HR1) -> df_all

select(df_all, Batter, Transition, HR1,
       Expected, Change) %>%
       arrange(desc(Change)) %>%
  head(5) -> top_df

select(df_all, Batter, Transition, HR1,
       Expected, Change) %>%
       arrange(Change) %>%
  head(5) -> bottom_df

# players at top

top_df$Player <-
  c("Mookie Betts", "Xander Bogaerts",
    "Christian Yelich",
    "J.D. Martinez", "Yonder Alonso")

bottom_df$Player <-
  c("Mark Trumbo", "Khris Davis",
    "Evan Longoria", "Jonathan Lucroy",
    "Miguel Cabera")

select(top_df, Player, Transition,
       HR1, Expected, Change)

select(bottom_df, Player, Transition,
       HR1, Expected, Change)
	# data frame scip reads all Statcast play by play for the 2015 through 2019 seasons

	scip <- read_csv("~/Dropbox/2016 WORK/BLOG Baseball R/OTHER/StatcastData/5_seasons_inplay_b.csv")

	# function all_work() does all the computation

	# inputs are season1 - first season, season2 - second season, minBIP - min balls in play for both seasons

	# two outputs: p is the ggplot2 representation of the graph of the changes in HR production
	# d contains the data frame with the calculations for all players

	all_work <- function(season1, season2,
	minBIP = 300){
	require(tidyverse)
	require(mgcv)
	require(CalledStrike)
	# function to implement the GAM random
	# effects model
	two_var_fit_re <- function(df){
	gam(HR ~ s(launch_angle, launch_speed) +
	s(Batter, bs = "re"),
	family = binomial,
	data = df) -> fit
	}
	# find players with at least 300 BIP
	# for both seasons
	scip %>%
	filter(Season == season1) %>%
	group_by(batter) %>%
	summarize(BIP = n()) %>%
	filter(BIP >= minBIP) %>%
	select(batter) %>% pull() -> players300a

	scip %>%
	filter(Season == season2) %>%
	group_by(batter) %>%
	summarize(BIP = n()) %>%
	filter(BIP >= minBIP) %>%
	select(batter) %>% pull() -> players300b

	players300 <- intersect(players300a,
	players300b)
	N <- length(players300)

	# create data frame of data for those players
	# for both seasons
	df300_twoyears <- filter(scip,
	batter %in% players300,
	Season %in% c(season1, season2)) %>%
	mutate(Batter = factor(batter))

	df300_season1 <- filter(df300_twoyears,
	Season == season1)

	# run GAM fit to first season data
	fit1 <- two_var_fit_re(df300_season1)
	# extract the random effects
	beta <- coef(fit1)
	p <- length(beta)
	REFF <- data.frame(
	Batter = levels(df300_season1$Batter),
	reff = coef(fit1)[(p - N + 1):p])
	row.names(REFF) <- NULL

	# merge these season1 random effects
	# with the season 2 dataset
	df300_season2 <- filter(df300_twoyears,
	Season == season2) %>%
	select(Batter, HR, launch_angle,
	launch_speed) %>%
	inner_join(REFF, by = c("Batter")) ->
	df300_season2

	# use season1 fit to predict HR probability
	# for each BIP in season2
	df300_season2$Prob <- predict(fit1,
	newdata = df300_season2,
	type = "response")

	# for each player, compute expected HR
	# for single BIP in second season
	df300_season2 %>%
	group_by(Batter) %>%
	summarize(RE = first(reff),
	BIP2 = n(),
	HR2 = sum(HR),
	E1 = sum(Prob, na.rm = TRUE) /
	BIP2) -> S2
	# for each player, find BIP and HR in
	# first season
	df300_season1 %>%
	group_by(Batter) %>%
	summarize(BIP1 = n(),
	HR1 = sum(HR)) -> S1
	# merge the two datasets
	S12 <- inner_join(S1, S2,
	by = "Batter")

	# graph the first season HR against the
	# increase in HR due to the change in
	# launch conditions in 2nd season
	the_graph <- ggplot(S12, aes(HR1, BIP1 * E1 - HR1)) +
	geom_point() +
	geom_hline(yintercept = 0, color = "red") +
	increasefont() +
	ggtitle(paste(season1, "/", season2)) +
	centertitle() +
	xlab(paste(season1, "HR")) +
	ylab(paste(season2, "Increase Due to LC"))

	list(p = the_graph, d = S12)
	}

	out1 <- all_work(2015, 2016, 200)
	out2 <- all_work(2016, 2017, 200)
	out3 <- all_work(2017, 2018, 200)
	out4 <- all_work(2018, 2019, 200)

	save(out1, out2, out3, out4,
	file = "twoyears2.Rdata")

	#load("twoyears.Rdata")
	#load("twoyears2.Rdata")

	# check Chris Davis 448801

	out1$d %>% filter(as.character(Batter) ==
	"448801")

	# proportion of positive increases for each
	# pair of years

	prop_pos <- function(out){
	out$d %>%
	summarize(N = n(),
	P = mean(BIP1 * E1 - HR1 > 0))
	}

	prop_pos(out1) # 210, 0.643
	prop_pos(out2) # 210, 0.443
	prop_pos(out3) # 216, 0.569
	prop_pos(out4) # 193, 0.611

	dis_df <- data.frame(
	Transition = c("2015/2016", "2016/2017",
	"2017/2018", "2018/2019"),
	Percentage_Positive = c(64.3, 44.3,
	56.9, 61.1))
	#########

	out1$p + increasefont(12) +
	ylab("2016 Increase") -> p1
	out2$p + increasefont(12) +
	ylab("2017 Increase") -> p2
	out3$p + increasefont(12) +
	ylab("2018 Increase") -> p3
	out4$p + increasefont(12) +
	ylab("2019 Increase") -> p4

	grid.arrange(p1, p2, p3, p4)

	# put all data frames together

	out1$d %>%
	mutate(Transition = "2015/2016") -> df1
	out2$d %>%
	mutate(Transition = "2016/2017") -> df2
	out3$d %>%
	mutate(Transition = "2017/2018") -> df3
	out4$d %>%
	mutate(Transition = "2018/2019") -> df4
	df_all <- rbind(df1, df2, df3, df4)

	df_all %>%
	mutate(Expected = BIP1 * E1,
	Change = Expected - HR1) -> df_all

	select(df_all, Batter, Transition, HR1,
	Expected, Change) %>%
	arrange(desc(Change)) %>%
	head(5) -> top_df

	select(df_all, Batter, Transition, HR1,
	Expected, Change) %>%
	arrange(Change) %>%
	head(5) -> bottom_df

	# players at top

	top_df$Player <-
	c("Mookie Betts", "Xander Bogaerts",
	"Christian Yelich",
	"J.D. Martinez", "Yonder Alonso")

	bottom_df$Player <-
	c("Mark Trumbo", "Khris Davis",
	"Evan Longoria", "Jonathan Lucroy",
	"Miguel Cabera")

	select(top_df, Player, Transition,
	HR1, Expected, Change)

	select(bottom_df, Player, Transition,
	HR1, Expected, Change)