bayesball/gist:4302c5309bdd5cb8a307612804d3a41b

## gistfile1.txt
---
title: "Home Run Prediction"
author: "Jim Albert"
date: "10/14/2020"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE,
                      message = FALSE,
                      warning = FALSE)
```

#### Predicting home runs in the 2020 playoffs

Illustrates the use of a nonnested multilevel model with random effects for both teams and parks.

Fitting by Stan via brms package.  The focus is on prediction of the number of home runs hit during the 2020 Division Series.

Motivation:

- nice illustration of random effects
- illustrates the use of posterior predictive distribution for model checking and for prediction
- see if home run hitting in playoffs is consistent
with 2020 regular season hitting
- model can be used to adjust for team and field (especially appropriate for 2020 season)

Outline:

- extension of Predicting Home Run Count using a
Random Effects Model post of August 3, 2019 (summarize what was done there)
- denominator?  use balls in play (AB - SO)
- Poisson errors, offset, log link
- description of random effects model

#### Data Part

Load in packages:

```{r}
library(tidyverse)
library(ProbBayes)
library(brms)
library(stringr)
library(gridExtra)
```


Read in 2020 home run data2

```{r}
S2 <- read_csv("https://bayesball.github.io/baseball/2020homeruns.csv")
head(S2)
```

#### Model Fitting


Bayesian fit using Stan

```{r}
bfit2 <- brm(HR ~ offset(log(N)) +
               (1 | BAT_TEAM) +
               (1 | venue_name),
            data = S2,
            family = poisson,
            refresh = 0)
```
Priors?

```{r}
prior_summary(bfit2)
```

Summary of posterior fit

```{r}
bfit2
```

Collect posterior draws of all unobservables

```{r}
draws <- data.frame(bfit2)
```

#### Model Fits

Draws MCMC diagnostics for intercept and standard deviations

```{r}
plot(bfit2)
```

Plot estimated random effects

First park effects -- extract posterior means

```{r}
pm <- apply(draws, 2, mean)[34:63]

dpm <- data.frame(Park = names(pm),
                  Effect = pm)
row.names(dpm) <- NULL
```

Trim labels of random effects

```{r}
dpm$Park <- str_remove(dpm$Park, "r_venue_name.")
dpm$Park <- str_remove(dpm$Park, ".Intercept.")
```

Graph

```{r}
ggplot(dpm, aes(Park, Effect)) +
  geom_point() +
  coord_flip() +
  geom_hline(yintercept = 0, color = "red") +
  ggtitle("Park Effects") +
  centertitle()
```

Team effects

```{r}
pm2 <- apply(draws, 2, mean)[4:33]
dpm <- data.frame(Team = names(pm2),
                  Effect = pm2)
row.names(dpm) <- NULL
dpm$Team <- str_remove(dpm$Team, "r_BAT_TEAM.")
dpm$Team <- str_remove(dpm$Team, ".Intercept.")
```

```{r}
ggplot(dpm, aes(Team, Effect)) +
  geom_point() +
  coord_flip() +
  geom_hline(yintercept = 0, color = "red") +
  centertitle() +
  ggtitle("Team Effects")
```

#### Predictive checks

(this is not the greatest plot for discrete response but it shows the basic pattern)

```{r}
pp_check(bfit2, nsamples = 50)
```

Look at frequencies of 0, 1, 2, ... home runs in
observed data

```{r}
bar_plot(S2$HR)
table(S2$HR)
```


Compare with simulated predictive frequencies

Here is simulated posterior predictive ofreplicated data

```{r}
pp <- posterior_predict(bfit2)
```

My clunky attempt to get mean number of y over simulations

```{r}
k <- 0:7
sumk <- function(y, k) sum(y == k)
ppm <- rep(0, 8)
for(j in 0:7){
  ppm[j + 1] <- mean(apply(pp, 1, sumk, j))
}
```

Graph

```{r}
outa <- data.frame(Number = 0:7,
                  Count = c(table(S2$HR)),
                  Type = "Observed")
outb <- data.frame(Number = 0:7,
                   Count = ppm,
                   Type = "Predicted")

out <- rbind(outa, outb)
```

```{r}
ggplot(out) +
  geom_line(aes(Number, Count, color = Type)) +
  geom_point(aes(Number, Count, color = Type)) +
  increasefont() +
  centertitle() +
  ggtitle("Observed and Predicted Home Runs\n Per Team Per Game")
```

#### Prediction


Predict the number of home runs in the playoffs

This function
- inputs team names and stadium
and number of BIP and observed HR for each team
- output a graph of a sample from pp distribution of total HR showing
a 90% prediction interval and the observed home run count

```{r}
predict_hr <- function(draws,
                       team1, team2,
                       field, BIP1, BIP2,
                       obs_hr1, obs_hr2){

  label1 <- paste("r_BAT_TEAM.", team1, ".Intercept.",
                sep="")
  label2 <- paste("r_BAT_TEAM.", team2, ".Intercept.",
                sep="")
  label3 <- paste("r_venue_name.", field,
                ".Intercept.", sep = "")

  # this does the simulated predictions
  dr <- draws[, c(label1, label2, label3)]
  lam1 <- exp(log(BIP1) + draws$b_Intercept +
              dr[, 1] + dr[, 3])
  lam2 <- exp(log(BIP2) + draws$b_Intercept +
             dr[, 2] + dr[, 3])
  N <- dim(draws)[1]
  hr1 <- rpois(N, lam1)
  hr2 <- rpois(N, lam2)
  total_hr <- hr1 + hr2
  total_observed <- obs_hr1 + obs_hr2

  df <- data.frame(HR = total_hr,
                   Type = "Predicted")
  df[df$HR == total_observed, "Type"] = "Observed"

  q <- quantile(total_hr, c(.05, .95))

  ggplot(df, aes(HR, fill = Type)) +
    geom_bar(width = 0.5) +
    ggtitle(paste(team1, " vs ", team2,
          ": 90% Interval: (",
                q[1], ", ", q[2], ")", sep="")) +
              centertitle() +
    xlab("Total Home Runs") + increasefont() +
    scale_fill_manual(values =
                c("black", "red"))
}

```

Illustrate with 4 best-of-five series

```{r}
p1 <- predict_hr(draws,
           "NYY", "TB", "Petco.Park", 120, 114,
           10, 11)

p2 <- predict_hr(draws,
           "HOU", "OAK", "Dodger.Stadium",
           115, 102, 12, 12)

p3 <- predict_hr(draws,
           "ATL", "MIA", "Minute.Maid.Park",
           73, 69, 5, 1)

p4 <- predict_hr(draws,
           "LAD", "SD", "Globe.Life.Field",
           79, 68, 1, 2)

grid.arrange(p1, p2, p3, p4, ncol = 2)
```
 Some more series:

 Houston/Tampa Bay Championship series.

```{r}
p11 <- predict_hr(draws,
           "HOU", "TB", "Petco.Park",
           173, 138, 9, 11)
```

Atlanta/Los Angeles Championship series.

```{r}
p12 <- predict_hr(draws,
           "ATL", "LAD", "Globe.Life.Field",
           173, 173, 9, 16)
```


```{r}
grid.arrange(p11, p12, ncol = 1)
```
	---
	title: "Home Run Prediction"
	author: "Jim Albert"
	date: "10/14/2020"
	output: html_document
	---

	```{r setup, include=FALSE}
	knitr::opts_chunk$set(echo = TRUE,
	message = FALSE,
	warning = FALSE)
	```

	#### Predicting home runs in the 2020 playoffs

	Illustrates the use of a nonnested multilevel model with random effects for both teams and parks.

	Fitting by Stan via brms package. The focus is on prediction of the number of home runs hit during the 2020 Division Series.

	Motivation:

	- nice illustration of random effects
	- illustrates the use of posterior predictive distribution for model checking and for prediction
	- see if home run hitting in playoffs is consistent
	with 2020 regular season hitting
	- model can be used to adjust for team and field (especially appropriate for 2020 season)

	Outline:

	- extension of Predicting Home Run Count using a
	Random Effects Model post of August 3, 2019 (summarize what was done there)
	- denominator? use balls in play (AB - SO)
	- Poisson errors, offset, log link
	- description of random effects model

	#### Data Part

	Load in packages:

	```{r}
	library(tidyverse)
	library(ProbBayes)
	library(brms)
	library(stringr)
	library(gridExtra)
	```


	Read in 2020 home run data2

	```{r}
	S2 <- read_csv("https://bayesball.github.io/baseball/2020homeruns.csv")
	head(S2)
	```

	#### Model Fitting


	Bayesian fit using Stan

	```{r}
	bfit2 <- brm(HR ~ offset(log(N)) +
	(1 \| BAT_TEAM) +
	(1 \| venue_name),
	data = S2,
	family = poisson,
	refresh = 0)
	```
	Priors?

	```{r}
	prior_summary(bfit2)
	```

	Summary of posterior fit

	```{r}
	bfit2
	```

	Collect posterior draws of all unobservables

	```{r}
	draws <- data.frame(bfit2)
	```

	#### Model Fits

	Draws MCMC diagnostics for intercept and standard deviations

	```{r}
	plot(bfit2)
	```

	Plot estimated random effects

	First park effects -- extract posterior means

	```{r}
	pm <- apply(draws, 2, mean)[34:63]

	dpm <- data.frame(Park = names(pm),
	Effect = pm)
	row.names(dpm) <- NULL
	```

	Trim labels of random effects

	```{r}
	dpm$Park <- str_remove(dpm$Park, "r_venue_name.")
	dpm$Park <- str_remove(dpm$Park, ".Intercept.")
	```

	Graph

	```{r}
	ggplot(dpm, aes(Park, Effect)) +
	geom_point() +
	coord_flip() +
	geom_hline(yintercept = 0, color = "red") +
	ggtitle("Park Effects") +
	centertitle()
	```

	Team effects

	```{r}
	pm2 <- apply(draws, 2, mean)[4:33]
	dpm <- data.frame(Team = names(pm2),
	Effect = pm2)
	row.names(dpm) <- NULL
	dpm$Team <- str_remove(dpm$Team, "r_BAT_TEAM.")
	dpm$Team <- str_remove(dpm$Team, ".Intercept.")
	```

	```{r}
	ggplot(dpm, aes(Team, Effect)) +
	geom_point() +
	coord_flip() +
	geom_hline(yintercept = 0, color = "red") +
	centertitle() +
	ggtitle("Team Effects")
	```

	#### Predictive checks

	(this is not the greatest plot for discrete response but it shows the basic pattern)

	```{r}
	pp_check(bfit2, nsamples = 50)
	```

	Look at frequencies of 0, 1, 2, ... home runs in
	observed data

	```{r}
	bar_plot(S2$HR)
	table(S2$HR)
	```


	Compare with simulated predictive frequencies

	Here is simulated posterior predictive ofreplicated data

	```{r}
	pp <- posterior_predict(bfit2)
	```

	My clunky attempt to get mean number of y over simulations

	```{r}
	k <- 0:7
	sumk <- function(y, k) sum(y == k)
	ppm <- rep(0, 8)
	for(j in 0:7){
	ppm[j + 1] <- mean(apply(pp, 1, sumk, j))
	}
	```

	Graph

	```{r}
	outa <- data.frame(Number = 0:7,
	Count = c(table(S2$HR)),
	Type = "Observed")
	outb <- data.frame(Number = 0:7,
	Count = ppm,
	Type = "Predicted")

	out <- rbind(outa, outb)
	```

	```{r}
	ggplot(out) +
	geom_line(aes(Number, Count, color = Type)) +
	geom_point(aes(Number, Count, color = Type)) +
	increasefont() +
	centertitle() +
	ggtitle("Observed and Predicted Home Runs\n Per Team Per Game")
	```

	#### Prediction


	Predict the number of home runs in the playoffs

	This function
	- inputs team names and stadium
	and number of BIP and observed HR for each team
	- output a graph of a sample from pp distribution of total HR showing
	a 90% prediction interval and the observed home run count

	```{r}
	predict_hr <- function(draws,
	team1, team2,
	field, BIP1, BIP2,
	obs_hr1, obs_hr2){

	label1 <- paste("r_BAT_TEAM.", team1, ".Intercept.",
	sep="")
	label2 <- paste("r_BAT_TEAM.", team2, ".Intercept.",
	sep="")
	label3 <- paste("r_venue_name.", field,
	".Intercept.", sep = "")

	# this does the simulated predictions
	dr <- draws[, c(label1, label2, label3)]
	lam1 <- exp(log(BIP1) + draws$b_Intercept +
	dr[, 1] + dr[, 3])
	lam2 <- exp(log(BIP2) + draws$b_Intercept +
	dr[, 2] + dr[, 3])
	N <- dim(draws)[1]
	hr1 <- rpois(N, lam1)
	hr2 <- rpois(N, lam2)
	total_hr <- hr1 + hr2
	total_observed <- obs_hr1 + obs_hr2

	df <- data.frame(HR = total_hr,
	Type = "Predicted")
	df[df$HR == total_observed, "Type"] = "Observed"

	q <- quantile(total_hr, c(.05, .95))

	ggplot(df, aes(HR, fill = Type)) +
	geom_bar(width = 0.5) +
	ggtitle(paste(team1, " vs ", team2,
	": 90% Interval: (",
	q[1], ", ", q[2], ")", sep="")) +
	centertitle() +
	xlab("Total Home Runs") + increasefont() +
	scale_fill_manual(values =
	c("black", "red"))
	}

	```

	Illustrate with 4 best-of-five series

	```{r}
	p1 <- predict_hr(draws,
	"NYY", "TB", "Petco.Park", 120, 114,
	10, 11)

	p2 <- predict_hr(draws,
	"HOU", "OAK", "Dodger.Stadium",
	115, 102, 12, 12)

	p3 <- predict_hr(draws,
	"ATL", "MIA", "Minute.Maid.Park",
	73, 69, 5, 1)

	p4 <- predict_hr(draws,
	"LAD", "SD", "Globe.Life.Field",
	79, 68, 1, 2)

	grid.arrange(p1, p2, p3, p4, ncol = 2)
	```
	Some more series:

	Houston/Tampa Bay Championship series.

	```{r}
	p11 <- predict_hr(draws,
	"HOU", "TB", "Petco.Park",
	173, 138, 9, 11)
	```

	Atlanta/Los Angeles Championship series.

	```{r}
	p12 <- predict_hr(draws,
	"ATL", "LAD", "Globe.Life.Field",
	173, 173, 9, 16)
	```


	```{r}
	grid.arrange(p11, p12, ncol = 1)
	```