Created
February 9, 2017 18:32
-
-
Save bayesball/8710f49b94abffd642251808f8897a4e to your computer and use it in GitHub Desktop.
Implementation of a simulation permutation test to detect situational hitting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Is David Ortiz Clutch?" | |
output: | |
html_document: default | |
html_notebook: default | |
--- | |
Read in Retrosheet data for the 2000 through the 2013 seasons. | |
Also read in the compute.runs function. | |
```{r} | |
load("~/Library/Mobile Documents/com~apple~CloudDocs/Retrosheet/pbp.00.16.Rdata") | |
source("compute.runs.R") | |
``` | |
The setup_work function ... | |
- computes the runs expectancy matrrix for the particular season | |
- for each possible state (runners and outs), finds the runs value of each possible event (single, double, triple, home run, out, walk/hbp) using particular assumptions about run advancement | |
Read in transitions matrix: | |
```{r} | |
require(readr) | |
tr <- read_csv("transitions.csv") | |
``` | |
```{r} | |
setup_work <- function(rdata, tr){ | |
require(dplyr) | |
rdata_b <- filter(rdata, BAT_EVENT_FL==TRUE) | |
runs <- summarize(group_by(rdata_b, | |
STATE), R=mean(RUNS.ROI)) | |
runs <- rbind(runs, data.frame(STATE="3", R=0)) | |
runs_before <- inner_join(select(tr, STATE), | |
runs, by="STATE") | |
runs_after <- inner_join(select(tr, NEW.STATE), | |
runs, by=c("NEW.STATE"="STATE")) | |
tr_new <- cbind(runs_before, runs_after, tr[, 3:4]) | |
names(tr_new) <- c("STATE", "R_before", "NEW_STATE", | |
"R_after", "RUNS.SCORED", "Type") | |
tr_new <- mutate(tr_new, | |
RUNS=R_after - R_before + RUNS.SCORED) | |
tr_new | |
} | |
``` | |
The clutch work function ... | |
- inputs the name of the player, the retrosheet data, and the tr_new data frame found in the previous function | |
- filters the player's PA's to only include the outcomes above | |
- computes the sum of runs values | |
- assuming that the initial state (runners and bases) is independent of event, randomly permutes the sequence of play events, computing the sum of runs values | |
- outputs the observed runs value and mean of the simulated runs values | |
- if the observed runs value is larger than the mean of the simulated runs, there is some evidence of clutch ability | |
```{r} | |
clutch_work <- function(player, rdata, tr_new){ | |
require(Lahman) | |
require(dplyr) | |
names <- unlist(strsplit(player, " ")) | |
id <- filter(Master, nameLast==names[2], | |
nameFirst == names[1])$retroID | |
results <- select(filter(rdata, BAT_ID == id, BAT_EVENT_FL), | |
STATE, EVENT_CD) | |
results <- filter(results, EVENT_CD %in% | |
c(2, 3, 14, 15, 16, 20, 21, 22, 23)) | |
results <- mutate(results, | |
Type=ifelse(EVENT_CD < 4, "Out", | |
ifelse(EVENT_CD %in% c(14, 15, 16), "Walk", | |
ifelse(EVENT_CD == 20, "Single", | |
ifelse(EVENT_CD == 21, "Double", | |
ifelse(EVENT_CD == 22, "Triple", "Home Run")))))) | |
results <- inner_join(select(results, STATE, Type), | |
select(tr_new, STATE, Type, RUNS), | |
by=c("STATE", "Type")) | |
one_sim <- function(results){ | |
results1 <- results | |
results1$Type <- sample(results1$Type) | |
results1 <- inner_join(select(results1, STATE, Type), | |
select(tr_new, STATE, Type, RUNS), | |
by=c("STATE", "Type")) | |
sum(results1$RUNS) | |
} | |
c(Observed = sum(results$RUNS), | |
Mean_Simulated = mean(replicate(1000, | |
one_sim(results)))) | |
} | |
``` | |
Runs this function for David Ortiz for the 2000 through 2013 seasons. Collects the observed and mean simulated runs for all seasons. | |
```{r} | |
library(dplyr) | |
clutch <- NULL | |
for(season in 2000:2016){ | |
d <- filter(pbp.00.16, Season==season) | |
d <- compute.runs(d) | |
tr_new <- setup_work(d, tr) | |
out <- clutch_work("David Ortiz", d, tr_new) | |
clutch <- rbind(clutch, | |
data.frame(Season=season, Observed=out[1], Simulated=out[2])) | |
} | |
clutch | |
``` | |
Constructs a few graphs: | |
```{r} | |
library(ggplot2) | |
ggplot(clutch, aes(Season, Observed)) + geom_point() + | |
geom_smooth() + | |
geom_point(aes(Season, Simulated), color="red") | |
ggplot(clutch, aes(Season, Observed - Simulated)) + | |
geom_point() + | |
geom_hline(yintercept=0, color="red") | |
``` | |
Try Derek Jeter: | |
```{r} | |
library(dplyr) | |
clutch <- NULL | |
for(season in 2000:2014){ | |
d <- filter(pbp.00.16, Season==season) | |
d <- compute.runs(d) | |
tr_new <- setup_work(d, tr) | |
out <- clutch_work("Derek Jeter", d, tr_new) | |
clutch <- rbind(clutch, | |
data.frame(Season=season, Observed=out[1], Simulated=out[2])) | |
} | |
clutch | |
``` | |
```{r} | |
library(ggplot2) | |
ggplot(clutch, aes(Season, Observed)) + geom_point() + | |
geom_smooth() + | |
geom_point(aes(Season, Simulated), color="red") | |
ggplot(clutch, aes(Season, Observed - Simulated)) + | |
geom_point() + | |
geom_hline(yintercept=0, color="red") | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment