Created
March 10, 2014 05:02
-
-
Save cpsievert/9459745 to your computer and use it in GitHub Desktop.
Script used to create the figure for the first "Analyzing Baseball Data with R" post
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(pitchRx)
library(dplyr)
library(mgcv)

# Establish a SQLite database connection.
# NOTE(review): src_sqlite() presumably expects the file to exist already
# unless create = TRUE is passed -- confirm before a first-time scrape.
my_db <- src_sqlite("pitchRx.sqlite3")

# DISCLAIMER: this 'pitchRx.sqlite3' database was obtained using pitchRx
# version 1.2. The code below probably won't work if you are using data
# collected from earlier versions or by other methods.
# If you want to recreate this analysis, make sure you have pitchRx 1.2 or
# higher, then run:
#scrape(start="2008-01-01", end="2014-01-01", connect=my_db$con)

# Once you have your database, it's a good idea to create some indices for
# faster queries.
#dbSendQuery(my_db$con, "CREATE INDEX atbat_link ON atbat(gameday_link)")
#dbSendQuery(my_db$con, "CREATE INDEX pitch_link ON pitch(gameday_link)")
#dbSendQuery(my_db$con, "CREATE INDEX des_index ON pitch(des)")
# Start building an SQL query using dplyr. The original script chained these
# steps with `%.%`, which current dplyr releases no longer provide; `%>%` is
# its replacement.
# Note we are 'filtering' to umpire decisions (called strikes and balls only).
pitches <- tbl(my_db, "pitch") %>%
  select(px, pz, des, count, num, gameday_link) %>%
  filter(des == "Called Strike" | des == "Ball")

# At-bat level covariates; `num` and `gameday_link` are the join keys.
atbats <- tbl(my_db, "atbat") %>%
  select(stand, b_height, num, gameday_link)

# Join these tables together into one table. The result is deliberately not
# called `table`, so that base::table() is not shadowed.
pitch_atbat <- left_join(x = pitches, y = atbats,
                         by = c("gameday_link", "num"))

# Bring this data into memory (it will take a minute or two).
dat <- collect(pitch_atbat)

# Some games (preseason games in particular) don't have a PITCHf/x system in
# place -- get rid of these pitches, which have no recorded location.
dat <- dat[!is.na(dat$px) & !is.na(dat$pz), ]
# Response variable: 1 when the pitch description is "Called Strike",
# 0 otherwise.
dat[["strike"]] <- as.numeric(is.element(dat[["des"]], "Called Strike"))

# Model covariates that must be treated as categorical.
dat[["stand"]] <- factor(dat[["stand"]])
# Characters 5-8 of gameday_link are used as the year.
dat[["year"]] <- factor(substr(dat[["gameday_link"]], 5, 8))

# Two-strike indicator: drop the leading "<digit>-" from the count to keep the
# strike part, then relabel a 0 or 1 as "other".
strike_count <- sub("^[0-9]-", "", dat[["count"]])
dat[["strikes"]] <- factor(sub("[0-1]", "other", strike_count))

# Three-ball indicator: drop the trailing "-<digit>" from the count to keep
# the ball part, then relabel a 0, 1 or 2 as "other".
ball_count <- sub("-[0-9]$", "", dat[["count"]])
dat[["three_balls"]] <- factor(sub("[0-2]", "other", ball_count))
# Use multiple cores to fit gams. Code derived from Brian Mills' work -
# http://princeofslides.blogspot.com/2013/07/advanced-sab-r-metrics-parallelization.html
library(parallel)

# Leave one core free for the rest of the system. detectCores() may return NA,
# and on a single-core machine `detectCores() - 1` would request zero workers,
# so fall back to a single worker in both cases.
n_cores <- detectCores()
n_workers <- if (is.na(n_cores)) 1L else max(1L, n_cores - 1L)
cl <- makeCluster(n_workers)

# Model for 'mercy' in a two strike count.
# WARNING: this took several hours on a cluster with 24 nodes!
# The cluster has to be handed to bam() through its `cluster` argument;
# creating it alone does not parallelise the fit. The workers are shut down
# in `finally` so they are released even if the fit fails.
m1 <- tryCatch(
  bam(
    strike ~ interaction(stand, year, strikes) +
      s(px, pz, by = interaction(stand, year, strikes)),
    data = dat,
    family = binomial(link = "logit"),
    cluster = cl
  ),
  finally = stopCluster(cl)
)

# Make sure you save the model when it is finished running.
save(m1, file = "annual-strike.rda")
# Man, this takes a while too...
# Draw the called-strike comparison (two-strike counts vs. all other counts),
# faceted by year and batter stance, into a PNG file.
png(filename = "strike-plot.png", width = 400, height = 1200, type = "cairo")
strike_plot <- strikeFX(dat, model = m1, density1 = list(strikes = "2"),
                        density2 = list(strikes = "other"),
                        layer = facet_grid(year ~ stand)) +
  coord_equal()
# ggplot objects are only rendered when printed. Top-level auto-printing is
# switched off when the script is run through source(), so print explicitly
# to make sure the device actually receives the plot.
print(strike_plot)
dev.off()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment