Skip to content

Instantly share code, notes, and snippets.

@cpsievert
Created March 10, 2014 05:02
Show Gist options
  • Save cpsievert/9459745 to your computer and use it in GitHub Desktop.
Save cpsievert/9459745 to your computer and use it in GitHub Desktop.
Script used to create figure for first analyzing baseball data with R post
library(pitchRx)
library(dplyr)
library(mgcv)
# Open a dplyr connection to the SQLite database that holds the PITCHf/x data
my_db <- src_sqlite("pitchRx.sqlite3")
# DISCLAIMER: this 'pitchRx.sqlite3' database was obtained using pitchRx version 1.2
# The code below probably won't work if you are using data collected from earlier versions or other methods
# Anyway, if you want to recreate this analysis, make sure you have pitchRx 1.2 or higher, then run:
#scrape(start="2008-01-01", end="2014-01-01", connect=my_db$con)
# Once you have your database, it's a good idea to create some indices for faster queries.
#dbSendQuery(my_db$con, "CREATE INDEX atbat_link ON atbat(gameday_link)")
#dbSendQuery(my_db$con, "CREATE INDEX pitch_link ON pitch(gameday_link)")
#dbSendQuery(my_db$con, "CREATE INDEX des_index ON pitch(des)")
# Start building an SQL query using dplyr; nothing is fetched until collect().
# The pipe is `%>%` because the original `%.%` operator has been removed from
# dplyr and would fail with "could not find function" on current versions.
# Note we are 'filtering' to umpire decisions (called strikes and balls only)
pitches <- tbl(my_db, "pitch") %>%
  select(px, pz, des, count, num, gameday_link) %>%
  filter(des == "Called Strike" | des == "Ball")
# At-bat level covariates, keyed by game and at-bat number
atbats <- tbl(my_db, "atbat") %>%
  select(stand, b_height, num, gameday_link)
# Join these tables together into one table.
# The result is not called `table`, so base::table() stays unmasked.
pitch_atbat <- left_join(x = pitches, y = atbats, by = c("gameday_link", "num"))
# Bring this data into memory (it will take a minute or two)
dat <- collect(pitch_atbat)
# Pitches without a recorded location (e.g. preseason games played without a
# PITCHf/x system) cannot be modelled -- keep only rows where both px and pz exist
has_location <- !(is.na(dat$px) | is.na(dat$pz))
dat <- dat[has_location, ]
# Response variable: 1 when the umpire called a strike, 0 otherwise
dat[["strike"]] <- as.numeric(is.element(dat[["des"]], "Called Strike"))
# Categorical model covariates: batter stance, and the year taken from
# characters 5-8 of gameday_link
dat[["stand"]] <- factor(dat[["stand"]])
dat[["year"]] <- factor(substr(dat[["gameday_link"]], 5, 8))
# Two-strike indicator: strip the leading "<digit>-" from count, then relabel
# a remaining 0 or 1 as "other"
strikes <- sub("^[0-9]-", "", dat[["count"]])
dat[["strikes"]] <- factor(sub("[0-1]", "other", strikes))
# Three-ball indicator: strip the trailing "-<digit>" from count, then relabel
# a remaining 0, 1 or 2 as "other"
balls <- sub("-[0-9]$", "", dat[["count"]])
dat[["three_balls"]] <- factor(sub("[0-2]", "other", balls))
# Use multiple cores to fit gams. Code derived from Brian Mills' work - http://princeofslides.blogspot.com/2013/07/advanced-sab-r-metrics-parallelization.html
library(parallel)
# Leave one core free, but always request at least one worker: detectCores()
# may return 1 (giving 0 workers) or NA, either of which makes makeCluster() fail.
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
cl <- makeCluster(n_workers)
# Model for 'mercy' in a two strike count. WARNING: this took several hours on a cluster with 24 nodes!
# A separate intercept and a separate smooth surface over (px, pz) is fit for
# every stand x year x strikes combination.
# The cluster must be handed to bam() via `cluster = cl`; otherwise the workers
# sit idle and the fit runs on a single core.
# `finally` shuts the workers down whether or not the fit succeeds.
m1 <- tryCatch(
  bam(strike ~ interaction(stand, year, strikes) +
        s(px, pz, by = interaction(stand, year, strikes)),
      data = dat, family = binomial(link = "logit"), cluster = cl),
  finally = stopCluster(cl)
)
# Make sure you save the model when it is finished running
save(m1, file = "annual-strike.rda")
# Render the fitted strike-probability comparison (two strikes vs. other
# counts), faceted by year and batter stance, to a PNG. This takes a while too.
# NOTE(review): facet_grid() and coord_equal() are ggplot2 functions; this
# assumes ggplot2 is attached by library(pitchRx) -- if not, add library(ggplot2).
png(file = "strike-plot.png", width = 400, height = 1200, type = "cairo")
tryCatch(
  # print() is required: a ggplot object is only drawn by top-level
  # auto-printing, so without it the PNG is empty when run via source().
  print(
    strikeFX(dat, model = m1, density1 = list(strikes = "2"),
             density2 = list(strikes = "other"),
             layer = facet_grid(year ~ stand)) +
      coord_equal()
  ),
  # Always close the device so a failed plot does not leave it open
  finally = invisible(dev.off())
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment