Skip to content

Instantly share code, notes, and snippets.

@seanjtaylor
Created June 15, 2016 16:04
Show Gist options
  • Save seanjtaylor/b7441ee2071ae37c4306b9af6ef8e365 to your computer and use it in GitHub Desktop.
Save seanjtaylor/b7441ee2071ae37c4306b9af6ef8e365 to your computer and use it in GitHub Desktop.
Quick analysis of NFL Officials
library(rvest)
library(stringr)
library(readr)
library(ggplot2)
library(dplyr)
library(tidyr)
library(broom)
library(lubridate)
base.url <- 'http://www.pro-football-reference.com/'
officials.url <- 'http://www.pro-football-reference.com/officials/'
official_urls <- read_html(officials.url) %>%
html_nodes('table a') %>%
html_attr('href')
official_names <- read_html(officials.url) %>%
html_nodes('table a') %>%
html_text()
officials <- data_frame(name = official_names, url = official_urls)
all.data <- officials %>%
group_by(name, url) %>% do({
url <- paste(base.url, .$url, sep = '')
doc <- read_html(url)
off.data <- doc %>%
html_nodes('table#game_logs') %>%
html_table %>%
first %>%
filter(VPen != 'VPen') %>%
group_by(Year, Game, Position) %>% do({
vals <- str_split(.$Game, ' @ ')[[1]]
data_frame(home = str_replace(vals[2], '\\*', ''),
away = str_replace(vals[1], '\\*', ''),
hpts = as.numeric(.$HPts),
vpts = as.numeric(.$VPts),
hpen = as.numeric(.$HPen),
vpen = as.numeric(.$VPen),
hpenyards = as.numeric(.$HPYds),
vpenyards = as.numeric(.$VPYds))
}) %>%
ungroup
}) %>%
ungroup
all.data %>% write_csv('officials_data.csv')
################################################################################
## After scraping has been done once.
all.data <- read_csv('officials_data.csv')
home <- all.data %>%
mutate(win = as.numeric(hpts > vpts)) %>%
select(date = Year, name, team = home, pens = hpen, yds = hpenyards, win) %>%
mutate(home = 1)
away <- all.data %>%
mutate(win = as.numeric(vpts > hpts)) %>%
select(date = Year, name, team = away, pens = vpen, yds = vpenyards, win) %>%
mutate(home = 0)
long.data <- home %>% bind_rows(away)
## Some sanity checks. Who are the top officials?
all.data %>%
group_by(name) %>%
summarise(n = n()) %>%
arrange(-n)
## How many games per team?
long.data %>%
group_by(team) %>%
summarise(n = length(unique(date))) %>%
arrange(-n)
## Convert to wide format so we have one column per official
## So we can make indicator variables.
wide <- long.data %>%
mutate(seas = year(date - 180), ## cheap hack to get "season"
present = 1) %>%
spread(name, present, 0)
## officials.mat has one row per game, one column per official
officials.mat <- as.matrix(wide[,8:ncol(wide)])
## team.mat has one row per game, one column per (team, season)
team.mat <- model.matrix( ~ 0 + team:factor(seas), wide)
## Regression with penalties ~ team + all officials involved
m <- lm(wide$pens ~ wide$home + team.mat + officials.mat)
## Count games per official so we can use this as a filter in the plot below.
official.rollup <- long.data %>%
group_by(name) %>%
summarise(n.games = n()/ 2)
tidy(m) %>%
mutate(official = str_match(term, 'officials\\.mat(.*)')[,2]) %>%
filter(!is.na(official)) %>%
inner_join(official.rollup, by = c('official' = 'name')) %>%
filter(n.games >= 250) %>%
ggplot(aes(x = reorder(official, estimate), y = estimate,
ymin = estimate - 1.96 * std.error,
ymax = estimate + 1.96 * std.error)) +
geom_pointrange() +
coord_flip() +
ylab('Team-Adjusted Extra Penalties per Game') +
xlab('Official Name') +
geom_hline(yintercept = 0.0, linetype = 'dashed') +
theme_bw()
## Teams plots!
tidy(m) %>%
mutate(team = str_match(term, 'team\\.matteam(.*):factor\\(seas\\)(.*)')[,2],
seas = str_match(term, 'team\\.matteam(.*):factor\\(seas\\)(.*)')[,3]) %>%
filter(!is.na(team), seas == 2015) %>%
ggplot(aes(x = reorder(team, estimate), y = estimate,
ymin = estimate - 1.96 * std.error,
ymax = estimate + 1.96 * std.error)) +
geom_pointrange() +
coord_flip() +
ylab('Official-Adjusted Net Penalties per Game') +
xlab('Team') +
geom_hline(yintercept = 0.0, linetype = 'dashed') +
theme_bw()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment