Last active
July 31, 2020 04:01
-
-
Save jthomasmock/b8a1c6e90a199cf72c6c888bd899e84e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(arrow) | |
library(tidyverse) | |
library(tictoc) | |
# create a parent directory (skipped when it already exists, so the script can
# be re-run without dir.create() warning about an existing directory)
if (!dir.exists("nflfastr")) {
  dir.create("nflfastr")
}
# create function for partition directories and download parquet files | |
# Download one season of nflfastR play-by-play data into its own partition
# directory: <root>/<year>/data.parquet.
#
# Args:
#   year: season to fetch; interpolated into the remote file name and used as
#         the name of the partition directory.
#   root: parent directory that holds the per-year partition directories
#         (defaults to "nflfastr", the directory this script creates).
#
# Returns (invisibly) the status code from download.file(), which is 0 on
# success. On a non-zero status the partial file is removed and an error is
# raised.
get_data <- function(year, root = "nflfastr") {
  year_dir <- file.path(root, year)

  # recursive = TRUE so the call also works when `root` does not exist yet;
  # showWarnings = FALSE so re-running over an existing directory stays quiet
  dir.create(year_dir, recursive = TRUE, showWarnings = FALSE)

  url <- glue::glue(
    "https://github.com/guga31bb/nflfastR-data/blob/master/data/play_by_play_{year}.parquet?raw=true"
  )
  dest <- file.path(year_dir, "data.parquet")

  # mode = "wb": parquet is a binary format, so write it in binary mode
  status <- download.file(url, dest, mode = "wb")

  if (status != 0) {
    # do not leave a truncated parquet file behind for open_dataset() to find
    unlink(dest)
    stop(
      "Download failed for season ", year, " (status ", status, ")",
      call. = FALSE
    )
  }

  invisible(status)
}
# download every season into its own <year>/ sub-directory
seasons <- 1999:2019
walk(seasons, get_data)

# open the directory tree as an Arrow dataset; the sub-directory names are
# exposed as a `year` column
ds <- open_dataset("nflfastr", partitioning = "year")
# Subset of years: 1999 and 2019 only, run/pass plays without a penalty.
# Column selection and row filters come before collect(); the grouping and
# summary are done on the collected data.
tic()
ds %>%
  select(year, play_type, yards_gained, epa, penalty, season) %>%
  filter(year %in% c(1999, 2019)) %>%
  filter(play_type %in% c("run", "pass")) %>%
  filter(penalty == 0) %>%
  collect() %>%
  group_by(season, play_type) %>%
  summarise(
    avg_yds = mean(yards_gained, na.rm = TRUE),
    avg_epa = mean(epa, na.rm = TRUE),
    n = n()
  )
toc()
# 0.094 sec elapsed | |
# # A tibble: 4 x 5 | |
# # Groups: season [2] | |
# season play_type avg_yds avg_epa n | |
# <int> <chr> <dbl> <dbl> <int> | |
# 1 1999 pass 5.84 -0.0214 18476 | |
# 2 1999 run 3.99 -0.0986 13658 | |
# 3 2019 pass 6.27 0.0232 19616 | |
# 4 2019 run 4.47 -0.0494 13344 | |
# All years: same summary as above, without the `year` restriction.
# Column selection and row filters come before collect(); the grouping and
# summary are done on the collected data.
tic()
ds %>%
  select(year, play_type, yards_gained, epa, penalty, season) %>%
  filter(play_type %in% c("run", "pass")) %>%
  filter(penalty == 0) %>%
  collect() %>%
  group_by(season, play_type) %>%
  summarise(
    avg_yds = mean(yards_gained, na.rm = TRUE),
    avg_epa = mean(epa, na.rm = TRUE),
    n = n()
  )
toc()
# 0.492 sec elapsed | |
# # A tibble: 42 x 5
# # Groups: season [21]
# season play_type avg_yds avg_epa n | |
# <int> <chr> <dbl> <dbl> <int> | |
# 1 1999 pass 5.84 -0.0214 18476 | |
# 2 1999 run 3.99 -0.0986 13658 | |
# 3 2000 pass 5.80 -0.0159 17974 | |
# 4 2000 run 4.03 -0.0749 13988 | |
# 5 2001 pass 5.84 -0.0153 17988 | |
# 6 2001 run 4.03 -0.0834 14026 | |
# 7 2002 pass 5.85 -0.00606 19125 | |
# 8 2002 run 4.25 -0.0470 14243 | |
# 9 2003 pass 5.82 -0.0322 18089 | |
#10 2003 run 4.24 -0.0513 14620 | |
## … with 32 more rows |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment