Skip to content

Instantly share code, notes, and snippets.

@jthomasmock
Last active July 31, 2020 04:01
Show Gist options
  • Save jthomasmock/b8a1c6e90a199cf72c6c888bd899e84e to your computer and use it in GitHub Desktop.
Save jthomasmock/b8a1c6e90a199cf72c6c888bd899e84e to your computer and use it in GitHub Desktop.
library(arrow)
library(tidyverse)
library(tictoc)
# create the parent directory that will hold one sub-folder per season;
# the dir.exists() guard keeps re-runs from warning that it already exists
if (!dir.exists("nflfastr")) {
  dir.create("nflfastr")
}
# Download one season of nflfastR play-by-play data into its own partition
# directory: "nflfastr/<year>/data.parquet".
#
# year: the season to fetch. It is interpolated into both the download URL
#   and the partition directory name.
#
# Returns (invisibly) the status code reported by download.file().
# Stops with an error if download.file() reports a non-zero status.
get_data <- function(year) {
  season_dir <- file.path("nflfastr", year)
  # recursive = TRUE also creates the "nflfastr" parent when it is missing;
  # showWarnings = FALSE keeps re-runs quiet when the folder already exists.
  dir.create(season_dir, recursive = TRUE, showWarnings = FALSE)

  destfile <- file.path(season_dir, "data.parquet")

  # If the download errors out or reports failure, remove whatever was written
  # so a truncated parquet file is not left behind in the dataset directory.
  finished <- FALSE
  on.exit(if (!finished) unlink(destfile), add = TRUE)

  status <- download.file(
    glue::glue("https://github.com/guga31bb/nflfastR-data/blob/master/data/play_by_play_{year}.parquet?raw=true"),
    destfile,
    mode = "wb" # parquet is a binary format; "wb" avoids corruption on Windows
  )
  if (!identical(as.integer(status), 0L)) {
    stop(
      "Download failed for season ", year, " (status ", status, ")",
      call. = FALSE
    )
  }

  finished <- TRUE
  invisible(status)
}
# fetch every season from 1999 through 2019, one partition folder per season
walk(.x = seq(1999L, 2019L), .f = get_data)
# open the directory as an arrow Dataset; the sub-folder names become `year`
ds <- open_dataset(sources = "nflfastr", partitioning = "year")
# Timed query: run/pass plays without a penalty, 1999 and 2019 partitions only
tic()
ds %>%
  select(year, play_type, yards_gained, epa, penalty, season) %>%
  # restrict to the two partitions of interest
  filter(year %in% c(1999, 2019)) %>%
  # keep un-penalised runs and passes
  filter(play_type %in% c("run", "pass")) %>%
  filter(penalty == 0) %>%
  # pull the filtered rows into memory before aggregating with dplyr
  collect() %>%
  group_by(season, play_type) %>%
  summarise(
    avg_yds = mean(yards_gained, na.rm = TRUE),
    avg_epa = mean(epa, na.rm = TRUE),
    n = n()
  )
toc()
# 0.094 sec elapsed
# # A tibble: 4 x 5
# # Groups: season [2]
# season play_type avg_yds avg_epa n
# <int> <chr> <dbl> <dbl> <int>
# 1 1999 pass 5.84 -0.0214 18476
# 2 1999 run 3.99 -0.0986 13658
# 3 2019 pass 6.27 0.0232 19616
# 4 2019 run 4.47 -0.0494 13344
# Timed query: run/pass plays without a penalty, across every partition
tic()
ds %>%
  select(year, play_type, yards_gained, epa, penalty, season) %>%
  # keep un-penalised runs and passes
  filter(play_type %in% c("run", "pass")) %>%
  filter(penalty == 0) %>%
  # pull the filtered rows into memory before aggregating with dplyr
  collect() %>%
  group_by(season, play_type) %>%
  summarise(
    avg_yds = mean(yards_gained, na.rm = TRUE),
    avg_epa = mean(epa, na.rm = TRUE),
    n = n()
  )
toc()
# 0.492 sec elapsed
# # A tibble: 42 x 5
# # Groups: season [21]
# season play_type avg_yds avg_epa n
# <int> <chr> <dbl> <dbl> <int>
# 1 1999 pass 5.84 -0.0214 18476
# 2 1999 run 3.99 -0.0986 13658
# 3 2000 pass 5.80 -0.0159 17974
# 4 2000 run 4.03 -0.0749 13988
# 5 2001 pass 5.84 -0.0153 17988
# 6 2001 run 4.03 -0.0834 14026
# 7 2002 pass 5.85 -0.00606 19125
# 8 2002 run 4.25 -0.0470 14243
# 9 2003 pass 5.82 -0.0322 18089
# 10 2003 run 4.24 -0.0513 14620
# # … with 32 more rows
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment