-
-
Save Torvaney/42cd82addb3ba2c4f33ec3247e66889c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
title: "Scrape understat"
---
```{r setup}
# Packages attached for the whole document.
library(tidyverse)
library(understatr)

# Turn off dplyr's informational messages about summarise() grouping.
options(dplyr.summarise.inform = FALSE)

# League metadata from understatr; later chunks use its `year`, `league_name`
# and `url` columns. Printed here for inspection.
leagues_meta <- get_leagues_meta()
leagues_meta
```
The R package [understatr](https://github.com/ewenme/understatr) provides a handy function,
`get_match_shots`. To use it, however, we first need a list of match IDs.
So let's create a `fetch_matches` function that takes a league's page URL (as provided in `leagues_meta`)
and returns the matches for that league and season.
```{r, message=FALSE}
library(rvest)

extract_json <- function(html, varname) {
  # Extract a JavaScript variable that the page defines via
  # `JSON.parse('<escaped JSON>')` and return it as parsed by jsonlite.
  #
  # html:    a parsed HTML document (e.g. the result of `read_html()`).
  # varname: name of the JavaScript variable to look for; matched literally,
  #          not as a regular expression.
  #
  # Returns whatever `jsonlite::fromJSON()` produces for the payload.
  # Stops with an explicit error unless exactly one payload is found, instead
  # of handing an empty, missing or multi-element string to the JSON parser.
  payloads <-
    html %>%
    html_nodes("script") %>%
    html_text(trim = TRUE) %>%
    # Only scripts that mention the variable are candidates.
    keep(str_detect, fixed(varname)) %>%
    # The dot in `JSON.parse` is escaped so it only matches a literal dot.
    str_extract("(?<=JSON\\.parse\\(').*(?='\\))") %>%
    # A script can mention the variable without defining it; skip those.
    discard(is.na)

  if (length(payloads) != 1) {
    stop(
      sprintf(
        "Expected exactly one JSON.parse() payload for '%s', found %d.",
        varname,
        length(payloads)
      ),
      call. = FALSE
    )
  }

  payloads %>%
    # The payload is an escaped string literal; decode it before parsing.
    stringi::stri_unescape_unicode() %>%
    jsonlite::fromJSON()
}
unnest_df <- function(df, name) {
  # Turn one (possibly nested) column into a flat tibble with readable names.
  #
  # df:   the column's value; either a data frame or any other object that can
  #       be stored as a tibble column.
  # name: the column's name.
  #
  # A data frame comes back as a tibble whose columns are renamed to
  # "<name>_<original column>"; anything else comes back as a one-column
  # tibble whose single column is called `name`.
  if (is.data.frame(df)) {
    prefixed <- as_tibble(df)
    names(prefixed) <- str_c(name, "_", names(prefixed))
    prefixed
  } else {
    tibble(!!name := df)
  }
}
fetch_matches <- function(league_url) {
  # Download a league/season page and return its matches as a data frame.
  #
  # league_url: URL of the league page to read.
  #
  # The page's `datesData` variable is parsed, each of its columns is flattened
  # with `unnest_df()` and the pieces are bound side by side. The `id` column
  # is renamed to `match_id`, and `type_convert()` then re-guesses the types
  # of the character columns.
  page <- read_html(league_url)
  dates_data <- extract_json(page, "datesData")
  flattened <- imap_dfc(dates_data, unnest_df)

  type_convert(rename(flattened, match_id = id))
}
# Example: fetch the matches listed on the EPL 2019 league page.
"https://understat.com/league/EPL/2019" %>%
  fetch_matches()
```
```{r, message=FALSE}
# One row per match: take the 2019 entries of `leagues_meta`, leave out
# "Ligue 1" and "RFPL", fetch each remaining league page (throttled with
# slowly()) and expand the nested match tables into rows.
matches <-
  leagues_meta %>%
  filter(year == 2019) %>%
  filter(!(league_name %in% c("Ligue 1", "RFPL"))) %>%
  mutate(matches = url %>% map(slowly(fetch_matches))) %>%
  unnest(matches)

matches
```
```{r, message=FALSE}
# Download shot-level data for every match dated before today.
shots <-
  matches %>%
  # Compare dates with dates. `datetime` may have been parsed as a date-time
  # by type_convert(), and base R does not compare POSIXct with Date
  # meaningfully (it warns about incompatible methods and compares the raw
  # numbers). Converting to Date first works for both date-time and character
  # columns.
  filter(as.Date(datetime) < lubridate::today()) %>%
  pull(match_id) %>%
  # slowly() throttles the requests; possibly() substitutes an empty tibble
  # for a match that fails, so one bad page does not abort the whole scrape.
  map_dfr(slowly(possibly(understatr::get_match_shots, tibble())))

# Make sure the output directory exists so the scrape is not lost to a
# missing folder at write time.
output_path <- here::here("data", "understat-shots.csv")
dir.create(dirname(output_path), showWarnings = FALSE, recursive = TRUE)

# Attach the match-level columns to every shot and save the result.
shots %>%
  left_join(matches, by = "match_id") %>%
  write_csv(output_path)
```
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment