Skip to content

Instantly share code, notes, and snippets.

@tonyelhabr
Last active July 21, 2023 19:02
Show Gist options
  • Save tonyelhabr/b058e49f0d8d6a0005f1ae982df8f8c4 to your computer and use it in GitHub Desktop.
Save tonyelhabr/b058e49f0d8d6a0005f1ae982df8f8c4 to your computer and use it in GitHub Desktop.
Scrape player stats for the 2023 FIFA Women's World Cup
library(httr)
library(tibble)
library(tidyr)
library(dplyr)
library(purrr)
library(janitor)
library(cli)
library(stringr)
library(readr)

data_dir <- '/path/to/data'
match_stats_dir <- file.path(data_dir, 'match-stats')
live_match_elements_dir <- file.path(data_dir, 'live-match-elements')
dir.create(match_stats_dir, showWarnings = FALSE, recursive = TRUE)
dir.create(live_match_elements_dir, showWarnings = FALSE)

season_id <- 285026

Scrape match info.

matches_resp <- GET(sprintf('https://api.fifa.com/api/v3/calendar/matches?language=en&count=500&idSeason=%s', season_id))
results <- content(matches_resp) |> pluck('Results')

## for incomplete matches (anything beyond the group stage at the moment), there will be `NULL`s
##   which causes `pluck()` to throw an error. using a `.default` of `NA` fixes the issue.
pluck2_chr <- partial(pluck, .default = NA_character_, ... = )
pluck2_int <- partial(pluck, .default = NA_integer_, ... = )

map_pluck_chr <- function(x, ...) {
  map_chr(x, pluck2_chr, ...)
}

map_pluck_int <- function(x, ...) {
  map_int(x, pluck2_int, ...)
}

matches <- tibble(
  competition_id = map_pluck_chr(results, 'IdCompetition'),
  season_id = map_pluck_chr(results, 'IdSeason'),
  stage_id = map_pluck_chr(results, 'IdStage'),
  group_id = map_pluck_chr(results, 'IdGroup'),
  ## this won't join with the match stats, but it seems to be Fifa's "true" match ID
  match_id = map_pluck_chr(results, 'IdMatch'),
  match_status = map_pluck_int(results, 'MatchStatus'),
  ## use this to join with the match stats
  result_id = map_pluck_chr(results, 'Properties', 'IdIFES'),
  home_abbr = map_pluck_chr(results, 'Home', 'Abbreviation'),
  away_abbr = map_pluck_chr(results, 'Away', 'Abbreviation')
) |> 
  filter(match_status == 0L) |> 
  select(-match_status)
matches
# A tibble: 2 × 8
  competition_id season_id stage_id group_id match_id  result_id home_…¹ away_…²
  <chr>          <chr>     <chr>    <chr>    <chr>     <chr>     <chr>   <chr>  
1 103            285026    285033   285037   400222852 131872    NZL     NOR    
2 103            285026    285033   285036   400222851 131878    AUS     IRL    
# … with abbreviated variable names ¹​home_abbr, ²​away_abbr

Scrape stats for players in matches.

scrape_match_stats <- function(result_id) {
  stats_resp <- GET(sprintf('https://fdh-api.fifa.com/v1/stats/match/%s/players.json', result_id))
  stop_for_status(stats_resp)
  stats_resp |> 
    content() |> 
    enframe('player_id', 'values') |> 
    unnest_longer(values)
}

scrape_and_save_match_stats <- function(result_id) {
  path <- file.path(match_stats_dir, paste0(result_id, '.rds'))
  if (file.exists(path)) {
    return(read_rds(path))
  }
  cli_inform('Scraping {result_id}.')
  res <- scrape_match_stats(result_id)
  write_rds(res, path)
  res
}

match_stats <- matches |> 
  pull(result_id) |> 
  map_dfr(
    ~{
      scrape_and_save_match_stats(.x) |> 
        mutate(result_id = !!.x, .before = 1)
    }
  )
match_stats
# A tibble: 5,155 × 3
   result_id player_id values    
   <chr>     <chr>     <list>    
 1 131872    467661    <list [3]>
 2 131872    467661    <list [3]>
 3 131872    467661    <list [3]>
 4 131872    467661    <list [3]>
 5 131872    467661    <list [3]>
 6 131872    467661    <list [3]>
 7 131872    467661    <list [3]>
 8 131872    467661    <list [3]>
 9 131872    467661    <list [3]>
10 131872    467661    <list [3]>
# … with 5,145 more rows
# ℹ Use `print(n = ...)` to see more rows

Get info for players (warning: this is obnoxious)

generate_live_match_url <- function(competition_id, season_id, stage_id, match_id) {
  sprintf(
    'https://api.fifa.com/api/v3/live/football/%s/%s/%s/%s?language=en',
    competition_id,
    season_id,
    stage_id,
    match_id
  )
}

scrape_live_match_elements <- function(url) {
  resp <- GET(url)
  elements <- content(resp) |> 
    enframe('element', 'values')
}

scrape_and_save_live_match_elements <- function(url, result_id, overwrite = FALSE) {
  path <- file.path(live_match_elements_dir, paste0(result_id, '.rds'))
  if (file.exists(path) & isFALSE(overwrite)) {
    return(read_rds(path))
  }
  cli_inform('Scraping {result_id}.')
  res <- scrape_live_match_elements(url)
  write_rds(res, path)
  res
}

live_match_elements <- matches |> 
  mutate(
    live_match_url = generate_live_match_url(
      competition_id = competition_id,
      season_id = season_id,
      stage_id = stage_id,
      match_id = match_id
    )
  ) |> 
  pull(live_match_url, result_id) |> 
  imap_dfr(
    ~{
      scrape_and_save_live_match_elements(
        url = ..1,
        result_id = ..2
      ) |> 
        mutate(
          result_id = !!..2,
          .before = 1
        )
    }
  )

live_match_teams <- live_match_elements |> 
  filter(
    element %in% c(
      'HomeTeam',
      'AwayTeam'
    )
  ) |> 
  unnest_wider(values)

players <- live_match_teams |> 
  transmute(
    country = ShortClubName, 
    country_picture_url = str_replace_all(PictureUrl, c('\\{format\\}' = 'sq', '\\{size\\}' = '4')), 
    Players
  ) |> 
  unnest_longer(Players) |> 
  unnest_wider(Players) |> 
  unnest_wider(where(is.list), names_sep = '_') |> 
  unnest_wider(where(is.list), names_sep = '_') |> 
  distinct(
    player_id = IdPlayer,
    player_name = PlayerName_1_Description,
    player_picture_url = PlayerPicture_PictureUrl,
    country,
    country_picture_url
  )
players
# A tibble: 92 × 5
   player_id player_name      player_picture_url                 country count…¹
   <chr>     <chr>            <chr>                              <chr>   <chr>  
 1 301468    Victoria ESSON   https://digitalhub.fifa.com/trans… New Ze… https:…
 2 358189    C.J. BOTT        https://digitalhub.fifa.com/trans… New Ze… https:…
 3 252502    Ali RILEY        https://digitalhub.fifa.com/trans… New Ze… https:…
 4 355896    Rebekah STOTT    https://digitalhub.fifa.com/trans… New Ze… https:…
 5 301461    Katie BOWEN      https://digitalhub.fifa.com/trans… New Ze… https:…
 6 252519    Ria PERCIVAL     https://digitalhub.fifa.com/trans… New Ze… https:…
 7 397010    Malia STEINMETZ  https://digitalhub.fifa.com/trans… New Ze… https:…
 8 298792    Betsy HASSETT    https://digitalhub.fifa.com/trans… New Ze… https:…
 9 395309    Jacqui HAND      https://digitalhub.fifa.com/trans… New Ze… https:…
10 321169    Hannah WILKINSON https://digitalhub.fifa.com/trans… New Ze… https:…
# … with 82 more rows, and abbreviated variable name ¹​country_picture_url
# ℹ Use `print(n = ...)` to see more rows

Combine everything

unnested_match_stats <- match_stats |> 
  hoist(
    values,
    'stat' = 1,
    'value' = 2
  ) |> 
  select(-values) 

player_stats <- unnested_match_stats |> 
  group_by(result_id, player_id, stat) |> 
  slice_max(value, n = 1, with_ties = FALSE) |> 
  ungroup() |> 
  pivot_wider(
    names_from = stat, 
    values_from = value,
    values_fill = 0
  ) |> 
  clean_names()
player_stats
Rows: 92
Columns: 89
$ result_id                                          <chr> "131872", "131872",…
$ player_id                                          <chr> "252502", "252503",…
$ assists                                            <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempt_at_goal                                    <dbl> 0, 0, 2, 0, 0, 0, 0…
$ attempt_at_goal_blocked                            <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempt_at_goal_from_free_kicks                    <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempt_at_goal_inside_the_penalty_area            <dbl> 0, 0, 1, 0, 0, 0, 0…
$ attempt_at_goal_inside_the_penalty_area_on_target  <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempt_at_goal_off_target                         <dbl> 0, 0, 2, 0, 0, 0, 0…
$ attempt_at_goal_on_target                          <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempt_at_goal_outside_the_penalty_area           <dbl> 0, 0, 1, 0, 0, 0, 0…
$ attempt_at_goal_outside_the_penalty_area_on_target <dbl> 0, 0, 0, 0, 0, 0, 0…
$ attempted_ball_progressions                        <dbl> 2, 0, 0, 1, 0, 5, 2…
$ attempted_switches_of_play                         <dbl> 0, 0, 0, 0, 0, 0, 0…
$ avg_speed                                          <dbl> 5.99, 0.00, 6.43, 5…
$ completed_ball_progressions                        <dbl> 2, 0, 0, 0, 0, 4, 2…
$ completed_switches_of_play                         <dbl> 0, 0, 0, 0, 0, 0, 0…
$ corners                                            <dbl> 0, 0, 8, 0, 0, 0, 0…
$ crosses                                            <dbl> 1, 0, 9, 0, 0, 2, 0…
$ crosses_completed                                  <dbl> 0, 0, 1, 0, 0, 0, 0…
$ defensive_pressures_applied                        <dbl> 21, 0, 16, 10, 0, 3…
$ direct_defensive_pressures_applied                 <dbl> 9, 0, 8, 3, 0, 9, 0…
$ direct_free_kicks                                  <dbl> 0, 0, 0, 0, 0, 0, 4…
$ distance_high_speed_running                        <dbl> 2172.3336, 0.0000, …
$ distance_high_speed_sprinting                      <dbl> 220.84722, 0.00000,…
$ distance_jogging                                   <dbl> 3625.9098, 0.0000, …
$ distance_low_speed_sprinting                       <dbl> 507.948606, 0.00000…
$ distance_walking                                   <dbl> 3511.4708, 0.0000, …
$ distributions_completed_under_pressure             <dbl> 9, 0, 12, 4, 0, 17,…
$ distributions_under_pressure                       <dbl> 14, 0, 16, 8, 0, 30…
$ fouls_against                                      <dbl> 0, 0, 1, 1, 0, 0, 0…
$ fouls_for                                          <dbl> 0, 0, 2, 0, 0, 0, 0…
$ free_kicks                                         <dbl> 0, 0, 0, 0, 0, 0, 4…
$ goal_kicks                                         <dbl> 0, 0, 0, 0, 0, 0, 1…
$ goalkeeper_defensive_actions_inside_penalty_area   <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goalkeeper_defensive_actions_outside_penalty_area  <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goalkeeper_goal_preventions                        <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goals                                              <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goals_conceded_from_attempt_at_goal_against        <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goals_from_direct_free_kicks                       <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goals_inside_the_penalty_area                      <dbl> 0, 0, 0, 0, 0, 0, 0…
$ goals_outside_the_penalty_area                     <dbl> 0, 0, 0, 0, 0, 0, 0…
$ headed_attempt_at_goal                             <dbl> 0, 0, 0, 0, 0, 0, 0…
$ indirect_free_kicks                                <dbl> 0, 0, 0, 0, 0, 0, 0…
$ linebreaks_attempted                               <dbl> 17, 0, 14, 14, 0, 1…
$ linebreaks_attempted_all_lines                     <dbl> 1, 0, 1, 0, 0, 3, 0…
$ linebreaks_attempted_attacking_and_midfield_line   <dbl> 3, 0, 1, 5, 0, 3, 1…
$ linebreaks_attempted_attacking_line                <dbl> 5, 0, 2, 7, 0, 3, 1…
$ linebreaks_attempted_attacking_line_completed      <dbl> 3, 0, 1, 3, 0, 1, 7…
$ linebreaks_attempted_completed                     <dbl> 9, 0, 10, 6, 0, 12,…
$ linebreaks_attempted_defensive_line                <dbl> 1, 0, 2, 0, 0, 5, 0…
$ linebreaks_attempted_defensive_line_completed      <dbl> 0, 0, 1, 0, 0, 3, 0…
$ linebreaks_attempted_midfield_and_defensive_line   <dbl> 1, 0, 1, 0, 0, 3, 0…
$ linebreaks_attempted_midfield_line                 <dbl> 11, 0, 10, 7, 0, 11…
$ linebreaks_attempted_midfield_line_completed       <dbl> 6, 0, 8, 3, 0, 8, 5…
$ linebreaks_attempted_under_pressure                <dbl> 3, 0, 12, 2, 0, 16,…
$ linebreaks_completed_all_lines                     <dbl> 0, 0, 1, 0, 0, 1, 0…
$ linebreaks_completed_attacking_and_midfield_line   <dbl> 3, 0, 1, 5, 0, 3, 1…
$ linebreaks_completed_midfield_and_defensive_line   <dbl> 0, 0, 1, 0, 0, 1, 0…
$ linebreaks_completed_under_pressure                <dbl> 1, 0, 10, 0, 0, 9, …
$ offers_to_receive_in_behind                        <dbl> 1, 0, 9, 0, 0, 6, 2…
$ offers_to_receive_in_between                       <dbl> 7, 0, 3, 0, 0, 23, …
$ offers_to_receive_in_front                         <dbl> 12, 0, 4, 4, 0, 2, …
$ offers_to_receive_inside                           <dbl> 1, 0, 9, 2, 0, 14, …
$ offers_to_receive_outside                          <dbl> 19, 0, 7, 2, 0, 17,…
$ offers_to_receive_total                            <dbl> 20, 0, 16, 4, 0, 31…
$ offsides                                           <dbl> 0, 0, 0, 0, 0, 1, 0…
$ own_goals                                          <dbl> 0, 0, 0, 0, 0, 0, 0…
$ passes                                             <dbl> 37, 0, 22, 27, 0, 3…
$ passes_completed                                   <dbl> 29, 0, 18, 21, 0, 2…
$ penalties                                          <dbl> 0, 0, 1, 0, 0, 0, 0…
$ penalties_scored                                   <dbl> 0, 0, 0, 0, 0, 0, 0…
$ received_offers_to_receive                         <dbl> 9, 0, 4, 3, 0, 9, 7…
$ receptions_between_midfield_and_defensive_line     <dbl> 3, 0, 8, 1, 0, 13, …
$ receptions_under_direct_pressure                   <dbl> 0, 0, 3, 1, 0, 4, 0…
$ receptions_under_indirect_pressure                 <dbl> 6, 0, 12, 6, 0, 12,…
$ receptions_under_no_pressure                       <dbl> 21, 0, 12, 22, 0, 2…
$ receptions_under_pressure                          <dbl> 6, 0, 15, 7, 0, 16,…
$ red_cards                                          <dbl> 0, 0, 0, 0, 0, 0, 0…
$ speed_runs                                         <dbl> 165, 0, 183, 149, 0…
$ sprints                                            <dbl> 41, 0, 40, 32, 0, 6…
$ substitutions_in                                   <dbl> 0, 0, 0, 0, 0, 0, 0…
$ substitutions_out                                  <dbl> 0, 0, 0, 0, 0, 0, 0…
$ take_ons_completed                                 <dbl> 0, 0, 0, 0, 0, 0, 0…
$ throw_ins                                          <dbl> 14, 0, 0, 0, 0, 0, …
$ time_played                                        <dbl> 100, 0, 100, 100, 0…
$ top_speed                                          <dbl> 28.77, 0.00, 27.15,…
$ total_distance                                     <dbl> 10038.51, 0.00, 107…
$ yellow_cards                                       <dbl> 0, 0, 0, 0, 0, 0, 0…

Bonus: per 90 stats

player_stats_p90 <- player_stats |> 
  group_by(player_id) |> 
  summarize(
    n_matches = n_distinct(result_id),
    across(
      -c(result_id),
      sum
    )
  ) |> 
  ungroup() |> 
  mutate(
    across(
      -c(player_id, time_played),
      list(p90 = \(.x) 90 * .x / time_played)
    )
  ) |> 
  left_join(
    players, 
    by = join_by(player_id)
  )
@tanho63
Copy link

tanho63 commented Jul 21, 2023

nerdsniped by you mentioning "warning: this is obnoxious"

library(httr)
library(tibble)
library(tidyr)
library(purrr)
library(dplyr)

get_teams_in_competition <- function(season_id = 285026){
  teams <- glue::glue("https://api.fifa.com/api/v3/competitions/teams/{season_id}?language=en") |> 
    httr::RETRY(verb = "GET") |> 
    httr::content(as = "parsed") |> 
    getElement("Results") |> 
    tibble::tibble() |> 
    tidyr::unnest_wider(1)
  
  return(teams)
}

parse_localized_desc <- \(x) purrr::map_chr(x, ~purrr::pluck(.x, 1, "Description", .default = NA))

get_roster <- function(team_id, competition_id = 103, season_id = 285026){
  resp <- glue::glue('https://api.fifa.com/api/v3/teams/{team_id}/squad?idCompetition={competition_id}&idSeason={season_id}&language=en') |> 
    httr::RETRY(verb = "GET") |> 
    httr::content()
  
  players <- resp$Players |> 
    tibble::tibble() |> 
    tidyr::unnest_wider(1) |> 
    dplyr::mutate(
      dplyr::across(
        dplyr::any_of(
        c("PlayerName", 
          "ShortName", 
          "PositionLocalized", 
          "RealPositionLocalized", 
          "RealPositionSideLocalized")
        ),
        parse_localized_desc
      ),
      PlayerPicture = purrr::map_chr(PlayerPicture, \(.x) purrr::pluck(.x, "PictureUrl", .default = NA)),
      PlayerPicture = glue::glue(PlayerPicture, format = 'sq', size = 4),
      Properties = NULL
    )
  
  # can add code to get coaches
  
  return(players)
}

teams <- get_teams_in_competition(285026)

rosters <- purrr::map_dfr(teams$IdTeam, get_roster)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment