@mrcaseb
Created November 29, 2023 09:25
How to bring Statsbomb tracking data into a tidy form
## HOW TO USE THIS ##
# Go to https://github.com/statsbomb/amf-open-data/tree/main#getting-started
# and download the zipped json files containing the individual seasons, available via AWS S3
#
# Alternatively click the below links directly
# https://statsbomb-amf-open-data.s3.eu-west-2.amazonaws.com/tracking/SB_tracking_TB12DB_2021.zip
# https://statsbomb-amf-open-data.s3.eu-west-2.amazonaws.com/tracking/SB_tracking_TB12DB_2022.zip
#
# Unzip those files to a local directory. That directory will then contain deeply nested
# JSON files. Loop over those files and pass their paths to the function below. The function
# parses the nested JSON and either returns a tidy data frame into memory (don't do this) or
# writes it to disk as a compressed csv (please do this)
#
# Let's say you have all the statsbomb json files saved in the folder "data-raw/raw_sb"
# and you want parsed csv files in the folder "data-raw/stats-bomb-parsed",
# then you could run something like
# purrr::walk(
#   list.files("data-raw/raw_sb/", pattern = "\\.json$", full.names = TRUE, recursive = TRUE),
#   parse_statsbomb_tracking,
#   write_to = "data-raw/stats-bomb-parsed"
# )
#
# NB: An average machine needs 25-30 seconds to process one game. The 2021 and 2022
# data consist of 19 + 18 = 37 games, so the full run can easily take 20 minutes!
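#
# The steps above as a runnable sketch. This is only one way to do the download/unzip
# step; the folder names simply follow the example above and are not required by the
# function. Wrapped in `if (FALSE)` so that sourcing this file never triggers the download.
if (FALSE) {
  urls <- c(
    "https://statsbomb-amf-open-data.s3.eu-west-2.amazonaws.com/tracking/SB_tracking_TB12DB_2021.zip",
    "https://statsbomb-amf-open-data.s3.eu-west-2.amazonaws.com/tracking/SB_tracking_TB12DB_2022.zip"
  )
  dir.create("data-raw/raw_sb", recursive = TRUE, showWarnings = FALSE)
  dir.create("data-raw/stats-bomb-parsed", recursive = TRUE, showWarnings = FALSE)

  # Download the season zips and unzip them next to each other
  for (url in urls) {
    zip_file <- file.path("data-raw/raw_sb", basename(url))
    download.file(url, zip_file, mode = "wb")
    unzip(zip_file, exdir = "data-raw/raw_sb")
  }

  # Collect the (possibly nested) json files and parse every game to csv.gz
  json_files <- list.files(
    "data-raw/raw_sb",
    pattern = "\\.json$",
    full.names = TRUE,
    recursive = TRUE
  )
  purrr::walk(json_files, parse_statsbomb_tracking, write_to = "data-raw/stats-bomb-parsed")
}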
#' @export
parse_statsbomb_tracking <- function(path_to_json,
                                     write_to = "data-raw/stats-bomb-tracking") {
  stopifnot(
    "Can only parse one game at a time" = length(path_to_json) == 1,
    "write_to must be a valid local directory or NULL" = is.null(write_to) || dir.exists(write_to)
  )
  g <- basename(path_to_json) |> tools::file_path_sans_ext()
  cli::cli_progress_step(
    msg = "Processing {.pkg {g}}. Please keep cool, this may take a while...",
    msg_done = "Processed {.pkg {g}}"
  )
  raw <- RJSONIO::fromJSON(path_to_json, simplify = FALSE)
  game_info <- purrr::discard(raw, is.list) |>
    tibble::as_tibble() |>
    dplyr::mutate(
      home = tibble::as_tibble(raw$home_team),
      away = tibble::as_tibble(raw$away_team)
    ) |>
    tidyr::unpack(c("home", "away"), names_sep = "_")
  df <- tibble::tibble(r = raw$plays) |>
    tidyr::unnest_wider(r) |>
    # One row per play per player on the field, i.e. number of plays times 22 rows
    tidyr::unnest_longer(tracks) |>
    # Add track info with 8 additional columns
    tidyr::unnest_wider(tracks, names_sep = "_") |>
    # Add player info with 5 additional columns
    tidyr::unnest_wider(tracks_player) |>
    # Now extend by the frames of each player.
    # The number of frames varies per play AND per player
    tidyr::unnest_longer(tracks_steps) |>
    # The tracking data is now again in a list that needs to be expanded
    tidyr::unnest_wider(tracks_steps)
  # Bind with game info and finish
  out <- dplyr::bind_cols(game_info, df)
  rm(raw, game_info, df)
  if (!is.null(write_to) && dir.exists(write_to)) {
    game_id <- out$nfl_game_id[[1]]
    save_path <- file.path(write_to, paste0(game_id, ".csv.gz"))
    data.table::fwrite(out, save_path)
  } else {
    return(out)
  }
}
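
# Hedged sketch (not part of the original gist): read the parsed games back in and
# combine them into one big data frame. Assumes the output folder used in the example
# above. Note that data.table::fread needs the R.utils package installed to read
# ".csv.gz" files directly.
if (FALSE) {
  parsed_files <- list.files(
    "data-raw/stats-bomb-parsed",
    pattern = "\\.csv\\.gz$",
    full.names = TRUE
  )
  tracking <- data.table::rbindlist(lapply(parsed_files, data.table::fread))
}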