Skip to content

Instantly share code, notes, and snippets.

@Ryo-N7
Last active February 14, 2022 15:49
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Ryo-N7/5309ba2496b4f75a0747166bfbc52270 to your computer and use it in GitHub Desktop.
Save Ryo-N7/5309ba2496b4f75a0747166bfbc52270 to your computer and use it in GitHub Desktop.
Liverpool FC's Age-Utility Graph
## Packages
```{r}
library(rvest)
library(polite) # devtools::install_github("dmi3kno/polite")
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(purrr)
library(stringr)
library(ggrepel)
library(glue)
library(extrafont)
#loadfonts()
```
## Webscrape transfermarkt.com with rvest + polite
```{r}
# first time using "polite" package for responsible web scraping!
session <- bow("https://www.transfermarkt.com/liverpool-fc/leistungsdaten/verein/31/reldata/GB1%262017/plus/1")
print(session)
# "The path is scrapable for this user-agent": OK, looks like we are good to go!
# scraping tranfermarkt is a nightmare...
# scrape each col individually then combine later...
# grab name from photo element instead
result_name <- scrape(session) %>%
html_nodes("#yw1 .bilderrahmen-fixed") %>%
html_attr("title")
# grab age
result_age <- scrape(session) %>%
html_nodes(".posrela+ .zentriert") %>%
html_text()
# grab minutes played in league
result_mins <- scrape(session) %>%
html_nodes("td.rechts") %>%
html_text()
```
## Tidy data
- NOTE: changed y-axis to CURRENT AGE instead so it'll be more up-to-date, code below + plot code reflects these changes
```{r}
# place each vector into list
resultados <- list(result_name, result_age, result_mins)
col_name <- c("name", "age", "minutes")
# then reduce(cbind) to combine them, set names to cols
resultados %>%
reduce(cbind) %>%
as_tibble() %>%
set_names(col_name) -> results_comb
# NOICE.gif
glimpse(results_comb)
# players who have had 2 birthdays since beginning of 17/18 season
age_plus_one <- c("Lovren", "Van Dijk", "Moreno", "Ings")
# fix "strings" into proper formats, calculate % of minutes appeared
lfc_minutes <- results_comb %>%
mutate(age = as.numeric(age),
minutes = minutes %>%
str_replace("\\.", "") %>%
str_replace("'", "") %>%
as.numeric(),
min_perc = (minutes / 3420) %>% round(digits = 3)) %>%
filter(!is.na(minutes)) %>%
separate(name, into = c("first_name", "last_name"), by = " ") %>%
# manually fix some names
mutate(
last_name = case_when(
first_name == "Trent" ~ "Alexander-Arnold",
first_name == "Virgil" ~ "Van Dijk",
first_name == "Alex" ~ "Oxlade-Chamberlain",
TRUE ~ last_name),
age = age + 1) %>% # do CURRENT age instead for plot 2.0
# add an additional year to age for specific players
mutate(
age = case_when(
last_name %in% age_plus_one ~ age + 1,
TRUE ~ age)
) %>%
# can't be arsed to scrape them individually so manually add the new lads
add_row(
first_name = " ",
last_name = "Alisson",
age = 25,
minutes = 3330,
min_perc = 0.974
) %>%
add_row(
first_name = " ",
last_name = "Fabinho",
age = 24,
minutes = 3060,
min_perc = 0.895
) %>%
add_row(
first_name = "Naby",
last_name = "Keita",
age = 23,
minutes = 1966,
min_perc = 0.642
) %>%
add_row(
first_name = "Xherdan",
last_name = "Shaqiri",
age = 26,
minutes = 3049,
min_perc = 0.892
) %>%
# create identifier for new vs. old player for labelling purposes
mutate(
new_player = case_when(
last_name %in% c("Alisson", "Fabinho", "Keita", "Shaqiri") ~ TRUE,
TRUE ~ FALSE)) %>%
arrange(desc(min_perc))
# rectanglular highlight for players in their prime:
rect_df <- data.frame(
xmin = 24, xmax = 30,
ymin = -Inf, ymax = Inf
)
```
## PLOT
```{r fig.height=6, fig.width=8}
lfc_minutes %>%
ggplot(aes(x = age, y = min_perc)) +
geom_rect(
data = rect_df, inherit.aes = FALSE,
aes(xmin = xmin, xmax = xmax,
ymin = ymin, ymax = ymax),
alpha = 0.3,
fill = "firebrick1") +
geom_point(color = "red", size = 2.5) +
geom_text_repel(
data = lfc_minutes %>% filter(!new_player == TRUE),
aes(label = last_name, family = "Roboto Condensed"),
nudge_x = 0.5,
seed = 6) +
geom_text_repel(
data = lfc_minutes %>% filter(new_player == TRUE),
aes(label = last_name, family = "Roboto Condensed", fontface = "bold"),
size = 4, nudge_x = 0.5, nudge_y = 0.02,
seed = 8) +
scale_y_continuous(
expand = c(0.01, 0),
limits = c(0, 1),
labels = percent_format()) +
scale_x_continuous(
breaks = pretty_breaks(n = 10)) +
labs(
x = "Current Age (As of Aug. 5th, 2018)", y = "% of Minutes Played",
title = "Liverpool FC: Age-Utility Matrix",
subtitle = "Premier League 17/18 (Summer 2018 transfers in bold, departed players left in for comparison)",
caption = "Data from transfermarkt.com\nInspired by @FinerMargins\nBy @R_by_Ryo") +
theme_bw() +
theme(
text = element_text(family = "Roboto Condensed"),
panel.grid.minor.y = element_blank()) +
geom_label(
aes(x = 20.5, y = 0.87,
hjust = 0.5,
label = glue("
Encouraging to see Liverpool buying players both in
their prime and regulars in their previous teams.
Our entire best 'Starting XI' are going to be
in their prime this season!
"),
family = "Roboto Condensed"),
size = 3.5)
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment