Last active
February 14, 2022 15:49
-
-
Save Ryo-N7/5309ba2496b4f75a0747166bfbc52270 to your computer and use it in GitHub Desktop.
Liverpool FC's Age-Utility Graph
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Packages | |
```{r} | |
library(rvest) | |
library(polite) # devtools::install_github("dmi3kno/polite") | |
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(scales) | |
library(purrr) | |
library(stringr) | |
library(ggrepel) | |
library(glue) | |
library(extrafont) | |
#loadfonts() | |
``` | |
## Webscrape transfermarkt.com with rvest + polite | |
```{r} | |
# first time using "polite" package for responsible web scraping! | |
session <- bow("https://www.transfermarkt.com/liverpool-fc/leistungsdaten/verein/31/reldata/GB1%262017/plus/1") | |
print(session) | |
# "The path is scrapable for this user-agent": OK, looks like we are good to go! | |
# scraping tranfermarkt is a nightmare... | |
# scrape each col individually then combine later... | |
# grab name from photo element instead | |
result_name <- scrape(session) %>% | |
html_nodes("#yw1 .bilderrahmen-fixed") %>% | |
html_attr("title") | |
# grab age | |
result_age <- scrape(session) %>% | |
html_nodes(".posrela+ .zentriert") %>% | |
html_text() | |
# grab minutes played in league | |
result_mins <- scrape(session) %>% | |
html_nodes("td.rechts") %>% | |
html_text() | |
``` | |
## Tidy data | |
- NOTE: changed y-axis to CURRENT AGE instead so it'll be more up-to-date, code below + plot code reflects these changes | |
```{r} | |
# place each vector into list | |
resultados <- list(result_name, result_age, result_mins) | |
col_name <- c("name", "age", "minutes") | |
# then reduce(cbind) to combine them, set names to cols | |
resultados %>% | |
reduce(cbind) %>% | |
as_tibble() %>% | |
set_names(col_name) -> results_comb | |
# NOICE.gif | |
glimpse(results_comb) | |
# players who have had 2 birthdays since beginning of 17/18 season | |
age_plus_one <- c("Lovren", "Van Dijk", "Moreno", "Ings") | |
# fix "strings" into proper formats, calculate % of minutes appeared | |
lfc_minutes <- results_comb %>% | |
mutate(age = as.numeric(age), | |
minutes = minutes %>% | |
str_replace("\\.", "") %>% | |
str_replace("'", "") %>% | |
as.numeric(), | |
min_perc = (minutes / 3420) %>% round(digits = 3)) %>% | |
filter(!is.na(minutes)) %>% | |
separate(name, into = c("first_name", "last_name"), by = " ") %>% | |
# manually fix some names | |
mutate( | |
last_name = case_when( | |
first_name == "Trent" ~ "Alexander-Arnold", | |
first_name == "Virgil" ~ "Van Dijk", | |
first_name == "Alex" ~ "Oxlade-Chamberlain", | |
TRUE ~ last_name), | |
age = age + 1) %>% # do CURRENT age instead for plot 2.0 | |
# add an additional year to age for specific players | |
mutate( | |
age = case_when( | |
last_name %in% age_plus_one ~ age + 1, | |
TRUE ~ age) | |
) %>% | |
# can't be arsed to scrape them individually so manually add the new lads | |
add_row( | |
first_name = " ", | |
last_name = "Alisson", | |
age = 25, | |
minutes = 3330, | |
min_perc = 0.974 | |
) %>% | |
add_row( | |
first_name = " ", | |
last_name = "Fabinho", | |
age = 24, | |
minutes = 3060, | |
min_perc = 0.895 | |
) %>% | |
add_row( | |
first_name = "Naby", | |
last_name = "Keita", | |
age = 23, | |
minutes = 1966, | |
min_perc = 0.642 | |
) %>% | |
add_row( | |
first_name = "Xherdan", | |
last_name = "Shaqiri", | |
age = 26, | |
minutes = 3049, | |
min_perc = 0.892 | |
) %>% | |
# create identifier for new vs. old player for labelling purposes | |
mutate( | |
new_player = case_when( | |
last_name %in% c("Alisson", "Fabinho", "Keita", "Shaqiri") ~ TRUE, | |
TRUE ~ FALSE)) %>% | |
arrange(desc(min_perc)) | |
# rectanglular highlight for players in their prime: | |
rect_df <- data.frame( | |
xmin = 24, xmax = 30, | |
ymin = -Inf, ymax = Inf | |
) | |
``` | |
## PLOT | |
```{r fig.height=6, fig.width=8} | |
lfc_minutes %>% | |
ggplot(aes(x = age, y = min_perc)) + | |
geom_rect( | |
data = rect_df, inherit.aes = FALSE, | |
aes(xmin = xmin, xmax = xmax, | |
ymin = ymin, ymax = ymax), | |
alpha = 0.3, | |
fill = "firebrick1") + | |
geom_point(color = "red", size = 2.5) + | |
geom_text_repel( | |
data = lfc_minutes %>% filter(!new_player == TRUE), | |
aes(label = last_name, family = "Roboto Condensed"), | |
nudge_x = 0.5, | |
seed = 6) + | |
geom_text_repel( | |
data = lfc_minutes %>% filter(new_player == TRUE), | |
aes(label = last_name, family = "Roboto Condensed", fontface = "bold"), | |
size = 4, nudge_x = 0.5, nudge_y = 0.02, | |
seed = 8) + | |
scale_y_continuous( | |
expand = c(0.01, 0), | |
limits = c(0, 1), | |
labels = percent_format()) + | |
scale_x_continuous( | |
breaks = pretty_breaks(n = 10)) + | |
labs( | |
x = "Current Age (As of Aug. 5th, 2018)", y = "% of Minutes Played", | |
title = "Liverpool FC: Age-Utility Matrix", | |
subtitle = "Premier League 17/18 (Summer 2018 transfers in bold, departed players left in for comparison)", | |
caption = "Data from transfermarkt.com\nInspired by @FinerMargins\nBy @R_by_Ryo") + | |
theme_bw() + | |
theme( | |
text = element_text(family = "Roboto Condensed"), | |
panel.grid.minor.y = element_blank()) + | |
geom_label( | |
aes(x = 20.5, y = 0.87, | |
hjust = 0.5, | |
label = glue(" | |
Encouraging to see Liverpool buying players both in | |
their prime and regulars in their previous teams. | |
Our entire best 'Starting XI' are going to be | |
in their prime this season! | |
"), | |
family = "Roboto Condensed"), | |
size = 3.5) | |
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment