Skip to content

Instantly share code, notes, and snippets.

@schochastics
Created August 24, 2018 21:21
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save schochastics/72f65e449138b685124611384ef6ec04 to your computer and use it in GitHub Desktop.
Save schochastics/72f65e449138b685124611384ef6ec04 to your computer and use it in GitHub Desktop.
Scrape mean player age and market values for European football leagues from transfermarkt.de
library(tidyverse)
library(rvest)
library(ggimage)
library(lubridate)
#get first 25 leagues in Europe ----
# Collect the href of every link inside ".hauptlink" elements on the
# European competitions overview page.
url <- "https://www.transfermarkt.de/wettbewerbe/europa"
doc <- read_html(url)
leagues <- doc %>%
  html_nodes(".hauptlink a") %>%
  html_attr("href")
# Fail with a readable message instead of the obscure `seq()` error that the
# even-index selection would raise on fewer than two scraped links.
if (length(leagues) < 2) {
  stop("expected at least two league links on ", url, call. = FALSE)
}
# Keep every second link (positions 2, 4, 6, ...).
# NOTE(review): presumably each league is linked twice in the table -- verify
# against the live page markup.
leagues <- leagues[c(FALSE, TRUE)]
#function to scrape and plot ----
#' Scrape one league page on transfermarkt.de and plot team age against value
#'
#' @param team_url Path of the league page, appended to
#'   "https://www.transfermarkt.de" (e.g. one element of `leagues`).
#' @return A list with two elements: `data`, a table with one row per team
#'   and the columns `name`, `age`, `mw` (value in million Euro), `wappen`
#'   (crest image URL) and `league`; and `plot`, a ggplot object that draws
#'   each team's crest at its (age, mw) position.
plot_age_mv <- function(team_url) {
  base_url <- "https://www.transfermarkt.de"
  doc <- read_html(paste0(base_url, team_url))

  # Team table: first ".items" table on the page, with its first row dropped.
  # NOTE(review): the dropped row is presumably a summary/total row -- confirm.
  teams <- doc %>%
    html_nodes(".items") %>%
    html_table(fill = TRUE) %>%
    .[[1]] %>%
    .[-1, ]

  # Crest image URLs, de-duplicated; assumed to line up row-by-row with
  # `teams` when they are attached further down.
  wappen <- doc %>%
    html_nodes("td .tiny_wappen") %>%
    html_attr("src") %>%
    unique()

  teams <- janitor::clean_names(teams)

  # Unit abbreviation of the market value: first run of letters in the cell
  # (e.g. "Mio", "Mrd", "Tsd").
  teams$val <- teams$gesamtmarktwert %>% str_extract("[a-zA-Z]+")

  #league name
  league <- doc %>%
    html_nodes(".spielername-profil") %>%
    html_text()

  #league country: second "-"-separated piece of the first entry in column
  # X2 of the first table on the page
  country <- doc %>%
    html_table(fill = TRUE) %>%
    .[[1]] %>%
    .$X2 %>%
    .[1] %>%
    word(2, sep = "-") %>%
    str_trim()

  #get team name, mean age and mean value
  # NOTE(review): `kader` is read as the mean age and `gesamtmarktwert` as the
  # mean value; presumably the scraped header is shifted relative to the
  # data columns -- verify against the live table.
  teams <- teams %>%
    select(name, kader, gesamtmarktwert, val) %>%
    # German decimal comma -> decimal point
    mutate(age = as.numeric(str_replace(kader, ",", "."))) %>%
    # strip the unit and currency suffix, then parse the number
    mutate(mw = str_replace(gesamtmarktwert, "[a-zA-Z]+\\. €", "")) %>%
    mutate(mw = as.numeric(str_replace(mw, ",", "."))) %>%
    # Normalise everything to million Euro. Billions ("Mrd") must be scaled
    # up; previously every non-"Mio" unit was divided by 1000, which shrank
    # billion-Euro values by a factor of a million.
    mutate(mw = case_when(
      val == "Mio" ~ mw,
      val == "Mrd" ~ mw * 1000,
      TRUE ~ mw / 1000
    )) %>%
    select(name, age, mw)

  #get bigger crests
  teams$wappen <- str_replace(wappen, "/tiny/", "/head/")
  teams$league <- league

  p <- ggplot(teams, aes(x = age, y = mw)) +
    geom_image(aes(image = wappen)) +
    labs(
      x = "average player age",
      y = "average player value (million Euro)",
      caption = "data from transfermarkt.de",
      title = paste0(league, "(", country, ")")
    ) +
    hrbrthemes::theme_ipsum_rc()

  list(data = teams, plot = p)
}
#loop over all leagues and save the plots
# Each league's plot is written to "<plot title>.png" in the working
# directory; the per-league team tables are collected in a preallocated list
# and bound once at the end instead of growing `df` on every iteration.
league_data <- vector("list", length(leagues))
for (i in seq_along(leagues)) {
  print(i)  # progress indicator
  res <- plot_age_mv(leagues[i])
  out_name <- paste0(res$plot$labels$title, ".png")
  ggsave(out_name, res$plot, width = 5, height = 5)
  league_data[[i]] <- res$data
}
# as_tibble() keeps `df` a tibble regardless of what html_table() returned
df <- as_tibble(bind_rows(league_data))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment