Skip to content

Instantly share code, notes, and snippets.

@schochastics
Created September 30, 2018 20:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save schochastics/7a557d77e4f143050cf29e0342e27f16 to your computer and use it in GitHub Desktop.
Save schochastics/7a557d77e4f143050cf29e0342e27f16 to your computer and use it in GitHub Desktop.
Get squad lists from footballsquads.co.uk
library(rvest)
library(tidyverse)
# Build the table of league pages to crawl ----

# Collect the href of every anchor on the archive index page.
leagues <- "http://www.footballsquads.co.uk/archive.htm" %>%
  read_html() %>%
  html_nodes("a") %>%
  html_attr("href")

# Split each href on "/" into one column per path component. The matrix
# columns are named explicitly (V1, V2, ...) so as_tibble() does not have to
# repair missing column names.
league_parts <- str_split(leagues, "/", simplify = TRUE)
colnames(league_parts) <- paste0("V", seq_len(ncol(league_parts)))

leagues_tbl <- as_tibble(league_parts) %>%
  mutate(link = leagues) %>%
  # Drop hrefs that have no second path component.
  dplyr::filter(V2 != "") %>%
  # fixed() makes the "." a literal dot rather than a regex wildcard.
  mutate(V3 = str_remove(V3, fixed(".htm")))
names(leagues_tbl)[1:3] <- c("country", "season", "league")

# Keep seasons whose leading number (the text before the first "-") is at
# least 2001, then keep only the first listed league per country and season.
# Seasons whose leading part is not numeric become NA and are dropped.
leagues_tbl <- leagues_tbl %>%
  dplyr::filter(as.integer(str_extract(season, "^[^-]*")) >= 2001) %>%
  group_by(country, season) %>%
  slice(1) %>%
  # Ungroup so later code works on a plain, ungrouped tibble.
  ungroup()
##########################################################################
# Collect the links found on every league page ----
# One list slot per league page is filled in and flattened at the end, instead
# of growing a vector with c() on every iteration.
links_by_league <- vector("list", nrow(leagues_tbl))
for (i in seq_len(nrow(leagues_tbl))) {
  message("league page ", i, " of ", nrow(leagues_tbl))
  url <- paste0("http://www.footballsquads.co.uk/", leagues_tbl$link[i])
  hrefs <- read_html(url) %>%
    html_nodes("a") %>%
    html_attr("href")
  # Anchors without an href come back as NA; pasting them would create bogus
  # ".../NA" URLs.
  hrefs <- hrefs[!is.na(hrefs)]
  if (length(hrefs) > 0) {
    # hrefs are resolved against the league page's directory, i.e. the page
    # URL with its last "/"-separated component removed.
    links_by_league[[i]] <- paste0(word(url, 1, -2, "/"), "/", hrefs)
  } else {
    links_by_league[[i]] <- character(0)
  }
  # Random pause of up to one second between requests.
  Sys.sleep(runif(1, 0, 1))
}
club_links <- as.character(unlist(links_by_league))
##########################################################################
##########################################################################
# Keep only club pages: drop links that contain ".." and links that contain
# "mailto".
# NOTE(review): the original step used dplyr::filter()/pull(X1), which only
# works on a data frame with an X1 column; that shape is still accepted here
# in case club_links was loaded from a saved table -- confirm whether needed.
if (is.data.frame(club_links)) {
  club_links <- club_links[["X1"]]
}
# club_links is a character vector, so it is subset with a logical mask.
has_parent_ref <- str_detect(club_links, fixed(".."))
has_mailto <- str_detect(club_links, fixed("mailto"))
club_links <- club_links[!has_parent_ref & !has_mailto]
#' Scrape one club page into a squad table
#'
#' Reads the first HTML table on the page, keeps the rows above the
#' "Players no longer at this club" marker row (all rows when the marker is
#' absent), promotes the first remaining row to column names and drops rows
#' with an empty name.
#'
#' @param url URL of a club page. The 4th, 5th and 7th "/"-separated
#'   components of the URL are used as country, season and club; presumably
#'   the URL looks like `http://<host>/<country>/<season>/<league>/<club>.htm`
#'   (confirm against the crawled links).
#' @return A tibble with the columns `name`, `pos`, `date_of_birth`, `club`,
#'   `season` and `country`.
get_squad <- function(url) {
  # fixed() makes the "." in ".htm" a literal dot rather than a regex wildcard.
  url_parts <- str_remove(str_split(url, "/")[[1]], fixed(".htm"))

  squad <- as_tibble(html_table(read_html(url))[[1]])

  # Cut the table at the marker row. When the page has no such row, keep the
  # whole table instead of failing on an empty index.
  marker_rows <- which(squad$X1 == "Players no longer at this club")
  if (length(marker_rows) > 0) {
    squad <- squad[seq_len(marker_rows[1] - 1), ]
  }

  squad <- janitor::row_to_names(squad, 1)
  squad <- squad[!is.na(squad$Name) & squad$Name != "", ]
  squad <- janitor::clean_names(squad, case = "snake")
  squad$name <- str_squish(squad$name)
  squad$club <- url_parts[7]
  squad$season <- url_parts[5]
  squad$country <- url_parts[4]
  select(squad, name, pos, date_of_birth, club, season, country)
}
# Scrape every club page and append each squad to a CSV file ----
outfile <- "playerlist.csv"
for (i in seq_along(club_links)) {
  if (i %% 100 == 0) {
    message("processed ", i, " of ", length(club_links), " links")
  }
  # A page that cannot be read or parsed is skipped, but the reason is
  # reported so failures can be inspected afterwards.
  df <- tryCatch(
    get_squad(club_links[i]),
    error = function(e) {
      message(
        "skipping link ", i, " (", club_links[i], "): ", conditionMessage(e)
      )
      NULL
    }
  )
  # Random pause after every request, including failed ones.
  Sys.sleep(runif(1, 0, 1.5))
  if (!is.data.frame(df)) {
    next
  }
  # The first write creates the file with a header row; later writes append
  # rows without a header.
  write_csv(df, outfile, append = file.exists(outfile))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment