Skip to content

Instantly share code, notes, and snippets.

@timriffe
Created July 20, 2016 16:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timriffe/572807ec6854e837d91a0977ede9b56a to your computer and use it in GitHub Desktop.
Save timriffe/572807ec6854e837d91a0977ede9b56a to your computer and use it in GitHub Desktop.
get all players that ever were on a copa america or world cup Uruguay national team. Then make Lexis lines with them. Because fun.
# Author: tim
###############################################################################
library(rvest) # for scraping
library(magrittr) # for pipes
library(lubridate) # for date handling
# some useful but ugly functions. very fragile, depends on wikipedia formatting
# 22 November 1901 (aged 28)
getbday <- function(strings){
as.Date(gsub("[\\(\\)]", "", regmatches(strings, gregexpr("\\(.*?\\)", strings))[[1]])[1])
}
getage <- function(strings){
# yes, this would be cleaner with pipes...
as.integer(gsub("aged ","",
gsub("[\\(\\)]", "", regmatches(strings, gregexpr("\\(.*?\\)", strings))[[1]])[2]))
}
# cup years and unique player used to select right table.
# years are needed for composing web addresses
Years <-c(1930,1950,1954,1962,1966,1970,1974,1986,1990,2002,2010,2014)
# unfortunately I don't know how to grab the Uruguay table by title. That'd be easy.
# I also don't know the table's index position among all tables. So I've decided to find a unique
# identifier piece of text to be able to pick out which table is Uruguay. This consists
# in the last name of a haphazardly-chosen player, which I manually checked was unique on the page
UniquePlayer <- c("Andrade","Britos","Andrade","Troche","Manicera","Mazurkiewicz","Cubilla"
,"Acevedo","Perdomo","Lembo","Fucile","Lodeiro")
UruguayRaw <- list()
# one year at a time
for (i in 1:length(Years)){
# compse url
teams <- paste0('https://en.wikipedia.org/wiki/',
Years[i],'_FIFA_World_Cup_squads')
# make list of html tables on page
temp <- teams %>% read_html %>% html_nodes("table") %>% html_table(fill=TRUE)
# pick out the table containing the name of the unique identifier player = Uruguay
Ur <- which(unlist(lapply(temp,function(x,Player){
any(grepl(Player,x[["Player"]]))
},Player = UniquePlayer[i])))
# select the Uruguay table
Tab <- temp[[Ur]]
# add cup year
Tab$Cup <- Years[i]
# assign to the Uruguay tables list
UruguayRaw[[i]] <- Tab
}
# turn into a long piece of data
UruguayTeamsFifa <- do.call(rbind,UruguayRaw)
# make proper date class birthday
UruguayTeamsFifa$DOB <- do.call(c,lapply(UruguayTeamsFifa[,"Date of birth (age)"],getbday))
# integer age
UruguayTeamsFifa$Age <- do.call(c,lapply(UruguayTeamsFifa[,"Date of birth (age)"],getage))
#####################################
# now repeat same steps for copa america, no annotation, identical to above
YearsCA <- c(1987,1989,1991,1993,1995,1997,1999,2001,2004,2007,2011,2015)
url <- paste0("https://en.wikipedia.org/wiki/",YearsCA[i],"_Copa_Am%C3%A9rica_squads")
UniquePlayerCA <- c("Trasante","Zeoli","Revelez","Francescoli",
"Aguirregaray","Saralegui","Zalayeta",
"Eguren","Bizera","Gargano","Gargano","Stuani")
UruguayRawCA <- list()
for (i in 1:length(Years)){
url <- paste0("https://en.wikipedia.org/wiki/",YearsCA[i],"_Copa_Am%C3%A9rica_squads")
temp <- url %>% read_html %>% html_nodes("table") %>% html_table(fill=TRUE)
Ur <- which(unlist(lapply(temp,function(x,Player){
any(grepl(Player,x[["Player"]]))
},Player = UniquePlayerCA[i])))
Tab <- temp[[Ur]]
Tab$Cup <- YearsCA[i]
UruguayRawCA[[i]] <- Tab
}
# some years have a goals column, throw out.
UruguayRawCA <- lapply(UruguayRawCA,function(x){
if ("Goals" %in% colnames(x)){
x$Goals <- NULL
}
x[,-1]
})
UruguayTeamsCA <- do.call(rbind,UruguayRawCA)
UruguayTeamsCA$DOB <- do.call(c,lapply(UruguayTeamsCA[,3],getbday))
UruguayTeamsCA$Age <- do.call(c,lapply(UruguayTeamsCA[,3],getage))
## this is sloppy, remove. A single NA for age, can infer later.
#UruguayTeamsCA$Age[is.na(UruguayTeamsCA$Age)] <- 34 # eyeball check
# want exact start date of tournaments
Dates <- c("1930-7-13","1950-6-24","1954-6-16","1962-5-30","1966-7-11",
"1970-5-31","1974-6-13","1986-5-31","1990-6-8",
"2002-5-31","2010-6-11","2014-6-12")
# name with cup year, for selection
names(Dates) <- Years
# this is a nice way to select, repeating values as necessary
UruguayTeamsFifa$CupDate <- as.Date(Dates[as.character(UruguayTeamsFifa$Cup)])
# repeat for copa america
DatesCA <- c("1987-6-27","1989-7-1","1991-7-6","1993-6-15","1995-7-5",
"1997-6-11","1999-6-29","2001-7-11","2004-7-6","2007-6-27","2011-7-1","2015-6-11")
names(DatesCA) <- YearsCA
UruguayTeamsCA$CupDate <- as.Date(DatesCA[as.character(UruguayTeamsCA$Cup)])
###############
#colnames(UruguayTeamsCA)
#colnames(UruguayTeamsFifa)
# delete some useless columns
UruguayTeamsFifa[, 1] <- NULL
UruguayTeamsFifa[, 3] <- NULL
UruguayTeamsCA[, 3] <- NULL
# add cup identifier, for coloring
UruguayTeamsFifa$Tournament <- "Fifa"
UruguayTeamsCA$Tournament <- "CA"
# join datasets
UruguayTeams <- rbind(UruguayTeamsFifa, UruguayTeamsCA)
###################################################################
# how many cups for each unique player?
# need to use bday as aux id because there are two Omar Mendez...
cheapid <- with(UruguayTeams,paste(Player, DOB))
Times <- table(table(cheapid))
# unique players
Players <- unique(cheapid)
# give 3 digit code as id
ID <- formatC(1:length(Players), width = 3, format = "d", flag = "0")
# give names for selection
names(ID) <- Players
# assign to dataset (cheapid is in same order)
UruguayTeams$ID <- ID[cheapid]
###########################################################
# get decimals times
UruguayTeams$dob <- decimal_date(UruguayTeams$DOB)
UruguayTeams$cupdate <- decimal_date(UruguayTeams$CupDate)
UruguayTeams$age <- UruguayTeams$cupdate - UruguayTeams$dob
# range(UruguayTeams$age - UruguayTeams$Age , na.rm=TRUE)
# must be data error, checked his bday online.
# UruguayTeams[UruguayTeams$age - UruguayTeams$Age < 0, ]
# assign age in completed years, not used tho
UruguayTeams$Age <- floor(UruguayTeams$age)
###########################################################
# looks like a female fertility curve, sort of.
pdf("/home/tim/workspace/Other/Figures/PlayersAges.pdf")
hist(UruguayTeams$age,freq=FALSE, col = "#0000FF50",breaks=seq(15,40,by=2.5),
xlab = "Edad", ylab = "Proporcion", main = "")
dev.off()
###########################################################
# now Lexis
#dev.new(height=5,width=11)
pdf("/home/tim/workspace/Other/Figures/LexisPlayers1.pdf",height=5,width=11)
par(xaxs="i",yaxs="i")
# open empty device
plot(NULL, type = "n", xlim=range(c(UruguayTeams$dob,UruguayTeams$cupdate)),
ylim=c(0,max(UruguayTeams$age)),asp=1, xlab = "Año", ylab = "Edad")
# verticals for cups
segments(unique(UruguayTeams$cupdate),0,unique(UruguayTeams$cupdate),40,col=gray(.6))
# first draw segments
for (id in unique(UruguayTeams$ID)){
Player <- UruguayTeams[UruguayTeams$ID == id, ]
x1 <- unique(Player$dob)
x2 <- max(Player$cupdate)
y2 <- max(Player$age)
segments(x1,0,x2,y2,col="#00000050",lwd=.5)
}
# then draw points for cups played, colored by which tournament
for (id in unique(UruguayTeams$ID)){
Player <- UruguayTeams[UruguayTeams$ID == id, ]
points(Player$cupdate,Player$age,col=ifelse(Player$Tournament=="CA","#FF000050","#0000FF50"),
pch=19,cex=.7)
}
dev.off()
# repeat, but with lifelines starting at first cup
pdf("/home/tim/workspace/Other/Figures/LexisPlayers2.pdf",height=5,width=11)
par(xaxs="i",yaxs="i")
plot(NULL, type = "n", xlim=range(c(UruguayTeams$dob,UruguayTeams$cupdate)),
ylim=c(0,max(UruguayTeams$age)),asp=1, xlab = "Año", ylab = "Edad")
segments(unique(UruguayTeams$cupdate),0,unique(UruguayTeams$cupdate),40,col=gray(.6))
for (id in unique(UruguayTeams$ID)){
Player <- UruguayTeams[UruguayTeams$ID == id, ]
x1 <- min(Player$cupdate)
x2 <- max(Player$cupdate)
y1 <- min(Player$age)
y2 <- max(Player$age)
segments(x1,y1,x2,y2,col="#00000050",lwd=.5)
points(Player$cupdate,Player$age,col=ifelse(Player$Tournament=="CA","#FF000050","#0000FF50"),
pch=19,cex=.7)
}
dev.off()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment