Created
July 20, 2016 16:18
-
-
Save timriffe/572807ec6854e837d91a0977ede9b56a to your computer and use it in GitHub Desktop.
get all players that ever were on a copa america or world cup Uruguay national team. Then make Lexis lines with them. Because fun.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: tim | |
############################################################################### | |
library(rvest) # for scraping | |
library(magrittr) # for pipes | |
library(lubridate) # for date handling | |
# some useful but ugly functions. very fragile, depends on wikipedia formatting | |
# 22 November 1901 (aged 28) | |
getbday <- function(strings){ | |
as.Date(gsub("[\\(\\)]", "", regmatches(strings, gregexpr("\\(.*?\\)", strings))[[1]])[1]) | |
} | |
getage <- function(strings){ | |
# yes, this would be cleaner with pipes... | |
as.integer(gsub("aged ","", | |
gsub("[\\(\\)]", "", regmatches(strings, gregexpr("\\(.*?\\)", strings))[[1]])[2])) | |
} | |
# cup years and unique player used to select right table. | |
# years are needed for composing web addresses | |
Years <-c(1930,1950,1954,1962,1966,1970,1974,1986,1990,2002,2010,2014) | |
# unfortunately I don't know how to grab the Uruguay table by title. That'd be easy. | |
# I also don't know the table's index position among all tables. So I've decided to find a unique | |
# identifier piece of text to be able to pick out which table is Uruguay. This consists | |
# in the last name of a haphazardly-chosen player, which I manually checked was unique on the page | |
UniquePlayer <- c("Andrade","Britos","Andrade","Troche","Manicera","Mazurkiewicz","Cubilla" | |
,"Acevedo","Perdomo","Lembo","Fucile","Lodeiro") | |
UruguayRaw <- list() | |
# one year at a time | |
for (i in 1:length(Years)){ | |
# compse url | |
teams <- paste0('https://en.wikipedia.org/wiki/', | |
Years[i],'_FIFA_World_Cup_squads') | |
# make list of html tables on page | |
temp <- teams %>% read_html %>% html_nodes("table") %>% html_table(fill=TRUE) | |
# pick out the table containing the name of the unique identifier player = Uruguay | |
Ur <- which(unlist(lapply(temp,function(x,Player){ | |
any(grepl(Player,x[["Player"]])) | |
},Player = UniquePlayer[i]))) | |
# select the Uruguay table | |
Tab <- temp[[Ur]] | |
# add cup year | |
Tab$Cup <- Years[i] | |
# assign to the Uruguay tables list | |
UruguayRaw[[i]] <- Tab | |
} | |
# turn into a long piece of data | |
UruguayTeamsFifa <- do.call(rbind,UruguayRaw) | |
# make proper date class birthday | |
UruguayTeamsFifa$DOB <- do.call(c,lapply(UruguayTeamsFifa[,"Date of birth (age)"],getbday)) | |
# integer age | |
UruguayTeamsFifa$Age <- do.call(c,lapply(UruguayTeamsFifa[,"Date of birth (age)"],getage)) | |
##################################### | |
# now repeat same steps for copa america, no annotation, identical to above | |
YearsCA <- c(1987,1989,1991,1993,1995,1997,1999,2001,2004,2007,2011,2015) | |
url <- paste0("https://en.wikipedia.org/wiki/",YearsCA[i],"_Copa_Am%C3%A9rica_squads") | |
UniquePlayerCA <- c("Trasante","Zeoli","Revelez","Francescoli", | |
"Aguirregaray","Saralegui","Zalayeta", | |
"Eguren","Bizera","Gargano","Gargano","Stuani") | |
UruguayRawCA <- list() | |
for (i in 1:length(Years)){ | |
url <- paste0("https://en.wikipedia.org/wiki/",YearsCA[i],"_Copa_Am%C3%A9rica_squads") | |
temp <- url %>% read_html %>% html_nodes("table") %>% html_table(fill=TRUE) | |
Ur <- which(unlist(lapply(temp,function(x,Player){ | |
any(grepl(Player,x[["Player"]])) | |
},Player = UniquePlayerCA[i]))) | |
Tab <- temp[[Ur]] | |
Tab$Cup <- YearsCA[i] | |
UruguayRawCA[[i]] <- Tab | |
} | |
# some years have a goals column, throw out. | |
UruguayRawCA <- lapply(UruguayRawCA,function(x){ | |
if ("Goals" %in% colnames(x)){ | |
x$Goals <- NULL | |
} | |
x[,-1] | |
}) | |
UruguayTeamsCA <- do.call(rbind,UruguayRawCA) | |
UruguayTeamsCA$DOB <- do.call(c,lapply(UruguayTeamsCA[,3],getbday)) | |
UruguayTeamsCA$Age <- do.call(c,lapply(UruguayTeamsCA[,3],getage)) | |
## this is sloppy, remove. A single NA for age, can infer later. | |
#UruguayTeamsCA$Age[is.na(UruguayTeamsCA$Age)] <- 34 # eyeball check | |
# want exact start date of tournaments | |
Dates <- c("1930-7-13","1950-6-24","1954-6-16","1962-5-30","1966-7-11", | |
"1970-5-31","1974-6-13","1986-5-31","1990-6-8", | |
"2002-5-31","2010-6-11","2014-6-12") | |
# name with cup year, for selection | |
names(Dates) <- Years | |
# this is a nice way to select, repeating values as necessary | |
UruguayTeamsFifa$CupDate <- as.Date(Dates[as.character(UruguayTeamsFifa$Cup)]) | |
# repeat for copa america | |
DatesCA <- c("1987-6-27","1989-7-1","1991-7-6","1993-6-15","1995-7-5", | |
"1997-6-11","1999-6-29","2001-7-11","2004-7-6","2007-6-27","2011-7-1","2015-6-11") | |
names(DatesCA) <- YearsCA | |
UruguayTeamsCA$CupDate <- as.Date(DatesCA[as.character(UruguayTeamsCA$Cup)]) | |
############### | |
#colnames(UruguayTeamsCA) | |
#colnames(UruguayTeamsFifa) | |
# delete some useless columns | |
UruguayTeamsFifa[, 1] <- NULL | |
UruguayTeamsFifa[, 3] <- NULL | |
UruguayTeamsCA[, 3] <- NULL | |
# add cup identifier, for coloring | |
UruguayTeamsFifa$Tournament <- "Fifa" | |
UruguayTeamsCA$Tournament <- "CA" | |
# join datasets | |
UruguayTeams <- rbind(UruguayTeamsFifa, UruguayTeamsCA) | |
################################################################### | |
# how many cups for each unique player? | |
# need to use bday as aux id because there are two Omar Mendez... | |
cheapid <- with(UruguayTeams,paste(Player, DOB)) | |
Times <- table(table(cheapid)) | |
# unique players | |
Players <- unique(cheapid) | |
# give 3 digit code as id | |
ID <- formatC(1:length(Players), width = 3, format = "d", flag = "0") | |
# give names for selection | |
names(ID) <- Players | |
# assign to dataset (cheapid is in same order) | |
UruguayTeams$ID <- ID[cheapid] | |
########################################################### | |
# get decimals times | |
UruguayTeams$dob <- decimal_date(UruguayTeams$DOB) | |
UruguayTeams$cupdate <- decimal_date(UruguayTeams$CupDate) | |
UruguayTeams$age <- UruguayTeams$cupdate - UruguayTeams$dob | |
# range(UruguayTeams$age - UruguayTeams$Age , na.rm=TRUE) | |
# must be data error, checked his bday online. | |
# UruguayTeams[UruguayTeams$age - UruguayTeams$Age < 0, ] | |
# assign age in completed years, not used tho | |
UruguayTeams$Age <- floor(UruguayTeams$age) | |
########################################################### | |
# looks like a female fertility curve, sort of. | |
pdf("/home/tim/workspace/Other/Figures/PlayersAges.pdf") | |
hist(UruguayTeams$age,freq=FALSE, col = "#0000FF50",breaks=seq(15,40,by=2.5), | |
xlab = "Edad", ylab = "Proporcion", main = "") | |
dev.off() | |
########################################################### | |
# now Lexis | |
#dev.new(height=5,width=11) | |
pdf("/home/tim/workspace/Other/Figures/LexisPlayers1.pdf",height=5,width=11) | |
par(xaxs="i",yaxs="i") | |
# open empty device | |
plot(NULL, type = "n", xlim=range(c(UruguayTeams$dob,UruguayTeams$cupdate)), | |
ylim=c(0,max(UruguayTeams$age)),asp=1, xlab = "Año", ylab = "Edad") | |
# verticals for cups | |
segments(unique(UruguayTeams$cupdate),0,unique(UruguayTeams$cupdate),40,col=gray(.6)) | |
# first draw segments | |
for (id in unique(UruguayTeams$ID)){ | |
Player <- UruguayTeams[UruguayTeams$ID == id, ] | |
x1 <- unique(Player$dob) | |
x2 <- max(Player$cupdate) | |
y2 <- max(Player$age) | |
segments(x1,0,x2,y2,col="#00000050",lwd=.5) | |
} | |
# then draw points for cups played, colored by which tournament | |
for (id in unique(UruguayTeams$ID)){ | |
Player <- UruguayTeams[UruguayTeams$ID == id, ] | |
points(Player$cupdate,Player$age,col=ifelse(Player$Tournament=="CA","#FF000050","#0000FF50"), | |
pch=19,cex=.7) | |
} | |
dev.off() | |
# repeat, but with lifelines starting at first cup | |
pdf("/home/tim/workspace/Other/Figures/LexisPlayers2.pdf",height=5,width=11) | |
par(xaxs="i",yaxs="i") | |
plot(NULL, type = "n", xlim=range(c(UruguayTeams$dob,UruguayTeams$cupdate)), | |
ylim=c(0,max(UruguayTeams$age)),asp=1, xlab = "Año", ylab = "Edad") | |
segments(unique(UruguayTeams$cupdate),0,unique(UruguayTeams$cupdate),40,col=gray(.6)) | |
for (id in unique(UruguayTeams$ID)){ | |
Player <- UruguayTeams[UruguayTeams$ID == id, ] | |
x1 <- min(Player$cupdate) | |
x2 <- max(Player$cupdate) | |
y1 <- min(Player$age) | |
y2 <- max(Player$age) | |
segments(x1,y1,x2,y2,col="#00000050",lwd=.5) | |
points(Player$cupdate,Player$age,col=ifelse(Player$Tournament=="CA","#FF000050","#0000FF50"), | |
pch=19,cex=.7) | |
} | |
dev.off() | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment