Skip to content

Instantly share code, notes, and snippets.

@johnburnmurdoch
Last active July 18, 2019 18:09
Show Gist options
  • Star 11 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save johnburnmurdoch/7b27233cda45df326f5d to your computer and use it in GitHub Desktop.
Save johnburnmurdoch/7b27233cda45df326f5d to your computer and use it in GitHub Desktop.
# The packages we'll be using
packages <- c("rvest","dplyr","tidyr","pipeR","ggplot2","stringr","data.table")
# From those packages, which ones are not yet installed?
newPackages <- packages[!(packages %in% as.character(installed.packages()[,"Package"]))]
# If any weren't already installed, install them now
if(length(newPackages)) install.packages(newPackages)
# Now make sure all necessary packages are loaded
lapply(packages,require,character.only=T)
# Define a function that -- for any given year -- will return a list of URLs, each one referring to a web page containing a list of all players appearing for a Premier League club in a given season (including any who since left the club)
getClubs <- function(year){
# The URL below is for a web page containing a list of clubs taking part in a given Premier League season. Importantly, it also contains a season-specific URL to the 'profile page' of each of those clubs in that season
url <- paste0("http://www.transfermarkt.co.uk/premier-league/startseite/wettbewerb/GB1/plus/?saison_id=",year)
# The line below parses the HTML content of the page we defined above
page <- url %>% read_html()
# Now we're storing each of those club/season page URLs, substituting a few parts of the URL so that instead of getting the basic club/season page, we get the related page showing all players' appearance data
clubUrls <- paste0("http://www.transfermarkt.co.uk",gsub("(saison_id/)+[0-9]+","",gsub("startseite","leistungsdaten",page %>% html_nodes("#yw1 > table tr > td.hauptlink.no-border-links.hide-for-small.hide-for-pad > a.vereinprofil_tooltip") %>% html_attr("href"))),"plus/1?reldata=GB1%26",year)
# Sorted. Return that list of club/season/players pages
return(clubUrls)
}
# Define a second function that will take an input of a single row in an HTML table -- this row will represent one player in a club/season/players page as defined above -- and return a table containing the player's name, nationality(ies) and league minutes played. Why a table and not a single row? Some players have multiple nationalities, and the best way to consistently deal with all possible numbers of nationalities (and make our lives easier in future analysis) is to store each as a row of its own
doPlayer <- function(row){
# Here we just want the player's name
playerName <- row %>% html_nodes("table tr:nth-child(1) > td.hauptlink span.hide-for-small") %>% html_text()
# Now we're getting any/all nationalities
playerNat <- row %>% html_nodes("td > img") %>% html_attr("title")
# And the number of minutes they played
playerMins <- row %>% html_nodes("td.rechts") %>% html_text()
# If they didn't play at all, we just return a null (empty) object, otherwise we want a table containing all the information we've just extracted. The table will be as long as the number of nationalities a player has, with their name and minutes played simply repeated in every row
if(length(playerMins) == 0){return(NULL)}else{
return(data.frame(name=rep(playerName,length(playerNat)),nat=playerNat,mins=rep(playerMins,length(playerNat))))
}
}
# This function combines the result of the first one we defined, with the process defined in the second, i.e we give it a club/season/players URL, and it will extract information about all players' nationality(ies) and their Premier League participation
doClub <- function(url){
# Any future mentions of 'clubUrl' will refer to the club/season/players URL we feed in
clubUrl <- url
# Parse the HTML contents of that page
clubPage <- clubUrl %>% read_html()
# Extract the number of matches a club has played to date in the season, just in case we need that for any future analysis
matchesPlayed <- extract_numeric(str_extract(clubPage %>% html_nodes("#main > div:nth-child(12) > div > div > p.content") %>% html_text(),"(played )+[0-9]+( games)"))
# Extract a list of HTML table rows, each of which we will run through our 'doPlayer' function defined above
playerRows <- clubPage %>% html_nodes("#yw1 > table > tbody > tr")
# Loop through all of those rows, running the function on each, then bind the results into a single table containing the details for all of a team's players
df <- as.data.frame(rbindlist(lapply(playerRows,doPlayer)))
# Strip out the decimal thousand separators for minutes data and make sure the result is read as a numeric value. We're also calculating the share of all of a club's match minutes that each player was involved in. NB: crucial that we also include the URL of the page we're extracting this data from, as this means we will know later on what club & season this data came from, allowing us to slice and dice the data on that basis in our analysis phase
df <- df %>% mutate(mins = extract_numeric(gsub("\\.","",mins)), share = mins/(90*matchesPlayed), club = clubUrl)
# Show a message telling us how far through the list of all clubs/seasons we are. There are probably more efficient equivalents for this part, but for me it's a simple way to give me a rough idea of how long the main scraping phase is going to take
message(paste0("Done ",round(match(url,allClubs)/length(allClubs)*100,1)),"%")
# Now return a table containing all data for all players for a club/season pairing, excluding cases where there is any missing data
return(df[complete.cases(df),])
}
# Sweet, we've defined the functions we're going to need later on, so let's put them into action!
# Generate a list of all club/season URLs for the seasons starting in the years 1992 to 2015, i.e all Premier League seasons to date
allClubs <- lapply(1992:2015,getClubs)
# Now we have a list of vectors, where each vector holds either 22 (before 1995/96) or 20 URLs for the clubs taking part in the Premier League in a given year. The next lineloops throygh that list and concatenates the results into one long list of club/season URLs
allClubs <- rle(unlist(allClubs))$values
# Now we can loop through those URls, returning a table of all player nationalies & minutes played in each case, and then join them all together into one long table on which we can do all subsequent analysis. Our use of 'message' earlier means that every time one club/season page is turned into a table, we'll get a little note telling us how far along we are in the overall process
allData <- as.data.frame(rbindlist(lapply(allClubs,doClub)))
# Let's add a column specifying the season to which each row refers. We can do this using the club/season URL, since its final 4 characters are the year the season started
allData <- allData %>% mutate(seasonStart = substr(club,nchar(club)-3,nchar(club)))
# The point of the exercise for which this script was written was to look at the share of Premier League football (measured in minutes) contributed by players of different nationalities, with a particular focus on EU-nationals. Therefore we need to know which countries make up the EU, and how this has changed since 1992/93.
# Here we're extracting a table from Wikipedia that contains details for every EU member state past and present, incuding the year when they became a member
euDf <- data.frame("https://en.wikipedia.org/wiki/Member_state_of_the_European_Union" %>% read_html() %>% html_nodes("#mw-content-text > table:nth-child(8)") %>% html_table())
# Some countries are simply listed as founders instead of any year being specified, so for each of these I'm replacing that string of text with the year 1957
euDf <- euDf %>% mutate(accession = ifelse(grepl("Founder",Accession),1957,Accession))
# For each year in which a Premier League season began, let's get a list of the members of the EU as it stood at that time
membersYrs <- lapply(unique(allData$seasonStart),function(x) (euDf %>% filter(accession <= x))$Country.name)
# Now to clean up our data
# First of all, we need to sort out a consistency issue: our EU members data refers to the United Kingdom, but footballers' nationalities of course use labels based on national football teams, so we have England, Scotland, Wales and Northern Ireland instead. Let's create a new column called 'euName' that will label players from the UK as such
allData <- allData %>% mutate(euName = ifelse(nat %in% c("England","Scotland","Wales","Northern Ireland"),"United Kingdom",as.character(nat)))
# Now that we've done that, let's differentiate between the UK and other EU members. Here we're going through every row of our master table, and doing the following:
# If a country is in the UK, we can leave that as it is. If not, we check whether that country was in the EU in that year, and return "Other EU" or "non-EU" accordingly
allData <- allData %>% rowwise() %>% mutate(isEu = if(euName == "United Kingdom"){return(euName)}else if(euName %in% membersYrs[[match(seasonStart,1992:2015)]]){return("Other EU")}else{return("non-EU")})
# One other thing we might want to do given the importance that fans place on the individual national teams within the UK, let's now create one final naming structure that also separates out England from the rest of the UK, and Ireland from the rest of the non-UK EU
allData <- allData %>% rowwise() %>% mutate(group = if(nat == "England"){return("England")}else if(nat %in% c("Scotland","Wales","Northern Ireland")){return("Rest of UK")}else if(nat == "Ireland"){return("Ireland")}else if(euName %in% membersYrs[[match(seasonStart,1992:2015)]]){return("Other EU")}else{return("non-EU")})
# Now that we've got that final column of names, we need to carry out one more crucial step before we have a complete dataset to work with. Our master dataset currently still has multiple entries/rows for players with multiple nationalities, but of course we only want each player to be represented once. That's fine, we can just use one row for each, but each row contains a different nationality, so which to use? Going back to the initial purpose of this exercise, the distinction we care about is EU vs non-EU, so in the block below we're going to do the followng:
# Add a column containing the number 0 for EU nationalities, and 1 for non-EU
# Now for each player/club/season entry, sort the rows so any EU nationality comes first (top)
# Next group and summarise the data by player/club/season, so each is recorded in only one row. Our previous step of sorting nationalities means we can specify that in the summary process, we only want the first (top) entry for nationality, i.e any player with any EU nationality will be recorded as an EU national regardless of other nationalities held
allData <- allData %>% group_by(club,name) %>% mutate(nat1 = ifelse(nat %in% membersYrs[[24]] | isEu == "United Kingdom",0,1)) %>% arrange(nat1) %>% summarise(nat = nat[1], mins = mins[1], share = share[1], seasonStart = seasonStart[1], euName = euName[1], isEu = isEu[1], group = group[1], nat1 = nat1[1])
# Now to prepare for visualisation
# First let's group by season and nationality, to show the overall evolution over time in minutes played according to nationality. In this step we simply want to know the total number of minutes played by players of a given nationality grouping in a given season
byYear <- allData %>% group_by(seasonStart,group) %>% summarise(total = sum(mins))
# That last step gave us the numerator for our calculation of the share of minutes played, so now we need the denominator. To find that we need to group only by season, and carry out the same aggregation of minutes. We'll run the final calculation in this step too
byYear <- byYear %>% group_by(seasonStart) %>% mutate(share = total/sum(total))
# We're pretty much done at this point, but in order to easily work with this data in external tools (other than R), let's add a couple of columns that will make the visualisation stage easier. We're going to do some cumulative summing to calculate points at which to draw the bottom and top of each bar in a stached bar chart.
# First we need to determine the order in which the bars within each stack wil be displayed. "Other EU" is central to the story, and "England" will resonate with readers, so let's order the categories so that they appear at the extremes
byYear$group <- factor(byYear$group,levels=c("Other EU","non-EU","Ireland","Rest of UK","England"),ordered=T)
# Now we can arrange the dataset by year and then by our predetermined order of nationality groupings, and then use cumulative summing to calculate bar positions
byYear <- byYear %>% arrange(seasonStart,group) %>% ungroup() %>% group_by(seasonStart) %>% mutate(barTops = cumsum(share), barBases = ifelse(is.na(lag(barTops)),0,lag(barTops)))
# Now we have a nice, clean dataset that we can use to make our stacked bar plot
yearPlot <- ggplot(byYear,aes(seasonStart,share,fill=group)) +
geom_bar(position = "fill",stat = "identity") +
scale_y_continuous(labels = percent_format())
yearPlot
# Finally we can write the result to a CSV file that we can use in other visualisation tools. For the charts we published online and in the print edition of the Financial Times, we used d3.js, since it allows us to easily output both PNGs (for web) and SVGs (for ptint)
write.csv(byYear, file="/Users/john.burn-murdoch/Dropbox/premdata/baseline/euPlayers/data.csv")
# Below is a similar set of transformations carried out on the complete dataset that groups by clubs instead of years, and uses data only for the current season
# First we filter the master dataset to leave only data from the current season, then we group by club and nationality and carry out the same calculation of relative share that we did for seasons. An additional step strips out club names from the longer/messier URLs
byClub <- allData %>% filter(seasonStart == 2015) %>% group_by(club,group) %>% summarise(total = sum(mins)) %>% ungroup() %>% group_by(club) %>% mutate(share = total/sum(total), clubName = gsub("(/leistung).*","",gsub("http://www.transfermarkt.co.uk/","",club)))
# We need to repeat that handy step of determining the order of our nationality groupings for any graphics
byClub$group <- factor(byClub$group,levels=rev(c("Other EU","non-EU","Ireland","Rest of UK","England")),ordered=T)
# Here we're taking a new step: creating a mini dataset for convenience, that we then sort by the nationality grouping of choice. This will help us order the clubs in the resulting stacked bar chart. Here I'm sorting clubs so that those with the highest share of minutes from "Other EU" players are at the top
clubsDf <- byClub %>% filter(group == "England") %>% ungroup() %>% arrange(desc(share))
# Now we can go back to our "byClub" dataset and use the mini dataset we just created to make sure clubs will be plotted in the right order
byClub$clubName <- factor(byClub$clubName,levels=clubsDf$clubName[order(clubsDf$share)],ordered=T)
# Finally we take that step of calculating values for the start and end of each bar
byClub <- byClub %>% arrange(club,group) %>% ungroup() %>% group_by(club) %>% mutate(barTops = cumsum(share), barBases = ifelse(is.na(lag(barTops)),0,lag(barTops)))
# Now we can plot the results:
clubsPlot <- ggplot(byClub,aes(clubName,share,fill=group)) +
geom_bar(position = "fill",stat = "identity") +
scale_y_continuous(labels = percent_format()) +
coord_flip()
clubsPlot
# And export to a CSV
write.csv(byClub, file="/Users/jbm/Dropbox/premdata/baseline/euPlayers/clubs_.csv")
# See the two following gists for the code that generates the final publication-ready charts (https://twitter.com/jburnmurdoch/status/693870028047945728)
# https://gist.github.com/johnburnmurdoch/ab89b8eccef6f1b8628a
# https://gist.github.com/johnburnmurdoch/9ec853213db77ac50719
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment