johnburnmurdoch/federer-ATP-100.R

## federer-ATP-100.R
# Load the packages we’re going to be using:
# Alongside the usual stuff like tidyverse and magrittr, we’ll be using rvest for some web-scraping, jsonline to parse some JSON, and extrafont to load some nice custom fonts
needs(tidyverse, magrittr, rvest, jsonlite, extrafont)

# Before we go on, two things to note:

# First, on web scraping:
# You should always check the terms of the site you are extracting data from, to make sure scraping (often referred to as `crawling`) is not prohibited. One way to do this is to visit the website’s `robots.txt` page, and ensure that a) there is nothing explicitly stating that crawlers are not permitted, and b) ideally, the site simply states that all user agents are permitted (indicated by a line saying `User-Agect: *`). Both of those are the case for our use-case today (see https://www.ultimatetennisstatistics.com/robots.txt).

# And second, about those custom fonts:
# I love the elegant — and completely free! — M+ fonts [https://mplus-fonts.osdn.jp/about-en.html#license], which you can download from here[https://osdn.net/projects/mplus-fonts/releases/62344].
# After downloading and installing them onto your machine, run the `font_import()` command from the `extrafont()` library, and then check that R graphics devices now have access to them by running `View(fonttable())` and making sure the `m-plus` family appears in there.

# Still in the housekeeping stage: let’s set up an empty progress bar...
pb <- txtProgressBar(0,100,0,"|",style=3)
# ...and a function that will print progress during any iterative processes we go through
prg <- function(length,index){
  inc <- ceiling(length/(100))
  if(index %% inc == 0) setTxtProgressBar(pb, round(index/length*100))
}

# Now onto the task at hand.
# First, we want a subset of players to work with. Let’s grab the top 100 players who have won the most singles titles, from the wonderfully helpful Ultimate Tennis Statistics website (henceforth UTS). For the eagle-eyed among you, the &rowcount parameter in the URL says 105, but this is because lots of players are tied on 10 titles.
players <- fromJSON("https://www.ultimatetennisstatistics.com/recordTable?recordId=Titles&current=1&rowCount=105&searchPhrase=&active=false&_=1551544955403") %>%
  extract2("rows")

# Now let’s write a function that, given any unique UTS player ID, will return a data frame containing one row for every one of their title wins.
getTitles <- function(player_ID){
  return(
    fromJSON(paste0("https://www.ultimatetennisstatistics.com/playerEventsTable?playerId=",player_ID,"&current=1&rowCount=-1&sort%5Bdate%5D=desc&searchPhrase=&season=&fromDate=01-01-1990&toDate=&level=&surface=&indoor=&speed=&result=W&tournamentId=&_=1551545320435")) %>%
    extract2("rows")
  )
}

# Next, we have a separate function that, for each player, will grab their date of birth (this will let us calculate their age at each title win), and the date of the first ever ATP Tour match they played (we’ll use this as the starting point for their curves in the graphic)
getBio <- function(player_ID){
  DOB <- paste0("https://www.ultimatetennisstatistics.com/playerProfileTab?playerId=", player_ID) %>%
    read_html %>%
    html_nodes(".row > .col-md-4.col-lg-3 > table.table-condensed.text-nowrap > tr:nth-child(1) td") %>%
    html_text(trim=TRUE) %>%
    str_extract("\\d{2}-\\d{2}-\\d{4}") %>%
    as.Date("%d-%m-%Y") %>%
    na.omit()
  debut <- fromJSON(paste0("https://www.ultimatetennisstatistics.com/playerEventsTable?playerId=",player_ID,"&current=1&rowCount=1&sort%5Bdate%5D=asc&searchPhrase=&season=&fromDate=01-01-1990&toDate=&level=&surface=&indoor=&speed=&result&tournamentId=&")) %>%
    extract2("rows")
  return(
    list(DOB=DOB, debut=debut)
  )
}

# Finally, let’s combine both of those functions into one, that will take any player, grab their title wins, date of birth and debut match, add in their name and calculate their age at each title win
processPlayer <- function(player_ID){
  titles <- getTitles(player_ID)
  if(class(titles) == "list"){
    return(NULL)
  }
  bio <- getBio(player_ID)
  return(
    titles %>%
      bind_rows(bio$debut) %>%
      mutate(
        date = as.Date(date, "%Y-%m-%d"),
        DOB = bio$DOB,
        age = as.numeric(round((date-DOB)/365.25,3)),
        player_name = players$name[players$playerId==player_ID]
      ) %>%
      as_tibble
  )
}

# Now to run that function on all of our players.
# First, we set up an empty list, with length equal to the number of players we’re looking. We can then store each player’s data in this list as we loop through them all with out `processPlayer` function.
processed.players.list <- vector("list", nrow(players))
for(i in 1:100){
  processed.players.list[[i]] <- processPlayer(players$playerId[i])
  # On this next line, we’re running our progress bar function, to monitor how we’re getting on
  prg(100,i)
}
# 100% — great! All our data is in. First things first: let’s bind our list into one nice neat tibble.

plot_data <- processed.players.list %>%
  bind_rows %>%
  distinct()
# That last line — `distinct()` — is just something I do to be super-safe: it’s making sure we don’t have any duplicate rows, which would lead us to double-count a player’s title win.

# Our data is now nice and clean and tidy, but we’re not quite ready to start drawing yet: first we need to calculate our cumulative title wins variable:

plot_data <- plot_data %>%
  # First we group by player and arrange by date to make sure our cumulative count progresses chronologically. We’ll calculate each player’s current title total as well.
  group_by(player_name) %>%
  arrange(date) %>%
  mutate(
    titles = cumsum(result %in% c("W", "G")),
    total = max(titles)
  )

# Let’s take a quick look at the players with the most titles:
plot_data %>%
  group_by(player_name, total) %>%
  summarise %>%
  ungroup %>%
  arrange(desc(total)) %>%
  head(10)

# While that’s in front of us, let’s store the top five names there as players to highlight in the graphic:
highlighted_players <- c("Roger Federer", "Rafael Nadal", "Novak Djokovic", "Pete Sampras", "Andre Agassi")

# Next, one last step before we begin plotting: let’s ungroup our data and create a new variable — `colour` — which sets a unique colour for each of our highlighted players, and the same shade of light grey for all the others. We’re also going to make sure our player names are an `ordered factor`, so that our visual elements will be plotted in the correct order (highlighted players on top) in the rendering stage.
plot_data <- plot_data %>%
  ungroup %>%
  mutate(
    colour = case_when(
      player_name == "Roger Federer" ~ "#00218D",
      player_name == "Rafael Nadal" ~ "#FF2B4F",
      player_name == "Novak Djokovic" ~ "#0083EB",
      player_name == "Pete Sampras" ~ "#FCAB27",
      player_name == "Andre Agassi" ~ "#FF49EF",
      T ~ "gray80"
    ),
    player_name = fct_reorder(player_name, total)
  )

# Now our data is good-to-go. Let’s make a chart!

# Let’s start simple: age on the x-axis, cumulative titles won on the y-axis, and one stepped line for each player (the `group` aesthetic). We’ll use ggplot’s no-frills minimal theme, too.
p1 <- ggplot(plot_data, aes(age, titles, group=player_name)) +
  theme_minimal() +
  geom_step(direction="hv")
p1
ggsave("federer_100_p1.png", width=10, height=6, units="in")
# As a proof of dataviz concept, it’s a solid start, but there’s a lot we can do to improve it.

# First, let’s leverage the colours we set up earlier. We’re going to use ggplot’s `identity` scales here, since we already assigned bespoke colours to the players in our data.
p2 <- ggplot(plot_data, aes(age, titles, group=player_name, col=colour, fill=colour)) +
  theme_minimal() +
  geom_step(direction="hv") +
  scale_color_identity() +
  scale_fill_identity()
p2
ggsave("federer_100_p2.png", width=10, height=6, units="in")
# Cool!
# You’re right: we can’t see which player is which, but don’t worry — we’ll solve that with some direct labelling in a minute (labelling points directly makes things a lot easier for our audience, who won’t have to keep checking back-and-forth between visual elements and a legend — for more on that, read this wonderful blogpost by Eugene Wei [https://www.eugenewei.com/blog/2017/11/13/remove-the-legend]).

# Before we get to the direct labelling, let’s add a marker point to the end of each line, drawing attention to each player’s current total.
# Watch for a few details here:
# We dont’t want a point for every single title won, or our graphic would be extremely dense — we just want one for each player’s latest data point. To do that, within our `geom_point()` function, we’re using the `data = . %>%` syntax to manipulate our plot data on the fly. Specifically, for each player we’re dropping everything but the row containing the highest number in the cumulative title wins column.
# One more thing: with `shape=21` we’re specifying that we want a circle with distinct fill and stroke, so each can be styled separately. This lets us give each circle a white stroke, lifting it visually clear of the line behind it.
p3 <- ggplot(plot_data, aes(age, titles, group=player_name, col=colour, fill=colour)) +
  theme_minimal() +
  geom_step(direction="hv") +
  geom_point(data = . %>% group_by(player_name) %>% top_n(1, titles), shape=21, col="white", size=2.5, stroke=1) +
  scale_color_identity() +
  scale_fill_identity()
p3
ggsave("federer_100_p3.png", width=10, height=6, units="in")

# OK, now onto that direct labelling.
# Let’s start simple: we’ll use the same filtering-on-the-fly as we did with our marker points to add text labels only on the last point for each player, with one additional filtering step: we’re labelling _only_ our five highlighted players.
p4 <- ggplot(plot_data, aes(age, titles, group=player_name, col=colour, fill=colour)) +
  theme_minimal() +
  geom_step(direction="hv") +
  geom_point(data = . %>% group_by(player_name) %>% top_n(1, titles), shape=21, col="white", size=2.5, stroke=1) +
  geom_text(data = . %>% group_by(player_name) %>% top_n(1, titles) %>% filter(player_name %in% highlighted_players), aes(label=player_name), family="M+ 1p medium") +
  scale_color_identity() +
  scale_fill_identity()
p4
ggsave("federer_100_p4.png", width=10, height=6, units="in")

# Decent start, but again we can do better. First off, we’ll want to offset those labels from the points, so they don’t overlap. While we’re at it, how about we go for surnames only? That will save some clutter, and all the names are both a) clearly distinct, and b) instantly recognisable without first names.
# Okay, so how do we do that?
# For the offset, we use the `hjust` parameter. Setting it to `0` will anchor the start of each text label to the data point, meaning the names will run out to the right. To be safe in terms of spacing, let’s also add a couple of non-breaking-space characters in front of each name, giving us daylight between point and label.
# To get surnames only, we’ll use some regex, telling to get rid of any characters before and including a space in each player’s name.
p5 <- ggplot(plot_data, aes(age, titles, group=player_name, col=colour, fill=colour)) +
  theme_minimal() +
  geom_step(direction="hv") +
  geom_point(data = . %>% group_by(player_name) %>% top_n(1, titles), shape=21, col="white", size=2.5, stroke=1) +
  geom_text(data = . %>% group_by(player_name) %>% top_n(1, titles) %>% filter(player_name %in% highlighted_players), aes(label=player_name %>% gsub(".+\\s","",.) %>% paste0("  ",.)), family="M+ 1p medium", hjust=0) +
  scale_color_identity() +
  scale_fill_identity()
p5
ggsave("federer_100_p5.png", width=10, height=6, units="in")

# Cool! But as usual, we can do better. See Nadal? His label is overlapping Federer’s line. Not a disaster, but it would be nice to neaten this up. For this, we need to give him a special `hjust` treatment, offsetting his label to the left instead of the right. For that, the best approach is to add this back into our plot data...
plot_data <- plot_data %>%
  mutate(hj = if_else(player_name == "Rafael Nadal", 1, 0))
# ...and then use that variable to determine the `hjust` value for each player.
# One more little trick here: to get that space between Nadal’s label and point, we need those non-breaking-spaces at the _other end_ of his label. To safe us faffing about with different spacing settings for each player, we can just add spaces to both ends of all names.
p6 <- ggplot(plot_data, aes(age, titles, group=player_name, col=colour, fill=colour)) +
  theme_minimal() +
  geom_step(direction="hv") +
  geom_point(data = . %>% group_by(player_name) %>% top_n(1, titles), shape=21, col="white", size=2.5, stroke=1) +
  geom_text(data = . %>% group_by(player_name) %>% top_n(1, titles) %>% filter(player_name %in% highlighted_players), aes(label=player_name %>% gsub(".+\\s","",.) %>% paste0("  ",.,"  "), hjust=hj), family="M+ 1p medium") +
  scale_color_identity() +
  scale_fill_identity()
p6
ggsave("federer_100_p6.png", width=10, height=6, units="in")

# Nice! Now, one other thing you may have noticed is that Federer’s label has disappeared off the right-hand edge of our chart. That’s not good! To fix that, let’s manually adjust our x-axis, giving it more room to breathe on the right. I’ve chosen limits of 15 years (this means we keep the prodigiously early starts of Richard Gasquet, Lleyton Hewitt and Rafa Nadal), and 41 for that extra room for Federer’s labe.
# Note, though, that none of our players were actually still playing at 40, so let’s make sure that doesn’t misleadingly appear among our x-axis ticks. We can do that by explicitly specificying the tick values we want — ags 15 to 35, at 5 year intervals.
# While we’re on the topic of axes, let’s neaten things up a bit more. It’s often good practice to bring the y-axis over to the right-hand-side on any chronological chart, since the most recent data points are typically the most salient, and therefore would benefit from being closer to axis ticks for easy reading of the values.
# In our `scale_` commands below, the `expand=c(0,0)` argument is telling them ot interpret these limits strictly, rather than giving any additional buffer. I sometimes like to do this just so I know exactly what each bit of space in my plot is being used for.
p7 <- ggplot(plot_data, aes(age, titles, group=player_name, col=colour, fill=colour)) +
  theme_minimal() +
  geom_step(direction="hv") +
  geom_point(data = . %>% group_by(player_name) %>% top_n(1, titles), shape=21, col="white", size=2.5, stroke=1) +
  geom_text(data = . %>% group_by(player_name) %>% top_n(1, titles) %>% filter(player_name %in% highlighted_players), aes(label=player_name %>% gsub(".+\\s","",.) %>% paste0("  ",.,"  "), hjust=hj), family="M+ 1p medium") +
  scale_color_identity() +
  scale_fill_identity() +
  scale_x_continuous(limits=c(15, 41), breaks=seq(15,35,5), expand=c(0,0)) +
  scale_y_continuous(position = "right", expand=expand_scale(add=c(0,5)))
p7
ggsave("federer_100_p7.png", width=10, height=6, units="in")

# Next, how about some styling? We’re using the defaults for our plotting area at the moment: lots of fairly ordinary looking grey gridlines, no strong axis lines etc. I’ve said it before and I’ll say it again: we can do better!
# Let’s make some changes to our `theme` to clean things up. There’s a lot going on here, so I’ll talk through each substantive theme change:
p8 <- ggplot(plot_data, aes(age, titles, group=player_name, col=colour, fill=colour)) +
  theme_minimal() +
  geom_step(direction="hv") +
  geom_point(data = . %>% group_by(player_name) %>% top_n(1, titles), shape=21, col="white", size=2.5, stroke=1) +
  geom_text(data = . %>% group_by(player_name) %>% top_n(1, titles) %>% filter(player_name %in% highlighted_players), aes(label=player_name %>% gsub(".+\\s","",.) %>% paste0("  ",.,"  "), hjust=hj), family="M+ 1p medium") +
  scale_color_identity() +
  scale_fill_identity() +
  scale_x_continuous(limits=c(15, 41), breaks=seq(15,35,5), expand=c(0,0)) +
  scale_y_continuous(position = "right", expand=expand_scale(add=c(0,5))) +
  theme(
    # These first four deal with out gridlines. Minor gridilnes are usually clutter, and that’s true in this case. Let’s get rid of them.
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),
    # The main thing we care about is the number of titles each player has, so we’ll keep the y-axis ticks, but let’s make them a bit more minimal by stripping back both the colour and the stroke width.
    panel.grid.major.y = element_line(color="gray85", size=0.3),
    # Those horizontal gridlines will double as y-axis ticks, so we can lose the inbuilt y ticks, but let’s add some nice little x-axis ticks for our age breaks, and let’s darken the x-axis line to make it serve as a clear baseline on which the rest of the chart sits
    axis.ticks.y = element_blank(),
    axis.ticks.x = element_line(color="#212121", size=0.3),
    axis.ticks.length = unit(0.2, "cm"),
    axis.line.x = element_line(size=0.3, color="#212121"),
    # This next one is fairly pedantic, but I think it matters: we should right-align our vertical tick labels so the units are all vertically aligned, as are the tens, the hundreds and so on
    axis.text.y.right = element_text(hjust=1),
    # We’ll get rid of the y-axis title altogether, since we’ll put this in the plot subtitle instead
    axis.title.y = element_blank(),
    text = element_text(family="M+ 1p regular"),
    plot.title = element_text(family="M+ 1p medium"),
    plot.caption = element_text(family="M+ 1p regular", hjust=0)
  )
p8
ggsave("federer_100_p8.png", width=10, height=6, units="in")

# As far as the chart itself goes, we’re almost there, but we’re yet to address one crucial element: the title! Text — and titles in particular — are probably the single most imporant part of any chart [https://vcg.seas.harvard.edu/files/pfister/files/infovis_submission251-camera.pdf], so let’s address that now with a nice narrative title, giving our audience the single most important message we would hope they take away from the chart.
# At the same time, let’s add a subtitle setting out the y-axis units, and a source and credit at the bottom
p9 <- ggplot(plot_data, aes(age, titles, group=player_name, col=colour, fill=colour)) +
  theme_minimal() +
  geom_step(direction="hv") +
  geom_point(data = . %>% group_by(player_name) %>% top_n(1, titles), shape=21, col="white", size=2.5, stroke=1) +
  geom_text(data = . %>% group_by(player_name) %>% top_n(1, titles) %>% filter(player_name %in% highlighted_players), aes(label=player_name %>% gsub(".+\\s","",.) %>% paste0("  ",.,"  "), hjust=hj), family="M+ 1p medium") +
  scale_color_identity() +
  scale_fill_identity() +
  scale_x_continuous(limits=c(15, 41), breaks=seq(15,35,5), expand=c(0,0)) +
  scale_y_continuous(position = "right", expand=expand_scale(add=c(0,5))) +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color="gray85", size=0.3),
    axis.ticks.y = element_blank(),
    axis.ticks.x = element_line(color="#212121", size=0.3),
    axis.ticks.length = unit(0.2, "cm"),
    axis.line.x = element_line(size=0.3, color="#212121"),
    axis.text.y.right = element_text(hjust=1),
    axis.title.y = element_blank(),
    text = element_text(family="M+ 1p regular"),
    plot.title = element_text(family="M+ 1p medium"),
    plot.caption = element_text(family="M+ 1p regular", hjust=0)
  ) +
  labs(
    x = "Age",
    y ="",
    title = "Roger Federer has become the first ever player to win 100 titles on the ATP Tour",
    subtitle = "Cumulative ATP Tour titles won, by age",
    caption = "Source: Ultimate Tennis Statistics    |    Graphic: John Burn-Murdoch / @jburnmurdoch"
  )
p9
ggsave("federer_100_p9.png", width=10, height=6, units="in")

# Lovely stuff!
# That pretty much does it, but there are a couple of final touches that I think are worth making:
# First, let’s explicitly label the number of titles each of our highlighed players has won. We’ll do that by adding in the additional data to the `paste0()` label-building part our `geom_text` command
# And second, let’s bump up the size of our five highlighted players’s lines, and knock back the opacity of the un-highlighted players slightly. To do this, we’ll use the `alpha` aesthetic across the whole plot, and the size one for our lines.
# Finally we’ll tell ggplot not to display the legends for those size and alpha scales, since we’re just using them to highlight our players of interest, rather than to encode any additional information.
p10 <- ggplot(plot_data, aes(age, titles, group=player_name, col=colour, fill=colour, alpha=player_name %in% highlighted_players)) +
  theme_minimal() +
  geom_step(direction="hv", aes(size=player_name %in% highlighted_players)) +
  geom_point(data = . %>% group_by(player_name) %>% top_n(1, titles), shape=21, col="white", size=2.5, stroke=1) +
  geom_text(data = . %>% group_by(player_name) %>% top_n(1, titles) %>% filter(player_name %in% highlighted_players), aes(label=player_name %>% gsub(".+\\s","",.) %>% paste0("  ",.,":",titles,"  "), hjust=hj), family="M+ 1p medium") +
  scale_color_identity() +
  scale_fill_identity() +
  scale_alpha_manual(values=c(0.7, 1), guide=F) +
  scale_size_manual(values=c(0.5, 0.8), guide=F) +
  scale_x_continuous(limits=c(15, 41), breaks=seq(15,35,5), expand=c(0,0)) +
  scale_y_continuous(position = "right", expand=expand_scale(add=c(0,5))) +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color="gray85", size=0.3),
    axis.ticks.y = element_blank(),
    axis.ticks.x = element_line(color="#212121", size=0.3),
    axis.ticks.length = unit(0.2, "cm"),
    axis.line.x = element_line(size=0.3, color="#212121"),
    axis.text.y.right = element_text(hjust=1),
    axis.title.y = element_blank(),
    text = element_text(family="M+ 1p regular"),
    plot.title = element_text(family="M+ 1p medium"),
    plot.caption = element_text(family="M+ 1p regular", hjust=0)
  ) +
  labs(
    x = "Age",
    y ="",
    title = "Roger Federer has become the first ever player to win 100 titles on the ATP Tour",
    subtitle = "Cumulative ATP Tour titles won, by age",
    caption = "Source: Ultimate Tennis Statistics    |    Graphic: John Burn-Murdoch / @jburnmurdoch"
  )
p10
ggsave("federer_100_p10.png", width=10, height=6, units="in")