Skip to content

Instantly share code, notes, and snippets.

@johnburnmurdoch
Created April 15, 2020 08:04
Show Gist options
  • Save johnburnmurdoch/1d23978fc8213ff656d9f629608dd1f5 to your computer and use it in GitHub Desktop.
Save johnburnmurdoch/1d23978fc8213ff656d9f629608dd1f5 to your computer and use it in GitHub Desktop.
# Install and load required packages
install.packages("needs")
library(needs)
needs(tidyverse, magrittr, animation, pdftools, png, scales)
# Function that extracts data from Google Mobility PDFs
process_google_mobility <- function(country_code, start_date, end_date){
# Convert first page of PDF into high-res PNG
pdf_convert(paste0("https://www.gstatic.com/covid19/mobility/",end_date,"_",country_code,"_Mobility_Report_en.pdf"), format = "png", pages = 1, dpi = 300, filenames = "IMG1.png")
# Convert second page of PDF into high-res PNG
pdf_convert(paste0("https://www.gstatic.com/covid19/mobility/",end_date,"_",country_code,"_Mobility_Report_en.pdf"), format = "png", pages = 2, dpi = 300, filenames = "IMG2.png")
# Read PDF page one image into R as an array of red, green and blue pixel values
img <- readPNG("IMG1.png")
# Rescale the pixel values from 0-1 into 0-255
r <- img[,,1] * 255
g <- img[,,2] * 255
b <- img[,,3] * 255
# rgb(66,133,244) is the rgb specification of the blue line used in Google’s charts
# retail chart extent is from pixel 1500 to 1786 vertically, and from 853 to 1328 left to right, so we loop through those pixels from left to right, and in each case
val <- c(853:1328) %>%
map_dbl(~{
# find the vertical pixels that match our specified rgb() colour, and average them to get the middle pixel of the line on the chart
mean(which(
r[1500:1822,.x]==66 & g[1500:1822,.x]==133 & b[1500:1822,.x]==244
))
}) %>%
# rescale this value so instead of being a pixel number, it’s a value from -100 to 80 (the domain of the y-axis)
scales::rescale(to=c(80, -100), from = c(1, 322))
# then generate a series of data values equal to the number of horizontal chart pixels, so we can match a date/time to every y-axis value
date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val))
# then join values to dates, summarise by individual day (one value per day), and add columns specifying the chart’s subject matter and the country
retail <- tibble(date, val) %>%
group_by(date) %>%
summarise(val = mean(val)) %>%
mutate(measure = "Retail", country_code)
# then repeat for parks and transit
# parks
val <- c(853:1328) %>%
map_dbl(~{
mean(which(
r[2476:2798,.x]==66 & g[2476:2798,.x]==133 & b[2476:2798,.x]==244
))
}) %>%
scales::rescale(to=c(80, -100), from = c(1, 322))
date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val))
parks <- tibble(date, val) %>%
group_by(date) %>%
summarise(val = mean(val)) %>%
mutate(measure = "Parks", country_code)
# transit is on page two, so we need to load in the new image
img <- readPNG("IMG2.png")
r <- img[,,1] * 255
g <- img[,,2] * 255
b <- img[,,3] * 255
# transit
val <- c(785:1259) %>%
map_dbl(~{
mean(which(
r[222:544,.x]==66 & g[222:544,.x]==133 & b[222:544,.x]==244
))
}) %>%
scales::rescale(to=c(80, -100), from = c(1, 322))
date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val))
transit <- tibble(date, val) %>%
group_by(date) %>%
summarise(val = mean(val)) %>%
mutate(measure = "Transit", country_code)
# then combine retail, parks and transit into one table, and return this
return(bind_rows(retail, parks, transit))
}
# Loop through countries of interest, processing Google’s two reports (to date) for each of them, and joining them together
countries <- c("GB", "US", "FR", "DE", "ES", "IT", "NO", "NL", "SE", "BE", "AT")
countries %>%
map_dfr(~{
data <- bind_rows(
process_google_mobility(.x, "2020-02-16", "2020-03-29"),
process_google_mobility(.x, "2020-02-23", "2020-04-05")
)
}) -> GCM_master
# PLot this dataset as small multiples; one chart for each topic, one line on each per country
GCM_master %>%
# Get rid of any missing data
filter(!is.na(val)) %>%
# Make sure dates are read as dates (days) not date-times
mutate(
date = as.character(as.Date(date, "%Y-%m-%d")),
date = as.Date(date)
) %>%
# Make sure we only have one value per country per topic per day
group_by(measure, date, country_code) %>%
summarise(val = mean(val)) %>%
group_by(measure, country_code) %>%
# Create a smoothed moving average to iron out noise in the daily data (if you want)
mutate(smoothed = roll_meanr(val, 7)) %>%
# Filter for only a subset of countries of interest for this plot
filter(country_code %in% c("GB", "IT", "FR", "ES", "SE", "DE", "US")) %>%
# Plot moving average (or raw data) vs date
ggplot(aes(date, smoothed)) +
# Add a dark grid line for zero on the y-asix
geom_hline(yintercept = 0) +
# Draw one line per country
geom_line(aes(col = country_code), size=0.5) +
# Add a highlight for a country of focus, first adding a thicker white line to create a border behind the main one
geom_line(size=1.5, data = . %>% filter(country_code == "GB"), col = "white") +
geom_line(size=1, data = . %>% filter(country_code == "GB"), col = "black") +
# Add country labels to the end of each line
geom_text_repel(aes(col = country_code, label = country_code), direction = "y", data = . %>% group_by(country_code, measure) %>% top_n(1, date), hjust=0) +
# Clean up the x-axis
scale_x_date(limits = c(as.Date("2020-02-21"), as.Date("2020-04-06")), breaks = c(as.Date("2020-02-22"), as.Date("2020-04-05")), labels = function(x)format(x,"%d %b")) +
# Duplicate y-axis for easy reading of values
scale_y_continuous(limits=c(-100,40), breaks = seq(-80, 40, 20), expand=c(0,0), sec.axis = dup_axis()) +
# Small multiples: one chart per topic
facet_wrap(~measure, nrow=1) +
# Clean up the plot theme
theme_minimal() +
theme(
panel.grid.minor = element_blank(),
panel.grid.major.x = element_blank(),
axis.ticks = element_line(),
axis.title = element_blank(),
legend.position = "none"
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment