johnburnmurdoch/google_covid_mobility_scraper.R

## google_covid_mobility_scraper.R
# Install and load required packages
install.packages("needs")
library(needs)
needs(tidyverse, magrittr, animation, pdftools, png, scales)

# Function that extracts data from Google Mobility PDFs
process_google_mobility <- function(country_code, start_date, end_date){

  # Convert first page of PDF into high-res PNG
  pdf_convert(paste0("https://www.gstatic.com/covid19/mobility/",end_date,"_",country_code,"_Mobility_Report_en.pdf"), format = "png", pages = 1, dpi = 300, filenames = "IMG1.png")

  # Convert second page of PDF into high-res PNG
  pdf_convert(paste0("https://www.gstatic.com/covid19/mobility/",end_date,"_",country_code,"_Mobility_Report_en.pdf"), format = "png", pages = 2, dpi = 300, filenames = "IMG2.png")

  # Read PDF page one image into R as an array of red, green and blue pixel values
  img <- readPNG("IMG1.png")

  # Rescale the pixel values from 0-1 into 0-255
  r <- img[,,1] * 255
  g <- img[,,2] * 255
  b <- img[,,3] * 255

  # rgb(66,133,244) is the rgb specification of the blue line used in Google’s charts

  # retail chart extent is from pixel 1500 to 1786 vertically, and from 853 to 1328 left to right, so we loop through those pixels from left to right, and in each case

  val <- c(853:1328) %>%
    map_dbl(~{
      # find the vertical pixels that match our specified rgb() colour, and average them to get the middle pixel of the line on the chart
      mean(which(
        r[1500:1822,.x]==66 & g[1500:1822,.x]==133 & b[1500:1822,.x]==244
      ))
    }) %>%
    # rescale this value so instead of being a pixel number, it’s a value from -100 to 80 (the domain of the y-axis)
    scales::rescale(to=c(80, -100), from = c(1, 322))
  # then generate a series of data values equal to the number of horizontal chart pixels, so we can match a date/time to every y-axis value
  date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val))
  # then join values to dates, summarise by individual day (one value per day), and add columns specifying the chart’s subject matter and the country
  retail <- tibble(date, val) %>%
    group_by(date) %>%
    summarise(val = mean(val)) %>%
    mutate(measure = "Retail", country_code)

  # then repeat for parks and transit

  # parks
  val <- c(853:1328) %>%
    map_dbl(~{
      mean(which(
        r[2476:2798,.x]==66 & g[2476:2798,.x]==133 & b[2476:2798,.x]==244
      ))
    }) %>%
    scales::rescale(to=c(80, -100), from = c(1, 322))
  date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val))
  parks <- tibble(date, val) %>%
    group_by(date) %>%
    summarise(val = mean(val)) %>%
    mutate(measure = "Parks", country_code)

  # transit is on page two, so we need to load in the new image

  img <- readPNG("IMG2.png")

  r <- img[,,1] * 255
  g <- img[,,2] * 255
  b <- img[,,3] * 255

  # transit
  val <- c(785:1259) %>%
    map_dbl(~{
      mean(which(
        r[222:544,.x]==66 & g[222:544,.x]==133 & b[222:544,.x]==244
      ))
    }) %>%
    scales::rescale(to=c(80, -100), from = c(1, 322))
  date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val))
  transit <- tibble(date, val) %>%
    group_by(date) %>%
    summarise(val = mean(val)) %>%
    mutate(measure = "Transit", country_code)

  # then combine retail, parks and transit into one table, and return this
  return(bind_rows(retail, parks, transit))

}

# Loop through countries of interest, processing Google’s two reports (to date) for each of them, and joining them together
countries <- c("GB", "US", "FR", "DE", "ES", "IT", "NO", "NL", "SE", "BE", "AT")
countries %>%
  map_dfr(~{
    data <- bind_rows(
      process_google_mobility(.x, "2020-02-16", "2020-03-29"),
      process_google_mobility(.x, "2020-02-23", "2020-04-05")
    )
  }) -> GCM_master

# PLot this dataset as small multiples; one chart for each topic, one line on each per country
GCM_master %>%
  # Get rid of any missing data
  filter(!is.na(val)) %>%
  # Make sure dates are read as dates (days) not date-times
  mutate(
    date = as.character(as.Date(date, "%Y-%m-%d")),
    date = as.Date(date)
  ) %>%
  # Make sure we only have one value per country per topic per day
  group_by(measure, date, country_code) %>%
  summarise(val = mean(val)) %>%
  group_by(measure, country_code) %>%
  # Create a smoothed moving average to iron out noise in the daily data (if you want)
  mutate(smoothed = roll_meanr(val, 7)) %>%
  # Filter for only a subset of countries of interest for this plot
  filter(country_code %in% c("GB", "IT", "FR", "ES", "SE", "DE", "US")) %>%
  # Plot moving average (or raw data) vs date
  ggplot(aes(date, smoothed)) +
  # Add a dark grid line for zero on the y-asix
  geom_hline(yintercept = 0) +
  # Draw one line per country
  geom_line(aes(col = country_code), size=0.5) +
  # Add a highlight for a country of focus, first adding a thicker white line to create a border behind the main one
  geom_line(size=1.5, data = . %>% filter(country_code == "GB"), col = "white") +
  geom_line(size=1, data = . %>% filter(country_code == "GB"), col = "black") +
  # Add country labels to the end of each line
  geom_text_repel(aes(col = country_code, label = country_code), direction = "y", data = . %>% group_by(country_code, measure) %>% top_n(1, date), hjust=0) +
  # Clean up the x-axis
  scale_x_date(limits = c(as.Date("2020-02-21"), as.Date("2020-04-06")), breaks = c(as.Date("2020-02-22"), as.Date("2020-04-05")), labels = function(x)format(x,"%d %b")) +
  # Duplicate y-axis for easy reading of values
  scale_y_continuous(limits=c(-100,40), breaks = seq(-80, 40, 20), expand=c(0,0), sec.axis = dup_axis()) +
  # Small multiples: one chart per topic
  facet_wrap(~measure, nrow=1) +
  # Clean up the plot theme
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major.x = element_blank(),
    axis.ticks = element_line(),
    axis.title = element_blank(),
    legend.position = "none"
  )
	# Install and load required packages
	install.packages("needs")
	library(needs)
	needs(tidyverse, magrittr, animation, pdftools, png, scales)

	# Function that extracts data from Google Mobility PDFs
	process_google_mobility <- function(country_code, start_date, end_date){

	# Convert first page of PDF into high-res PNG
	pdf_convert(paste0("https://www.gstatic.com/covid19/mobility/",end_date,"_",country_code,"_Mobility_Report_en.pdf"), format = "png", pages = 1, dpi = 300, filenames = "IMG1.png")

	# Convert second page of PDF into high-res PNG
	pdf_convert(paste0("https://www.gstatic.com/covid19/mobility/",end_date,"_",country_code,"_Mobility_Report_en.pdf"), format = "png", pages = 2, dpi = 300, filenames = "IMG2.png")

	# Read PDF page one image into R as an array of red, green and blue pixel values
	img <- readPNG("IMG1.png")

	# Rescale the pixel values from 0-1 into 0-255
	r <- img[,,1] * 255
	g <- img[,,2] * 255
	b <- img[,,3] * 255

	# rgb(66,133,244) is the rgb specification of the blue line used in Google’s charts

	# retail chart extent is from pixel 1500 to 1786 vertically, and from 853 to 1328 left to right, so we loop through those pixels from left to right, and in each case

	val <- c(853:1328) %>%
	map_dbl(~{
	# find the vertical pixels that match our specified rgb() colour, and average them to get the middle pixel of the line on the chart
	mean(which(
	r[1500:1822,.x]==66 & g[1500:1822,.x]==133 & b[1500:1822,.x]==244
	))
	}) %>%
	# rescale this value so instead of being a pixel number, it’s a value from -100 to 80 (the domain of the y-axis)
	scales::rescale(to=c(80, -100), from = c(1, 322))
	# then generate a series of data values equal to the number of horizontal chart pixels, so we can match a date/time to every y-axis value
	date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val))
	# then join values to dates, summarise by individual day (one value per day), and add columns specifying the chart’s subject matter and the country
	retail <- tibble(date, val) %>%
	group_by(date) %>%
	summarise(val = mean(val)) %>%
	mutate(measure = "Retail", country_code)

	# then repeat for parks and transit

	# parks
	val <- c(853:1328) %>%
	map_dbl(~{
	mean(which(
	r[2476:2798,.x]==66 & g[2476:2798,.x]==133 & b[2476:2798,.x]==244
	))
	}) %>%
	scales::rescale(to=c(80, -100), from = c(1, 322))
	date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val))
	parks <- tibble(date, val) %>%
	group_by(date) %>%
	summarise(val = mean(val)) %>%
	mutate(measure = "Parks", country_code)

	# transit is on page two, so we need to load in the new image

	img <- readPNG("IMG2.png")

	r <- img[,,1] * 255
	g <- img[,,2] * 255
	b <- img[,,3] * 255

	# transit
	val <- c(785:1259) %>%
	map_dbl(~{
	mean(which(
	r[222:544,.x]==66 & g[222:544,.x]==133 & b[222:544,.x]==244
	))
	}) %>%
	scales::rescale(to=c(80, -100), from = c(1, 322))
	date <- seq.Date(as.Date(start_date), as.Date(end_date), length.out = length(val))
	transit <- tibble(date, val) %>%
	group_by(date) %>%
	summarise(val = mean(val)) %>%
	mutate(measure = "Transit", country_code)

	# then combine retail, parks and transit into one table, and return this
	return(bind_rows(retail, parks, transit))

	}

	# Loop through countries of interest, processing Google’s two reports (to date) for each of them, and joining them together
	countries <- c("GB", "US", "FR", "DE", "ES", "IT", "NO", "NL", "SE", "BE", "AT")
	countries %>%
	map_dfr(~{
	data <- bind_rows(
	process_google_mobility(.x, "2020-02-16", "2020-03-29"),
	process_google_mobility(.x, "2020-02-23", "2020-04-05")
	)
	}) -> GCM_master

	# PLot this dataset as small multiples; one chart for each topic, one line on each per country
	GCM_master %>%
	# Get rid of any missing data
	filter(!is.na(val)) %>%
	# Make sure dates are read as dates (days) not date-times
	mutate(
	date = as.character(as.Date(date, "%Y-%m-%d")),
	date = as.Date(date)
	) %>%
	# Make sure we only have one value per country per topic per day
	group_by(measure, date, country_code) %>%
	summarise(val = mean(val)) %>%
	group_by(measure, country_code) %>%
	# Create a smoothed moving average to iron out noise in the daily data (if you want)
	mutate(smoothed = roll_meanr(val, 7)) %>%
	# Filter for only a subset of countries of interest for this plot
	filter(country_code %in% c("GB", "IT", "FR", "ES", "SE", "DE", "US")) %>%
	# Plot moving average (or raw data) vs date
	ggplot(aes(date, smoothed)) +
	# Add a dark grid line for zero on the y-asix
	geom_hline(yintercept = 0) +
	# Draw one line per country
	geom_line(aes(col = country_code), size=0.5) +
	# Add a highlight for a country of focus, first adding a thicker white line to create a border behind the main one
	geom_line(size=1.5, data = . %>% filter(country_code == "GB"), col = "white") +
	geom_line(size=1, data = . %>% filter(country_code == "GB"), col = "black") +
	# Add country labels to the end of each line
	geom_text_repel(aes(col = country_code, label = country_code), direction = "y", data = . %>% group_by(country_code, measure) %>% top_n(1, date), hjust=0) +
	# Clean up the x-axis
	scale_x_date(limits = c(as.Date("2020-02-21"), as.Date("2020-04-06")), breaks = c(as.Date("2020-02-22"), as.Date("2020-04-05")), labels = function(x)format(x,"%d %b")) +
	# Duplicate y-axis for easy reading of values
	scale_y_continuous(limits=c(-100,40), breaks = seq(-80, 40, 20), expand=c(0,0), sec.axis = dup_axis()) +
	# Small multiples: one chart per topic
	facet_wrap(~measure, nrow=1) +
	# Clean up the plot theme
	theme_minimal() +
	theme(
	panel.grid.minor = element_blank(),
	panel.grid.major.x = element_blank(),
	axis.ticks = element_line(),
	axis.title = element_blank(),
	legend.position = "none"
	)