Skip to content

Instantly share code, notes, and snippets.

@jeanpaulrsoucy
Created April 8, 2021 23:48
Show Gist options
  • Save jeanpaulrsoucy/2132259122e68c17ba39b26128aa3eda to your computer and use it in GitHub Desktop.
Save jeanpaulrsoucy/2132259122e68c17ba39b26128aa3eda to your computer and use it in GitHub Desktop.
# Scraping and plotting Canadian COVID-19 VOC data
*Jean-Paul R. Soucy*
Let's scrape the variant of concern (VOC) data from the [CTV News variant tracker](https://www.ctvnews.ca/health/coronavirus/tracking-variants-of-the-novel-coronavirus-in-canada-1.5296141) (maintained by journalists [Jesse Tahirali](https://twitter.com/jessetahirali) and [Stephanie Liu](https://twitter.com/_stephanieliu)) and from the [Public Health Agency of Canada](https://health-infobase.canada.ca/covid-19/epidemiological-summary-covid-19-cases.html#VOC) (PHAC).
```{r variant-data}
# load packages
library(jsonlite)
suppressPackageStartupMessages(library(dplyr))
library(tidyr)
# Load the PHAC variant summary and tidy it into one row per date and province,
# with one count column per variant (B117, B1351, P1).
phac <- read.csv(
  "https://health-infobase.canada.ca/src/data/covidLive/covid19-epiSummary-voc.csv",
  stringsAsFactors = FALSE) %>%
  ## keep relevant columns
  select(report_date, prov, b117, b1351, p1) %>%
  ## rename columns
  rename(date = report_date, province = prov, B117 = b117, B1351 = b1351, P1 = p1) %>%
  ## convert dates and rename "CA" to "Canada"
  mutate(
    date = as.Date(date),
    province = if_else(province == "CA", "Canada", province)
  ) %>%
  ## add a row for every date/province combination absent from the source
  complete(date, province) %>%
  ## the counts are plotted as cumulative totals, so a gap must repeat the
  ## last reported value for that province instead of dropping to zero
  arrange(province, date) %>%
  group_by(province) %>%
  fill(B117, B1351, P1, .direction = "down") %>%
  ungroup() %>%
  ## whatever is still missing precedes the province's first reported value
  replace_na(list(B117 = 0, B1351 = 0, P1 = 0)) %>%
  ## arrange
  arrange(date, province)
# reshape the PHAC table so each row is one date/province/variant count,
# which is the layout the plotting code maps to colour and y
phac_plot <- phac %>%
  pivot_longer(
    cols = c(B117, B1351, P1),
    names_to = "variant",
    values_to = "count"
  )
# download the CTV News variant tracker (served as JSON)
ctv <- fromJSON(
  "https://beta.ctvnews.ca/content/dam/common/exceltojson/COVID-Variants.txt",
  flatten = FALSE
)
## drop rows whose Date field is blank or holds a summary label
ctv <- filter(ctv, !Date %in% c("", "Updated", "Total"))
## Date is treated as an Excel serial day number (origin 1899-12-30)
ctv <- mutate(ctv, date = as.Date(as.integer(Date), origin = "1899-12-30"))
# create usable table: one row per date and province, one column per variant.
# Source columns are named "<province>_<variant>", so a single pivot splits the
# name on "_" and uses the variant part as the output column (".value").
# Doing all three variants in one pivot keeps their rows aligned by
# construction and lets later steps refer to columns by name.
ctv <- ctv %>%
  ## keep the date plus every per-province variant column
  select(date, ends_with(c("B117", "B1351", "P1"))) %>%
  pivot_longer(
    cols = -date,
    names_to = c("province", ".value"),
    names_sep = "_",
    values_transform = list(
      B117 = as.integer,
      B1351 = as.integer,
      P1 = as.integer
    )
  ) %>%
  ## fix the column order regardless of the order in the source sheet
  select(date, province, B117, B1351, P1) %>%
  arrange(date, province) %>%
  ## carry the last reported count forward within each province
  group_by(province) %>%
  fill(B117, B1351, P1, .direction = "down") %>%
  ungroup() %>%
  ## anything still missing precedes the province's first reported value
  replace_na(list(B117 = 0, B1351 = 0, P1 = 0))
# reshape the CTV table so each row is one date/province/variant count,
# mirroring the layout of the PHAC plotting table
ctv_plot <- ctv %>%
  pivot_longer(
    cols = c(B117, B1351, P1),
    names_to = "variant",
    values_to = "count"
  )
# label each long table with its data source, then stack them into one table
phac_plot <- mutate(phac_plot, source = "PHAC")
ctv_plot <- mutate(ctv_plot, source = "CTV News")
variants <- bind_rows(phac_plot, ctv_plot)
```
Let's plot the two time series for Canada.
```{r variants-can, fig.cap = "Cumulative time series for three COVID-19 variants of concern in Canada. Data from CTV News (solid) and Public Health Agency of Canada (dashed)."}
# load packages
library(ggplot2)
library(ggpubr)
# Canada-wide series: colour separates the variants, line type the data source
variants %>%
  filter(province == "Canada") %>%
  ggplot(mapping = aes(x = date, y = count, colour = variant, linetype = source)) +
  geom_line() +
  labs(
    title = "COVID Variants in Canada",
    x = "Public reporting date",
    y = "Cumulative variants reported",
    colour = "Variant",
    linetype = "Source"
  ) +
  theme_pubr() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 11),
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 14),
    # right margin adds a small gap between the y-axis title and the tick labels
    axis.title.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 0))
  )
```
Let's plot the two time series for Ontario.
```{r variants-on, fig.cap = "Cumulative time series for three COVID-19 variants of concern in Ontario. Data from CTV News (solid) and Public Health Agency of Canada (dashed)."}
# Ontario series: colour separates the variants, line type the data source
variants %>%
  filter(province == "ON") %>%
  ggplot(mapping = aes(x = date, y = count, colour = variant, linetype = source)) +
  geom_line() +
  labs(
    title = "COVID Variants in Ontario",
    x = "Public reporting date",
    y = "Cumulative variants reported",
    colour = "Variant",
    linetype = "Source"
  ) +
  theme_pubr() +
  theme(
    legend.position = "bottom",
    legend.title = element_text(size = 13),
    legend.text = element_text(size = 11),
    plot.title = element_text(size = 16, hjust = 0.5),
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 14),
    # right margin adds a small gap between the y-axis title and the tick labels
    axis.title.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 0))
  )
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment