@sbfnk
Last active January 29, 2020 09:40
code to estimate time-varying R0 for nCoV
library('tidyverse') ## attaches ggplot2, dplyr and readr among others
##' Samples the size (number of trials) of a binomial distribution, given
##' x observed successes and success probability prob
##' copied from https://github.com/sbfnk/bpmodels/blob/master/R/utils.r
rbinom_size <- function(n, x, prob) {
  x + stats::rnbinom(n, x + 1, prob)
}
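## quick sanity check of the sampler (illustrative sketch, not part of the
## analysis): with x successes observed at probability prob, the sampled
## size has mean x + (x + 1) * (1 - prob) / prob, e.g. about 21 for
## x = 10, prob = 0.5
if (FALSE) {
  set.seed(1)
  mean(rbinom_size(1e4, 10, 0.5)) ## should be close to 21
}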
##
## get data
##
data_dir <-
  path.expand(file.path("~", "Dropbox", "nCoV-2019", "data_sources",
                        "case_data"))
china_onsets <-
  readr::read_csv(file.path(data_dir, "time_series_China_CDC.csv"))
wuhan_confirmed <-
  readr::read_csv(file.path(data_dir, "time_series_HKU_Wuhan.csv"))
linelist <- WuhanSeedingVsTransmission::get_linelist()
## distribution of confirmation delays over time
time_delays <- linelist %>%
  filter(!is.na(delay_confirmation)) %>%
  mutate(days_confirmation = as.integer(delay_confirmation)) %>%
  group_by(date_confirmation) %>%
  summarise(mean = mean(days_confirmation),
            min.90 = quantile(days_confirmation, 0.05),
            max.90 = quantile(days_confirmation, 0.95),
            min.iqr = quantile(days_confirmation, 0.25),
            max.iqr = quantile(days_confirmation, 0.75),
            median = median(days_confirmation))
## TODO: improve on arbitrary cutoff date
cutoff <- as.Date("2020-01-21")
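## only confirmations after this date are used to estimate the reporting
## delay distribution below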
## extract confirmation delays from linelist
delays <- linelist %>%
  dplyr::filter(date_confirmation > cutoff, !is.na(delay_confirmation)) %>%
  .$delay_confirmation %>%
  as.integer()
p <- ggplot(time_delays, aes(x = date_confirmation, y = median)) +
  geom_line() +
  geom_ribbon(aes(ymin = min.iqr, ymax = max.iqr), alpha = 0.4) +
  geom_ribbon(aes(ymin = min.90, ymax = max.90), alpha = 0.2) +
  geom_vline(xintercept = cutoff + 1, linetype = "dashed") +
  xlab("Date of confirmation") + ylab("Delay (in days)") +
  cowplot::theme_cowplot() +
  expand_limits(y = 0)
cowplot::save_plot("time_delays.pdf", p)
##
## get reporting delay distribution
##
## Report was on the 26th, data up to the 25th.
## We combine 0- and 1-day delays into one bin.
##
counts <- table(delays)
## merge 0-day delays into the 1-day bin
counts[2] <- counts[2] + counts[1]
## drop the now-redundant 0-day bin
counts <- counts[-1]
## get cumulative sum
cum_counts <- cumsum(counts)
## get cumulative frequency
cum_freq <- cum_counts / max(cum_counts)
## remove last data point (as it's 1 anyway)
cum_freq <- cum_freq[-length(cum_freq)]
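## names(cum_freq) are delays in days; values are the empirical probability
## that a case is confirmed within that many days -- this is the reporting
## probability passed to rbinom_size() below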
##
## augment data with reporting delays
##
report_date <- as.Date("2020-01-26")
## number of "augmentations"
naug <- 100
## get confirmed cases
onsets <- china_onsets$Confirmed
## arbitrary starting point at day 31 -- too many zeroes before
## TODO: apply more general criterion for starting point or use all the data
onsets <- onsets[31:length(onsets)]
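## note: onsets is now a suffix of china_onsets$Confirmed, which the date
## reconstruction below relies on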
## sample onsets naug times
sampled_onsets <- lapply(seq_len(naug), function(x) {
  ## indices to sample, from the last day in the data backwards
  indices <- length(onsets) + 1 - as.integer(names(cum_freq))
  ## combine unsampled onsets (those that are longer ago than the longest
  ## reporting delay) and sampled onsets (using binomial sampler above)
  x_onsets <-
    c(onsets[seq_len(min(indices - 1))],
      rev(rbinom_size(length(cum_freq), onsets[indices], cum_freq)))
  ## get dates
  ## TODO: this dates thing is done too often, should be simplified
  dates <-
    china_onsets$t[seq(nrow(china_onsets) + 1 - length(x_onsets),
                       nrow(china_onsets))]
  return(tibble(date = dates, cases = x_onsets))
})
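## each element of sampled_onsets is one plausible fully-reported epidemic
## curve: counts on recent days are scaled up by sampling the not yet
## reported cases, while earlier days are kept as observed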
##
## get time-varying reproduction number with three serial interval distributions
##
serial_intervals <- list(
  "SARS-like" = c(mean_si = 8.4, std_si = 3.8),
  "early SARS-like" = c(mean_si = 10.0, std_si = 2.8),
  "MERS-like" = c(mean_si = 6.8, std_si = 4.1)
)
rep_R <- lapply(serial_intervals, function(si) {
  lapply(sampled_onsets, function(x) {
    ## estimate R with (relatively short) 2-day smoothing window
    R <-
      EpiEstim::estimate_R(x$cases,
                           method = "parametric_si",
                           config =
                             EpiEstim::make_config(
                               list(mean_si = si[["mean_si"]],
                                    std_si = si[["std_si"]],
                                    t_start = 2:26, t_end = 3:27)))$R
    ## TODO: better way of filtering out NAs (this breaks if there are NAs
    ## in the middle of the dataset for whatever reason)
    R <- R[!is.na(R$`Mean(R)`), ]
    ## get dates
    ## TODO: this dates thing is done too often, should be simplified
    dates <-
      china_onsets$t[seq(nrow(china_onsets) + 1 - nrow(R),
                         nrow(china_onsets))]
    return(tibble(date = dates, R = rnorm(nrow(R), R$`Mean(R)`, R$`Std(R)`)))
  })
})
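## note: t_start = 2:26 and t_end = 3:27 define sliding 2-day windows, so
## each R estimate is informed by two consecutive days of (augmented) cases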
## summarise sampled onsets
cases <- bind_rows(sampled_onsets) %>%
  group_by(date) %>%
  summarise(min.90 = quantile(cases, 0.05),
            max.90 = quantile(cases, 0.95),
            min.iqr = quantile(cases, 0.25),
            max.iqr = quantile(cases, 0.75),
            median = median(cases),
            mean = mean(cases))
## summarise sampled reproduction numbers
R <- lapply(names(rep_R), function(x) {
  bind_rows(rep_R[[x]]) %>%
    mutate(`Serial interval distribution` = x)
}) %>%
  bind_rows() %>%
  group_by(date, `Serial interval distribution`) %>%
  summarise(min.90 = quantile(R, 0.05),
            max.90 = quantile(R, 0.95),
            min.iqr = quantile(R, 0.25),
            max.iqr = quantile(R, 0.75),
            median = median(R),
            mean = mean(R))
## plot time-varying R estimates
p <- ggplot(R, aes(x = date, y = mean, color = `Serial interval distribution`,
                   fill = `Serial interval distribution`)) +
  geom_line() +
  geom_ribbon(aes(ymin = min.iqr, ymax = max.iqr), alpha = 0.4) +
  geom_ribbon(aes(ymin = min.90, ymax = max.90), alpha = 0.2) +
  scale_color_brewer(palette = "Dark2") +
  scale_fill_brewer(palette = "Dark2") +
  xlab("") + ylab("R") +
  cowplot::theme_cowplot() +
  expand_limits(y = 0)
cowplot::save_plot("time_R.pdf", p)
## plot augmented case counts against reported counts (red line)
p <- ggplot(cases, aes(x = date, y = mean)) +
  geom_line() +
  geom_line(data = china_onsets, aes(x = t, y = Confirmed), color = "red") +
  geom_ribbon(aes(ymin = min.iqr, ymax = max.iqr), alpha = 0.4) +
  geom_ribbon(aes(ymin = min.90, ymax = max.90), alpha = 0.2) +
  xlab("") + ylab("Cases") +
  cowplot::theme_cowplot() +
  expand_limits(y = 0)
cowplot::save_plot("time_cases.pdf", p)