@sbfnk
Last active January 29, 2020 09:40
code to estimate time-varying R0 for nCoV
library('tidyverse') ## attaches ggplot2, dplyr and readr among others
##' Samples the size (number of trials) of a binomial distribution, given
##' x observed successes and success probability prob
##' copied from https://github.com/sbfnk/bpmodels/blob/master/R/utils.r
rbinom_size <- function(n, x, prob) {
  x + stats::rnbinom(n, x + 1, prob)
}
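## quick sanity check of the sampler (illustrative sketch, not part of the
## analysis): with x successes observed at probability prob, the sampled
## size has mean x + (x + 1) * (1 - prob) / prob, e.g. about 21 for
## x = 10, prob = 0.5
if (FALSE) {
  set.seed(1)
  mean(rbinom_size(1e4, 10, 0.5)) ## should be close to 21
}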
##
## get data
##
data_dir <-
  path.expand(file.path("~", "Dropbox", "nCoV-2019", "data_sources",
                        "case_data"))
china_onsets <-
  readr::read_csv(file.path(data_dir, "time_series_China_CDC.csv"))
wuhan_confirmed <-
  readr::read_csv(file.path(data_dir, "time_series_HKU_Wuhan.csv"))
linelist <- WuhanSeedingVsTransmission::get_linelist()
## distribution of confirmation delays over time
time_delays <- linelist %>%
  filter(!is.na(delay_confirmation)) %>%
  mutate(days_confirmation = as.integer(delay_confirmation)) %>%
  group_by(date_confirmation) %>%
  summarise(mean = mean(days_confirmation),
            min.90 = quantile(days_confirmation, 0.05),
            max.90 = quantile(days_confirmation, 0.95),
            min.iqr = quantile(days_confirmation, 0.25),
            max.iqr = quantile(days_confirmation, 0.75),
            median = median(days_confirmation))
## TODO: improve on arbitrary cutoff date
cutoff <- as.Date("2020-01-21")
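## only confirmations after this date are used to estimate the reporting
## delay distribution below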
## extract confirmation delays from linelist
delays <- linelist %>%
  dplyr::filter(date_confirmation > cutoff, !is.na(delay_confirmation)) %>%
  .$delay_confirmation %>%
  as.integer()
p <- ggplot(time_delays, aes(x = date_confirmation, y = median)) +
  geom_line() +
  geom_ribbon(aes(ymin = min.iqr, ymax = max.iqr), alpha = 0.4) +
  geom_ribbon(aes(ymin = min.90, ymax = max.90), alpha = 0.2) +
  geom_vline(xintercept = cutoff + 1, linetype = "dashed") +
  xlab("Date of confirmation") + ylab("Delay (in days)") +
  cowplot::theme_cowplot() +
  expand_limits(y = 0)
cowplot::save_plot("time_delays.pdf", p)
##
## get reporting delay distribution
##
## Report was on the 26th, data up to the 25th.
## We combine 0- and 1-day delays into one bin.
##
counts <- table(delays)
## merge 0-day delays into the 1-day bin
counts[2] <- counts[2] + counts[1]
## drop the now-redundant 0-day bin
counts <- counts[-1]
## get cumulative sum
cum_counts <- cumsum(counts)
## get cumulative frequency
cum_freq <- cum_counts / max(cum_counts)
## remove last data point (as it's 1 anyway)
cum_freq <- cum_freq[-length(cum_freq)]
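## names(cum_freq) are delays in days; values are the empirical probability
## that a case is confirmed within that many days -- this is the reporting
## probability passed to rbinom_size() below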
##
## augment data with reporting delays
##
report_date <- as.Date("2020-01-26")
## number of "augmentations"
naug <- 100
## get confirmed cases
onsets <- china_onsets$Confirmed
## arbitrary starting point at day 31 -- too many zeroes before
## TODO: apply more general criterion for starting point or use all the data
onsets <- onsets[31:length(onsets)]
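## note: onsets is now a suffix of china_onsets$Confirmed, which the date
## reconstruction below relies on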
## sample onsets naug times
sampled_onsets <- lapply(seq_len(naug), function(x) {
  ## indices to sample, from the last day in the data backwards
  indices <- length(onsets) + 1 - as.integer(names(cum_freq))
  ## combine unsampled onsets (those that are longer ago than the longest
  ## reporting delay) and sampled onsets (using binomial sampler above)
  x_onsets <-
    c(onsets[seq_len(min(indices - 1))],
      rev(rbinom_size(length(cum_freq), onsets[indices], cum_freq)))
  ## get dates
  ## TODO: this dates thing is done too often, should be simplified
  dates <-
    china_onsets$t[seq(nrow(china_onsets) + 1 - length(x_onsets),
                       nrow(china_onsets))]
  return(tibble(date = dates, cases = x_onsets))
})
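## each element of sampled_onsets is one plausible fully-reported epidemic
## curve: counts on recent days are scaled up by sampling the not yet
## reported cases, while earlier days are kept as observed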
##
## get time-varying reproduction number with three serial interval distributions
##
serial_intervals <- list(
  "SARS-like" = c(mean_si = 8.4, std_si = 3.8),
  "early SARS-like" = c(mean_si = 10.0, std_si = 2.8),
  "MERS-like" = c(mean_si = 6.8, std_si = 4.1)
)
rep_R <- lapply(serial_intervals, function(si) {
  lapply(sampled_onsets, function(x) {
    ## estimate R with (relatively short) 2-day smoothing window
    R <-
      EpiEstim::estimate_R(x$cases,
                           method = "parametric_si",
                           config =
                             EpiEstim::make_config(
                               list(mean_si = si[["mean_si"]],
                                    std_si = si[["std_si"]],
                                    t_start = 2:26, t_end = 3:27)))$R
    ## TODO: better way of filtering out NAs (this breaks if there are NAs
    ## in the middle of the dataset for whatever reason)
    R <- R[!is.na(R$`Mean(R)`), ]
    ## get dates
    ## TODO: this dates thing is done too often, should be simplified
    dates <-
      china_onsets$t[seq(nrow(china_onsets) + 1 - nrow(R),
                         nrow(china_onsets))]
    return(tibble(date = dates, R = rnorm(nrow(R), R$`Mean(R)`, R$`Std(R)`)))
  })
})
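## note: t_start = 2:26 and t_end = 3:27 define sliding 2-day windows, so
## each R estimate is informed by two consecutive days of (augmented) cases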
## summarise sampled onsets
cases <- bind_rows(sampled_onsets) %>%
  group_by(date) %>%
  summarise(min.90 = quantile(cases, 0.05),
            max.90 = quantile(cases, 0.95),
            min.iqr = quantile(cases, 0.25),
            max.iqr = quantile(cases, 0.75),
            median = median(cases),
            mean = mean(cases))
## summarise sampled reproduction numbers
R <- lapply(names(rep_R), function(x) {
  bind_rows(rep_R[[x]]) %>%
    mutate(`Serial interval distribution` = x)
}) %>%
  bind_rows() %>%
  group_by(date, `Serial interval distribution`) %>%
  summarise(min.90 = quantile(R, 0.05),
            max.90 = quantile(R, 0.95),
            min.iqr = quantile(R, 0.25),
            max.iqr = quantile(R, 0.75),
            median = median(R),
            mean = mean(R))
## plot time-varying R estimates
p <- ggplot(R, aes(x = date, y = mean, color = `Serial interval distribution`,
                   fill = `Serial interval distribution`)) +
  geom_line() +
  geom_ribbon(aes(ymin = min.iqr, ymax = max.iqr), alpha = 0.4) +
  geom_ribbon(aes(ymin = min.90, ymax = max.90), alpha = 0.2) +
  scale_color_brewer(palette = "Dark2") +
  scale_fill_brewer(palette = "Dark2") +
  xlab("") + ylab("R") +
  cowplot::theme_cowplot() +
  expand_limits(y = 0)
cowplot::save_plot("time_R.pdf", p)
## plot augmented case counts against reported counts (red line)
p <- ggplot(cases, aes(x = date, y = mean)) +
  geom_line() +
  geom_line(data = china_onsets, aes(x = t, y = Confirmed), color = "red") +
  geom_ribbon(aes(ymin = min.iqr, ymax = max.iqr), alpha = 0.4) +
  geom_ribbon(aes(ymin = min.90, ymax = max.90), alpha = 0.2) +
  xlab("") + ylab("Cases") +
  cowplot::theme_cowplot() +
  expand_limits(y = 0)
cowplot::save_plot("time_cases.pdf", p)