Skip to content

Instantly share code, notes, and snippets.

@MatsuuraKentaro
Last active July 11, 2020 18:36
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MatsuuraKentaro/3a2b6ef99777cecb5f5ad92b7ee8951a to your computer and use it in GitHub Desktop.
Save MatsuuraKentaro/3a2b6ef99777cecb5f5ad92b7ee8951a to your computer and use it in GitHub Desktop.
COVID-19: estimate effective reproduction number
functions {
// calculating the convolutions
vector convolution(vector x, vector y_rev) {
int T = num_elements(x);
vector[T-1] res;
for (t in 2:T) {
res[t-1] = dot_product(x[1:(t-1)], y_rev[(T-t+2):T]);
}
return res;
}
// continuous Poisson distribution
real continuous_poisson_lpdf(vector y, vector lam) {
real lp = sum(y .* log(lam) - lam - lgamma(y + 1));
return lp;
}
}
data {
int<lower=1> T; // number of days
vector<lower=0>[T] imported_infect;
vector<lower=0>[T] domestic_infect;
vector[T] p_gt_rev; // generation time distribution (reverse order)
}
transformed data {
vector[T] cases_infect = imported_infect + domestic_infect;
}
parameters {
vector<lower=0>[T-1] Rt;
}
model {
domestic_infect[2:T] ~ continuous_poisson(Rt .* convolution(cases_infect, p_gt_rev));
}
functions {
// calculating the convolutions
vector convolution(vector x, vector y_rev) {
int T = num_elements(x);
vector[T-1] res;
for (t in 2:T) {
res[t-1] = dot_product(x[1:(t-1)], y_rev[(T-t+2):T]);
}
return res;
}
// continuous Poisson distribution
real continuous_poisson_lpdf(vector y, vector lam) {
real lp = sum(y .* log(lam) - lam - lgamma(y + 1));
return lp;
}
}
data {
int<lower=1> T; // number of days
int<lower=0> domestic_onset[T];
int<lower=0> imported_onset[T];
int<lower=0> domestic_report[T];
int<lower=0> imported_report[T];
vector[T] p_gt_rev; // generation time distribution (reverse order)
vector[T] p_inc_rev; // incubation period distribution (reverse order)
vector[T] p_rep_rev; // report time distribution (reverse order)
}
parameters {
vector<lower=0>[T-1] Rt;
real<lower=0> s_smooth; // smoothing factor
simplex[T] domestic_infect_from_onset_raw; // distribution of domestic infected people from onset data
simplex[T] imported_infect_from_onset_raw;
simplex[T] domestic_infect_from_report_raw;
simplex[T] imported_infect_from_report_raw;
}
transformed parameters {
vector[T] domestic_infect_from_onset = domestic_infect_from_onset_raw * sum(domestic_onset); // Number of domestic infected people by using back projection from onset data.
vector[T] imported_infect_from_onset = imported_infect_from_onset_raw * sum(imported_onset);
vector[T] domestic_infect_from_report = domestic_infect_from_report_raw * sum(domestic_report);
vector[T] imported_infect_from_report = imported_infect_from_report_raw * sum(imported_report);
vector[T] domestic_infect = domestic_infect_from_onset + domestic_infect_from_report;
vector[T] imported_infect = imported_infect_from_onset + imported_infect_from_report;
vector[T] cases_infect = domestic_infect + imported_infect;
}
model {
// smoothing on the raw scale
domestic_infect_from_onset_raw[2:T] ~ normal(domestic_infect_from_onset_raw[1:(T-1)], s_smooth);
imported_infect_from_onset_raw[2:T] ~ normal(imported_infect_from_onset_raw[1:(T-1)], s_smooth);
domestic_infect_from_report_raw[2:T] ~ normal(domestic_infect_from_report_raw[1:(T-1)], s_smooth);
imported_infect_from_report_raw[2:T] ~ normal(imported_infect_from_report_raw[1:(T-1)], s_smooth);
// back projection
domestic_onset[2:T] ~ poisson(convolution(domestic_infect_from_onset, p_inc_rev));
imported_onset[2:T] ~ poisson(convolution(imported_infect_from_onset, p_inc_rev));
domestic_report[2:T] ~ poisson(convolution(domestic_infect_from_report, p_rep_rev));
imported_report[2:T] ~ poisson(convolution(imported_infect_from_report, p_rep_rev));
// estimating effective reproduction number
domestic_infect[2:T] ~ continuous_poisson(Rt .* convolution(cases_infect, p_gt_rev));
}
functions {
// calculating the convolutions
vector convolution(vector x, vector y_rev) {
int T = num_elements(x);
vector[T-1] res;
for (t in 2:T) {
res[t-1] = dot_product(x[1:(t-1)], y_rev[(T-t+2):T]);
}
return res;
}
// continuous Poisson distribution
real continuous_poisson_lpdf(vector y, vector lam) {
real lp = sum(y .* log(lam) - lam - lgamma(y + 1));
return lp;
}
}
data {
int<lower=1> T; // number of days
int<lower=0> domestic_onset[T];
int<lower=0> imported_onset[T];
int<lower=0> domestic_report[T];
int<lower=0> imported_report[T];
vector[T] p_gt_rev; // generation time distribution (reverse order)
vector[T] p_inc_rev; // incubation period distribution (reverse order)
vector[T] p_rep_rev; // report time distribution (reverse order)
vector[T] p_rep_cum; // report time cumulative distribution
}
parameters {
vector<lower=0>[T-1] Rt;
real<lower=0> s_smooth; // smoothing factor
simplex[T] domestic_infect_from_onset_raw; // distribution of domestic infected people from onset data
simplex[T] imported_infect_from_onset_raw;
simplex[T] domestic_infect_from_report_raw;
simplex[T] imported_infect_from_report_raw;
simplex[T] domestic_infect_latent_raw;
simplex[T] imported_infect_latent_raw;
real<lower=sum(domestic_onset)+sum(domestic_report)> domestic_latent_scale;
real<lower=sum(imported_onset)+sum(imported_report)> imported_latent_scale;
}
transformed parameters {
vector[T] domestic_infect_from_onset = domestic_infect_from_onset_raw * sum(domestic_onset); // Number of domestic infected people by using back projection from onset data.
vector[T] imported_infect_from_onset = imported_infect_from_onset_raw * sum(imported_onset);
vector[T] domestic_infect_from_report = domestic_infect_from_report_raw * sum(domestic_report);
vector[T] imported_infect_from_report = imported_infect_from_report_raw * sum(imported_report);
vector[T] domestic_infect = domestic_infect_from_onset + domestic_infect_from_report;
vector[T] imported_infect = imported_infect_from_onset + imported_infect_from_report;
vector[T] domestic_infect_latent = domestic_infect_latent_raw * domestic_latent_scale;
vector[T] imported_infect_latent = imported_infect_latent_raw * imported_latent_scale;
vector[T] cases_infect_latent = domestic_infect_latent + imported_infect_latent;
}
model {
// smoothing on the raw scale
domestic_infect_from_onset_raw[2:T] ~ normal(domestic_infect_from_onset_raw[1:(T-1)], s_smooth);
imported_infect_from_onset_raw[2:T] ~ normal(imported_infect_from_onset_raw[1:(T-1)], s_smooth);
domestic_infect_from_report_raw[2:T] ~ normal(domestic_infect_from_report_raw[1:(T-1)], s_smooth);
imported_infect_from_report_raw[2:T] ~ normal(imported_infect_from_report_raw[1:(T-1)], s_smooth);
// back projection
domestic_onset[2:T] ~ poisson(convolution(domestic_infect_from_onset, p_inc_rev));
imported_onset[2:T] ~ poisson(convolution(imported_infect_from_onset, p_inc_rev));
domestic_report[2:T] ~ poisson(convolution(domestic_infect_from_report, p_rep_rev));
imported_report[2:T] ~ poisson(convolution(imported_infect_from_report, p_rep_rev));
// reporting delay
domestic_infect[1:T] ~ continuous_poisson(domestic_infect_latent .* p_rep_cum);
imported_infect[1:T] ~ continuous_poisson(imported_infect_latent .* p_rep_cum);
// estimating effective reproduction number
domestic_infect_latent[2:T] ~ continuous_poisson(Rt .* convolution(cases_infect_latent, p_gt_rev));
}
lapply(c('data.table', 'dplyr', 'tidyr', 'rstan', 'ggplot2'), library, character.only=TRUE)
rstan_options(auto_write=TRUE)
options(mc.cores=4)
datemin <- as.Date('2019-12-25') # particular choice
datemax <- as.Date('2020-05-10') # particular choice
##------ loading the dataset ------
fread('data/JapaneseDataCOVID19 (200510).csv') %>%
mutate(report = if_else(is.na(confirmed), reported, confirmed),
report = as.Date(report),
onset = as.Date(onset)) %>%
select(-confirmed, -reported) -> df
df_onset <- df %>%
filter(!is.na(onset)) %>%
group_by(onset, exp_type) %>%
summarise(count = n()) %>% ungroup() %>%
spread(key = exp_type, value = count, fill = 0) %>%
rename(domestic_onset = domestic, imported_onset = imported)
df_report <- df %>%
filter(is.na(onset)) %>%
group_by(report, exp_type) %>%
summarise(count = n()) %>% ungroup() %>%
spread(key = exp_type, value = count, fill = 0) %>%
rename(domestic_report = domestic, imported_report = imported)
df_cases <- data.frame(date = seq(datemin, datemax, by = 1)) %>%
left_join(df_onset, by = c('date'='onset'), fill = 0) %>%
left_join(df_report, by = c('date'='report'), fill = 0) %>%
replace_na(list(domestic_onset = 0, imported_onset = 0, domestic_report = 0, imported_report = 0))
##------ predetermined distributions ------
## discretesized distribution of generation time (Nishiura, et al, 2020)
T <- nrow(df_cases)
c_gt <- pweibull(0:T, shape = 2.305, scale = 5.452)
p_gt_rev <- c_gt %>% diff() %>% rev()
## discretesized distribution of incubation period (from infection to onset) (Linton et al 2020 - with right truncation excl. Wuhan residents)
c_inc <- plnorm(0:T, meanlog = 1.519, sdlog = 0.615)
p_inc_rev <- c_inc %>% diff() %>% rev()
## discretesized distribution of the delay from infection to report
# right-truncated reporting delay from onset to repot (estimated values using the MHLW data): dweibull(t, shape = 1.741, scale = 8.573)
cdf_rep <- Vectorize(function(t) {
integrand <- function(x) {
pweibull(t - x, shape = 1.741, scale = 8.573) * dlnorm(x, meanlog = 1.519, sdlog = 0.615)
}
val <- integrate(integrand, lower=0, upper=t)$value
return(val)
})
c_rep <- cdf_rep(0:T)
p_rep_rev <- c_rep %>% diff() %>% rev()
##------ estimation ------
stan_data = list(
T = T,
domestic_onset = df_cases$domestic_onset,
imported_onset = df_cases$imported_onset,
domestic_report = df_cases$domestic_report,
imported_report = df_cases$imported_report,
p_gt_rev = p_gt_rev,
p_inc_rev = p_inc_rev,
p_rep_rev = p_rep_rev
)
stanmodel <- stan_model(file='model/model2.stan')
fit <- sampling(stanmodel,
data = stan_data,
iter = 6000,
warmup = 1000,
thin = 5,
pars = c('Rt', 's_smooth', 'domestic_infect', 'imported_infect', 'cases_infect'),
seed = 1234)
write.table(data.frame(summary(fit)$summary), file='output/fit-summary-model2.csv', sep=',', quote=TRUE, col.names=NA)
save.image(file = 'output/result-model2.RData')
##------ plot ------
ms <- rstan::extract(fit)
qua <- apply(ms$Rt, 2, quantile, probs=c(2.5, 25, 50, 75, 97.5)/100)
d_plot <- data.frame(day = df_cases$date[-T], t(qua), check.names = FALSE) %>%
filter(between(day, as.Date('2020-02-01'), as.Date('2020-05-03')))
qua <- apply(ms$cases_infect, 2, quantile, probs=c(2.5, 25, 50, 75, 97.5)/100)
d_plot2 <- data.frame(day = df_cases$date, t(qua), check.names = FALSE) %>%
filter(between(day, as.Date('2020-02-01'), as.Date('2020-05-03')))
scaling <- 130
p <- ggplot() +
geom_bar(data=d_plot2, aes(x=day, y=`50%`/scaling), fill='#FAAB18', stat='identity', width=0.7) +
geom_errorbar(data=d_plot2, aes(x=day, ymin=`2.5%`/scaling, ymax=`97.5%`/scaling), width=0.5, alpha = 0.4) +
geom_ribbon(data=d_plot, aes(x=day, ymin=`2.5%`, ymax=`97.5%`), fill='#1380A1', alpha=0.4) +
geom_line(data=d_plot, aes(x=day, y=`50%`), color='#1380A1') +
labs(x='\nDate', y='Effective reproduction number\n') +
theme(text=element_text(size=12), axis.text=element_text(size=10), legend.position='none') +
scale_x_date(date_labels='%m/%d',date_breaks='10 day', expand=c(0, 0)) +
scale_y_continuous(limits=c(0, 5.6), expand=c(0, 0),
sec.axis=sec_axis(~. * scaling, breaks=c(200, 400, 600), name='Cases\n')) +
geom_hline(yintercept=1, linetype='dashed', color='#1380A1', size=0.7)
ggsave(p, file = 'output/fig-model2.png', dpi=300, w=5.6, h=4)
lapply(c('data.table', 'dplyr', 'tidyr', 'rstan', 'ggplot2'), library, character.only=TRUE)
rstan_options(auto_write=TRUE)
options(mc.cores=4)
datemin <- as.Date('2019-12-25') # particular choice
datemax <- as.Date('2020-05-10') # particular choice
##------ loading the dataset ------
fread('data/JapaneseDataCOVID19 (200510).csv') %>%
mutate(report = if_else(is.na(confirmed), reported, confirmed),
report = as.Date(report),
onset = as.Date(onset)) %>%
select(-confirmed, -reported) -> df
df_onset <- df %>%
filter(!is.na(onset)) %>%
group_by(onset, exp_type) %>%
summarise(count = n()) %>% ungroup() %>%
spread(key = exp_type, value = count, fill = 0) %>%
rename(domestic_onset = domestic, imported_onset = imported)
df_report <- df %>%
filter(is.na(onset)) %>%
group_by(report, exp_type) %>%
summarise(count = n()) %>% ungroup() %>%
spread(key = exp_type, value = count, fill = 0) %>%
rename(domestic_report = domestic, imported_report = imported)
df_cases <- data.frame(date = seq(datemin, datemax, by = 1)) %>%
left_join(df_onset, by = c('date'='onset'), fill = 0) %>%
left_join(df_report, by = c('date'='report'), fill = 0) %>%
replace_na(list(domestic_onset = 0, imported_onset = 0, domestic_report = 0, imported_report = 0))
##------ predetermined distributions ------
## discretesized distribution of generation time (Nishiura, et al, 2020)
T <- nrow(df_cases)
c_gt <- pweibull(0:T, shape = 2.305, scale = 5.452)
p_gt_rev <- c_gt %>% diff() %>% rev()
## discretesized distribution of incubation period (from infection to onset) (Linton et al 2020 - with right truncation excl. Wuhan residents)
c_inc <- plnorm(0:T, meanlog = 1.519, sdlog = 0.615)
p_inc_rev <- c_inc %>% diff() %>% rev()
## discretesized distribution of the delay from infection to report
# right-truncated reporting delay from onset to repot (estimated values using the MHLW data): dweibull(t, shape = 1.741, scale = 8.573)
cdf_rep <- Vectorize(function(t) {
integrand <- function(x) {
pweibull(t - x, shape = 1.741, scale = 8.573) * dlnorm(x, meanlog = 1.519, sdlog = 0.615)
}
val <- integrate(integrand, lower=0, upper=t)$value
return(val)
})
c_rep <- cdf_rep(0:T)
p_rep_rev <- c_rep %>% diff() %>% rev()
p_rep_cum <- c_rep %>% rev() %>% .[1:T]
##------ estimation ------
stan_data = list(
T = T,
domestic_onset = df_cases$domestic_onset,
imported_onset = df_cases$imported_onset,
domestic_report = df_cases$domestic_report,
imported_report = df_cases$imported_report,
p_gt_rev = p_gt_rev,
p_inc_rev = p_inc_rev,
p_rep_rev = p_rep_rev,
p_rep_cum = p_rep_cum
)
stanmodel <- stan_model(file='model/model3.stan')
fit <- sampling(stanmodel,
data = stan_data,
iter = 6000,
warmup = 1000,
thin = 5,
pars = c('Rt', 's_smooth', 'domestic_infect_latent', 'imported_infect_latent', 'cases_infect_latent', 'domestic_infect', 'imported_infect'),
seed = 1234)
write.table(data.frame(summary(fit)$summary), file='output/fit-summary-model3.csv', sep=',', quote=TRUE, col.names=NA)
save.image(file = 'output/result-model3.RData')
##------ plot ------
ms <- rstan::extract(fit)
qua <- apply(ms$Rt, 2, quantile, probs=c(2.5, 25, 50, 75, 97.5)/100)
d_plot <- data.frame(day = df_cases$date[-T], t(qua), check.names = FALSE) %>%
filter(between(day, as.Date('2020-02-01'), as.Date('2020-05-03')))
qua <- apply(ms$cases_infect_latent, 2, quantile, probs=c(2.5, 25, 50, 75, 97.5)/100)
d_plot2 <- data.frame(day = df_cases$date, t(qua), check.names = FALSE) %>%
filter(between(day, as.Date('2020-02-01'), as.Date('2020-05-03')))
scaling <- 130
p <- ggplot() +
geom_bar(data=d_plot2, aes(x=day, y=`50%`/scaling), fill='#FAAB18', stat='identity', width=0.7) +
geom_errorbar(data=d_plot2, aes(x=day, ymin=`2.5%`/scaling, ymax=`97.5%`/scaling), width=0.5, alpha = 0.4) +
geom_ribbon(data=d_plot, aes(x=day, ymin=`2.5%`, ymax=`97.5%`), fill='#1380A1', alpha=0.4) +
geom_line(data=d_plot, aes(x=day, y=`50%`), color='#1380A1') +
labs(x='\nDate', y='Effective reproduction number\n') +
theme(text=element_text(size=12), axis.text=element_text(size=10), legend.position='none') +
scale_x_date(date_labels='%m/%d',date_breaks='10 day', expand=c(0, 0)) +
scale_y_continuous(limits=c(0, 5.6), expand=c(0, 0),
sec.axis=sec_axis(~. * scaling, breaks=c(200, 400, 600), name='Cases\n')) +
geom_hline(yintercept=1, linetype='dashed', color='#1380A1', size=0.7)
ggsave(p, file = 'output/fig-model3.png', dpi=300, w=5.6, h=4)
@aakhmetz
Copy link

Dear Matsuuraさん,

Thank you for your effort – It was great to read about implementation of stan code for calculating the Rt! In meanwhile, I would like to point out few things regarding your stan implementation, because I have been thinking on something similar.

  1. About backprojection vs smoothing: the back-projection algorithm implemented in the surveillance package by Michael Höhle does account not only for shifting the dates of illness onset by the incubation period, but also ensures that the backprojected values are grouped closer to each other (which is reasoned by the fact that there should be a common source of infection when possible). Your algorithm of smoothing leads to the uniform error s_smooth over the whole period of time and does not group the cases of infections. This leads to the rather large uncertainty of Rt at the beginning of the outbreak. See https://staff.math.su.se/hoehle/teaching/moid2011/moid.pdf for some details about backprojection.

  2. I am not certain about using your own continuous_poisson distribution – I don't know how it relates to the ordinary poisson, etc. At least, I would go with conventional gamma, negative binomial, or poisson when possible. For example, here I would try to use of negative binomial to address the unreporting problem. Regarding the nowcasting (~prediction of not yet reported counts), there are again some research works either ours, or Hohle (doi:10.1111/biom.12194), which showed that nowcasting in strict Bayesian sense is not easy.

  3. I know that there should be variability and I agree that the CIs are likely to be wider. However, it is a bit tricky on how to do the nowcasting properly. In this sense, I would rather truncate the last XX days, and say that the situation on how Rt evolves is rather uncertain at the very end (it is similar to the conclusion of London school, when the say that small counts at the end do not allow to estimate the Rt with great precision). It is probably the same as to say that Rt can be anywhere between zero and two.

I will think more and thank you again for your input.

Best regards,
Andrei

@aakhmetz
Copy link

aakhmetz commented May 18, 2020

ps: for backprojection, see slide #96 in moid.pdf – the error bars for backprojected values are not uniform as in your result (they are wider at the peak, but become narrow at the beginning and at the end of the outbreak when the counts are small)

@MatsuuraKentaro
Copy link
Author

MatsuuraKentaro commented May 19, 2020

Hi Andrei,

Thank you very much for your comments.

Your algorithm of smoothing leads to the uniform error s_smooth over the whole period of time and does not group the cases of infections. This leads to the rather large uncertainty of Rt at the beginning of the outbreak.

What does "group" mean? The error bars at the beginning of the outbreak for slide #96 in moid.pdf seem to be proportional to the variance of the Poisson distribution. I feel that the error bars at the beginning are too narrow. On the other hand, I think my smoothing method includes the uncertainty of the incubation period (i.e. uncertainty of the time axis), so the error bars are wider.

Also the shape of my smoothing method is the same as the first-order trend model of the state-space model and there is room for change. For example, a stronger second-order trend model could be considered:

domestic_infect_from_onset_raw[3:T] ~ normal(2*domestic_infect_from_onset_raw[2:(T-1)] - domestic_infect_from_onset_raw[1:(T-2)], s_smooth);

That means "domestic_infect_from_onset_raw[t] - domestic_infect_from_onset_raw[t-1] ~ normal(domestic_infect_from_onset_raw[t-1] - domestic_infect_from_onset_raw[t-2], s_smooth) for t = 3,4,...,T"
However, a strong smoothing generates optimistic credible intervals. In the case of decision-making issues, we adopted a weaker smoothing method because we should be pessimistic and avoid underestimating the credible intervals.

The Poisson distribution in Stan accepts only integer response variables. It can be extended using the gamma function. Certainly, the normalization constant depends on the parameter, but since it is close to 1, we can use the continuous Poisson distribution in the above Stan code as an approximation. See:
https://stats.stackexchange.com/questions/310676/continuous-generalization-of-the-negative-binomial-distribution/311927
See below for examples of use:
http://ac.inf.elte.hu/Vol_039_2013/137_39.pdf.
In addition, Sungmok Jung also uses the continuous Poisson distribution in In [49] in the MLE version:

return(-(-Cs+dt.backproj.T$domestic_delay[t+1]*log(Cs)-lgamma(dt.backproj.T$domestic_delay[t+1]+1)))}

The Poisson distribution in OpenBUGS also adopted the continuous Poisson distribution by using the gamma function. But I can't remember the URL of implementation part (Pascal code).

I agree about the difficulty of the nowcasting. I also had to truncate the estimated results of the last 1 week. However, looking at the distribution of reporting delay, I think that truncating two weeks is overkill; data from one to two weeks ago can be used effectively, so it wouldn't be a bad idea to show it as a way of informing uncertainty.
Also, if the smoothing method is a second-order trend model, the Rt will not necessarily be a broad confidence interval centered on 1, because the Rt will be calculated to take into account recent trends in the number of cases. But, I think we should avoid situations where the results depend on the trend model largely. After all, I feel that the nowcasting is a difficult problem.

Best regards,
Kentaro

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment