COVID-19: estimate effective reproduction number
// model 1: estimate Rt when case counts by infection date are given directly as data
functions {
  // convolution of an epidemic curve x with a delay pmf given in reverse order:
  // result[t-1] = sum_{s=1}^{t-1} x[s] * y[t-s] for t = 2, ..., T
  vector convolution(vector x, vector y_rev) {
    int T = num_elements(x);
    vector[T-1] res;
    for (t in 2:T) {
      res[t-1] = dot_product(x[1:(t-1)], y_rev[(T-t+2):T]);
    }
    return res;
  }
  // continuous extension of the Poisson log density via the gamma function
  // (the normalizing constant is close to 1; see the author's comment below)
  real continuous_poisson_lpdf(vector y, vector lam) {
    real lp = sum(y .* log(lam) - lam - lgamma(y + 1));
    return lp;
  }
}
data {
  int<lower=1> T;                     // number of days
  vector<lower=0>[T] imported_infect; // imported case counts by infection date
  vector<lower=0>[T] domestic_infect; // domestic case counts by infection date
  vector[T] p_gt_rev;                 // generation time distribution (reverse order)
}
transformed data {
  vector[T] cases_infect = imported_infect + domestic_infect;
}
parameters {
  vector<lower=0>[T-1] Rt; // effective reproduction number for days 2, ..., T
}
model {
  // renewal equation: E[domestic infections on day t] = Rt[t-1] * sum_{s<t} cases_infect[s] * g(t-s)
  domestic_infect[2:T] ~ continuous_poisson(Rt .* convolution(cases_infect, p_gt_rev));
}
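The indexing in convolution is compact, so here is a minimal R sketch (an illustration with made-up inputs, not part of the gist) of what it computes: res[t-1] = sum over s < t of x[s] * y[t-s], the expected infection pressure on day t from all earlier days. Model 1 then sets the mean of domestic infections on day t to Rt[t-1] times this sum (the renewal equation).

## R sketch of the Stan convolution above (illustrative, made-up inputs)
convolution_r <- function(x, y_rev) {
  T <- length(x)
  res <- numeric(T - 1)
  for (t in 2:T) {
    # y_rev[(T-t+2):T] is y[t-1], ..., y[1], so this is sum_{s<t} x[s] * y[t-s]
    res[t - 1] <- sum(x[1:(t - 1)] * y_rev[(T - t + 2):T])
  }
  res
}
set.seed(1)
T <- 10
x <- rpois(T, 5)                                       # made-up daily case counts
y <- diff(pweibull(0:T, shape = 2.305, scale = 5.452)) # generation-time pmf
direct <- sapply(2:T, function(t) sum(sapply(1:(t - 1), function(s) x[s] * y[t - s])))
stopifnot(all.equal(convolution_r(x, rev(y)), direct))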
// model2.stan: back-project infection dates from onset/report data, then estimate Rt
functions {
  // convolution of an epidemic curve x with a delay pmf given in reverse order:
  // result[t-1] = sum_{s=1}^{t-1} x[s] * y[t-s] for t = 2, ..., T
  vector convolution(vector x, vector y_rev) {
    int T = num_elements(x);
    vector[T-1] res;
    for (t in 2:T) {
      res[t-1] = dot_product(x[1:(t-1)], y_rev[(T-t+2):T]);
    }
    return res;
  }
  // continuous extension of the Poisson log density via the gamma function
  // (the normalizing constant is close to 1; see the author's comment below)
  real continuous_poisson_lpdf(vector y, vector lam) {
    real lp = sum(y .* log(lam) - lam - lgamma(y + 1));
    return lp;
  }
}
data {
  int<lower=1> T;      // number of days
  int<lower=0> domestic_onset[T];
  int<lower=0> imported_onset[T];
  int<lower=0> domestic_report[T];
  int<lower=0> imported_report[T];
  vector[T] p_gt_rev;  // generation time distribution (reverse order)
  vector[T] p_inc_rev; // incubation period distribution (reverse order)
  vector[T] p_rep_rev; // report time distribution (reverse order)
}
parameters {
  vector<lower=0>[T-1] Rt;
  real<lower=0> s_smooth; // smoothing factor
  simplex[T] domestic_infect_from_onset_raw; // distribution of domestic infections, from onset data
  simplex[T] imported_infect_from_onset_raw;
  simplex[T] domestic_infect_from_report_raw;
  simplex[T] imported_infect_from_report_raw;
}
transformed parameters {
  // number of domestic infections obtained by back projection from onset data
  vector[T] domestic_infect_from_onset = domestic_infect_from_onset_raw * sum(domestic_onset);
  vector[T] imported_infect_from_onset = imported_infect_from_onset_raw * sum(imported_onset);
  vector[T] domestic_infect_from_report = domestic_infect_from_report_raw * sum(domestic_report);
  vector[T] imported_infect_from_report = imported_infect_from_report_raw * sum(imported_report);
  vector[T] domestic_infect = domestic_infect_from_onset + domestic_infect_from_report;
  vector[T] imported_infect = imported_infect_from_onset + imported_infect_from_report;
  vector[T] cases_infect = domestic_infect + imported_infect;
}
model {
  // smoothing on the raw scale (first-order trend prior)
  domestic_infect_from_onset_raw[2:T] ~ normal(domestic_infect_from_onset_raw[1:(T-1)], s_smooth);
  imported_infect_from_onset_raw[2:T] ~ normal(imported_infect_from_onset_raw[1:(T-1)], s_smooth);
  domestic_infect_from_report_raw[2:T] ~ normal(domestic_infect_from_report_raw[1:(T-1)], s_smooth);
  imported_infect_from_report_raw[2:T] ~ normal(imported_infect_from_report_raw[1:(T-1)], s_smooth);
  // back projection
  domestic_onset[2:T] ~ poisson(convolution(domestic_infect_from_onset, p_inc_rev));
  imported_onset[2:T] ~ poisson(convolution(imported_infect_from_onset, p_inc_rev));
  domestic_report[2:T] ~ poisson(convolution(domestic_infect_from_report, p_rep_rev));
  imported_report[2:T] ~ poisson(convolution(imported_infect_from_report, p_rep_rev));
  // estimating the effective reproduction number
  domestic_infect[2:T] ~ continuous_poisson(Rt .* convolution(cases_infect, p_gt_rev));
}
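Model 2 never observes infection dates: the simplex parameters are latent infection curves, scaled so that they total the observed counts, and their convolutions with the incubation and reporting pmfs must reproduce the daily onset and report counts. A small forward simulation in R (hypothetical numbers, only to show the direction the likelihood inverts) makes the back-projection step concrete:

## forward direction of the back-projection likelihood (hypothetical inputs)
set.seed(2)
T <- 60
infections <- 20 * exp(-((1:T) - 30)^2 / 100)              # assumed infection curve
p_inc <- diff(plnorm(0:T, meanlog = 1.519, sdlog = 0.615)) # incubation pmf, as above
lambda_onset <- sapply(2:T, function(t)
  sum(infections[1:(t - 1)] * p_inc[(t - 1):1]))           # E[onset count on day t]
onsets <- rpois(T - 1, lambda_onset) # model 2 inverts this: given onsets, infer infections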
// model3.stan: model 2 plus a right-truncation (reporting delay) correction
functions {
  // convolution of an epidemic curve x with a delay pmf given in reverse order:
  // result[t-1] = sum_{s=1}^{t-1} x[s] * y[t-s] for t = 2, ..., T
  vector convolution(vector x, vector y_rev) {
    int T = num_elements(x);
    vector[T-1] res;
    for (t in 2:T) {
      res[t-1] = dot_product(x[1:(t-1)], y_rev[(T-t+2):T]);
    }
    return res;
  }
  // continuous extension of the Poisson log density via the gamma function
  // (the normalizing constant is close to 1; see the author's comment below)
  real continuous_poisson_lpdf(vector y, vector lam) {
    real lp = sum(y .* log(lam) - lam - lgamma(y + 1));
    return lp;
  }
}
data {
  int<lower=1> T;      // number of days
  int<lower=0> domestic_onset[T];
  int<lower=0> imported_onset[T];
  int<lower=0> domestic_report[T];
  int<lower=0> imported_report[T];
  vector[T] p_gt_rev;  // generation time distribution (reverse order)
  vector[T] p_inc_rev; // incubation period distribution (reverse order)
  vector[T] p_rep_rev; // report time distribution (reverse order)
  vector[T] p_rep_cum; // probability that an infection on day t is reported by day T
}
parameters {
  vector<lower=0>[T-1] Rt;
  real<lower=0> s_smooth; // smoothing factor
  simplex[T] domestic_infect_from_onset_raw; // distribution of domestic infections, from onset data
  simplex[T] imported_infect_from_onset_raw;
  simplex[T] domestic_infect_from_report_raw;
  simplex[T] imported_infect_from_report_raw;
  simplex[T] domestic_infect_latent_raw;     // distribution of latent (not yet fully reported) infections
  simplex[T] imported_infect_latent_raw;
  real<lower=sum(domestic_onset)+sum(domestic_report)> domestic_latent_scale;
  real<lower=sum(imported_onset)+sum(imported_report)> imported_latent_scale;
}
transformed parameters {
  // number of domestic infections obtained by back projection from onset data
  vector[T] domestic_infect_from_onset = domestic_infect_from_onset_raw * sum(domestic_onset);
  vector[T] imported_infect_from_onset = imported_infect_from_onset_raw * sum(imported_onset);
  vector[T] domestic_infect_from_report = domestic_infect_from_report_raw * sum(domestic_report);
  vector[T] imported_infect_from_report = imported_infect_from_report_raw * sum(imported_report);
  vector[T] domestic_infect = domestic_infect_from_onset + domestic_infect_from_report;
  vector[T] imported_infect = imported_infect_from_onset + imported_infect_from_report;
  vector[T] domestic_infect_latent = domestic_infect_latent_raw * domestic_latent_scale;
  vector[T] imported_infect_latent = imported_infect_latent_raw * imported_latent_scale;
  vector[T] cases_infect_latent = domestic_infect_latent + imported_infect_latent;
}
model {
  // smoothing on the raw scale (first-order trend prior)
  domestic_infect_from_onset_raw[2:T] ~ normal(domestic_infect_from_onset_raw[1:(T-1)], s_smooth);
  imported_infect_from_onset_raw[2:T] ~ normal(imported_infect_from_onset_raw[1:(T-1)], s_smooth);
  domestic_infect_from_report_raw[2:T] ~ normal(domestic_infect_from_report_raw[1:(T-1)], s_smooth);
  imported_infect_from_report_raw[2:T] ~ normal(imported_infect_from_report_raw[1:(T-1)], s_smooth);
  // back projection
  domestic_onset[2:T] ~ poisson(convolution(domestic_infect_from_onset, p_inc_rev));
  imported_onset[2:T] ~ poisson(convolution(imported_infect_from_onset, p_inc_rev));
  domestic_report[2:T] ~ poisson(convolution(domestic_infect_from_report, p_rep_rev));
  imported_report[2:T] ~ poisson(convolution(imported_infect_from_report, p_rep_rev));
  // reporting delay: back-projected infections are latent infections thinned by p_rep_cum
  domestic_infect[1:T] ~ continuous_poisson(domestic_infect_latent .* p_rep_cum);
  imported_infect[1:T] ~ continuous_poisson(imported_infect_latent .* p_rep_cum);
  // estimating the effective reproduction number on the latent infection curves
  domestic_infect_latent[2:T] ~ continuous_poisson(Rt .* convolution(cases_infect_latent, p_gt_rev));
}
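Model 3 adds the right-truncation correction: near the end of the observation period many infections have not been reported yet, so the back-projected curves systematically undercount recent infections. The model therefore treats them as a thinned version of latent infection curves, with thinning probability p_rep_cum, and estimates Rt on the latent curves. A toy R illustration (using a stand-in gamma CDF for the infection-to-report delay; the scripts below compute the real one from the lognormal and Weibull delays):

## right-truncation thinning with a stand-in delay distribution (illustrative)
T <- 60
latent <- rep(100, T)                        # hypothetical latent infections per day
c_rep <- pgamma(0:T, shape = 4, rate = 0.3)  # stand-in infection-to-report CDF
p_rep_cum <- rev(c_rep)[1:T]                 # P(reported by day T | infected on day t)
observed_mean <- latent * p_rep_cum          # falls toward 0 near the cutoff
round(tail(observed_mean), 1)                # most recent infections are still unseen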
##------ R script fitting model 2 (model/model2.stan) ------
lapply(c('data.table', 'dplyr', 'tidyr', 'rstan', 'ggplot2'), library, character.only=TRUE)
rstan_options(auto_write=TRUE)
options(mc.cores=4)
datemin <- as.Date('2019-12-25') # particular choice
datemax <- as.Date('2020-05-10') # particular choice
##------ loading the dataset ------
fread('data/JapaneseDataCOVID19 (200510).csv') %>%
  mutate(report = if_else(is.na(confirmed), reported, confirmed),
         report = as.Date(report),
         onset = as.Date(onset)) %>%
  select(-confirmed, -reported) -> df
df_onset <- df %>%
  filter(!is.na(onset)) %>%
  group_by(onset, exp_type) %>%
  summarise(count = n()) %>% ungroup() %>%
  spread(key = exp_type, value = count, fill = 0) %>%
  rename(domestic_onset = domestic, imported_onset = imported)
df_report <- df %>%
  filter(is.na(onset)) %>%
  group_by(report, exp_type) %>%
  summarise(count = n()) %>% ungroup() %>%
  spread(key = exp_type, value = count, fill = 0) %>%
  rename(domestic_report = domestic, imported_report = imported)
## left_join has no fill argument; days absent from df_onset/df_report become NA
## and are filled with replace_na() below
df_cases <- data.frame(date = seq(datemin, datemax, by = 1)) %>%
  left_join(df_onset, by = c('date' = 'onset')) %>%
  left_join(df_report, by = c('date' = 'report')) %>%
  replace_na(list(domestic_onset = 0, imported_onset = 0, domestic_report = 0, imported_report = 0))
##------ predetermined distributions ------
## discretized generation-time distribution (Nishiura et al., 2020)
T <- nrow(df_cases)
c_gt <- pweibull(0:T, shape = 2.305, scale = 5.452)
p_gt_rev <- c_gt %>% diff() %>% rev()
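## optional sanity check (an addition): the discretized pmf should sum to ~1,
## since essentially no generation-time mass lies beyond T days
stopifnot(abs(sum(p_gt_rev) - 1) < 1e-6)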
## discretized incubation-period distribution (from infection to onset) (Linton et al., 2020; right-truncated, excluding Wuhan residents)
c_inc <- plnorm(0:T, meanlog = 1.519, sdlog = 0.615)
p_inc_rev <- c_inc %>% diff() %>% rev()
## discretized distribution of the delay from infection to report
## onset-to-report delay (right-truncated, estimated from the MHLW data): dweibull(t, shape = 1.741, scale = 8.573)
## the infection-to-report CDF convolves the incubation density with the onset-to-report CDF
cdf_rep <- Vectorize(function(t) {
  integrand <- function(x) {
    pweibull(t - x, shape = 1.741, scale = 8.573) * dlnorm(x, meanlog = 1.519, sdlog = 0.615)
  }
  val <- integrate(integrand, lower = 0, upper = t)$value
  return(val)
})
c_rep <- cdf_rep(0:T)
p_rep_rev <- c_rep %>% diff() %>% rev()
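## optional Monte Carlo cross-check (an addition, not in the original analysis):
## since the onset-to-report delay is non-negative, cdf_rep(t) equals
## P(incubation + onset-to-report delay <= t) and can be verified by simulation
set.seed(0)
sim_delay <- rlnorm(1e5, meanlog = 1.519, sdlog = 0.615) +
  rweibull(1e5, shape = 1.741, scale = 8.573)
stopifnot(max(abs(c_rep - ecdf(sim_delay)(0:T))) < 0.01)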
##------ estimation ------
stan_data <- list(
  T = T,
  domestic_onset = df_cases$domestic_onset,
  imported_onset = df_cases$imported_onset,
  domestic_report = df_cases$domestic_report,
  imported_report = df_cases$imported_report,
  p_gt_rev = p_gt_rev,
  p_inc_rev = p_inc_rev,
  p_rep_rev = p_rep_rev
)
stanmodel <- stan_model(file='model/model2.stan')
fit <- sampling(stanmodel,
                data = stan_data,
                iter = 6000,
                warmup = 1000,
                thin = 5,
                pars = c('Rt', 's_smooth', 'domestic_infect', 'imported_infect', 'cases_infect'),
                seed = 1234)
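## with rstan's default 4 chains, iter = 6000, warmup = 1000 and thin = 5 keep
## (6000 - 1000) / 5 = 1000 post-warmup draws per chain, i.e. 4000 draws in total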
write.table(data.frame(summary(fit)$summary), file='output/fit-summary-model2.csv', sep=',', quote=TRUE, col.names=NA)
save.image(file = 'output/result-model2.RData')
##------ plot ------
ms <- rstan::extract(fit)
qua <- apply(ms$Rt, 2, quantile, probs=c(2.5, 25, 50, 75, 97.5)/100)
d_plot <- data.frame(day = df_cases$date[-T], t(qua), check.names = FALSE) %>%
  filter(between(day, as.Date('2020-02-01'), as.Date('2020-05-03')))
qua <- apply(ms$cases_infect, 2, quantile, probs=c(2.5, 25, 50, 75, 97.5)/100)
d_plot2 <- data.frame(day = df_cases$date, t(qua), check.names = FALSE) %>%
  filter(between(day, as.Date('2020-02-01'), as.Date('2020-05-03')))
scaling <- 130  # ratio between the Rt axis and the secondary cases axis
p <- ggplot() +
  geom_bar(data=d_plot2, aes(x=day, y=`50%`/scaling), fill='#FAAB18', stat='identity', width=0.7) +
  geom_errorbar(data=d_plot2, aes(x=day, ymin=`2.5%`/scaling, ymax=`97.5%`/scaling), width=0.5, alpha=0.4) +
  geom_ribbon(data=d_plot, aes(x=day, ymin=`2.5%`, ymax=`97.5%`), fill='#1380A1', alpha=0.4) +
  geom_line(data=d_plot, aes(x=day, y=`50%`), color='#1380A1') +
  labs(x='\nDate', y='Effective reproduction number\n') +
  theme(text=element_text(size=12), axis.text=element_text(size=10), legend.position='none') +
  scale_x_date(date_labels='%m/%d', date_breaks='10 day', expand=c(0, 0)) +
  scale_y_continuous(limits=c(0, 5.6), expand=c(0, 0),
                     sec.axis=sec_axis(~ . * scaling, breaks=c(200, 400, 600), name='Cases\n')) +
  geom_hline(yintercept=1, linetype='dashed', color='#1380A1', size=0.7)
ggsave(p, file = 'output/fig-model2.png', dpi=300, w=5.6, h=4)
##------ R script fitting model 3 (model/model3.stan) ------
lapply(c('data.table', 'dplyr', 'tidyr', 'rstan', 'ggplot2'), library, character.only=TRUE)
rstan_options(auto_write=TRUE)
options(mc.cores=4)
datemin <- as.Date('2019-12-25') # particular choice
datemax <- as.Date('2020-05-10') # particular choice
##------ loading the dataset ------
fread('data/JapaneseDataCOVID19 (200510).csv') %>%
  mutate(report = if_else(is.na(confirmed), reported, confirmed),
         report = as.Date(report),
         onset = as.Date(onset)) %>%
  select(-confirmed, -reported) -> df
df_onset <- df %>%
  filter(!is.na(onset)) %>%
  group_by(onset, exp_type) %>%
  summarise(count = n()) %>% ungroup() %>%
  spread(key = exp_type, value = count, fill = 0) %>%
  rename(domestic_onset = domestic, imported_onset = imported)
df_report <- df %>%
  filter(is.na(onset)) %>%
  group_by(report, exp_type) %>%
  summarise(count = n()) %>% ungroup() %>%
  spread(key = exp_type, value = count, fill = 0) %>%
  rename(domestic_report = domestic, imported_report = imported)
df_cases <- data.frame(date = seq(datemin, datemax, by = 1)) %>%
  left_join(df_onset, by = c('date' = 'onset')) %>%
  left_join(df_report, by = c('date' = 'report')) %>%
  replace_na(list(domestic_onset = 0, imported_onset = 0, domestic_report = 0, imported_report = 0))
##------ predetermined distributions ------
## discretized generation-time distribution (Nishiura et al., 2020)
T <- nrow(df_cases)
c_gt <- pweibull(0:T, shape = 2.305, scale = 5.452)
p_gt_rev <- c_gt %>% diff() %>% rev()
## discretized incubation-period distribution (from infection to onset) (Linton et al., 2020; right-truncated, excluding Wuhan residents)
c_inc <- plnorm(0:T, meanlog = 1.519, sdlog = 0.615)
p_inc_rev <- c_inc %>% diff() %>% rev()
## discretized distribution of the delay from infection to report
## onset-to-report delay (right-truncated, estimated from the MHLW data): dweibull(t, shape = 1.741, scale = 8.573)
## the infection-to-report CDF convolves the incubation density with the onset-to-report CDF
cdf_rep <- Vectorize(function(t) {
  integrand <- function(x) {
    pweibull(t - x, shape = 1.741, scale = 8.573) * dlnorm(x, meanlog = 1.519, sdlog = 0.615)
  }
  val <- integrate(integrand, lower = 0, upper = t)$value
  return(val)
})
c_rep <- cdf_rep(0:T)
p_rep_rev <- c_rep %>% diff() %>% rev()
p_rep_cum <- c_rep %>% rev() %>% .[1:T]
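## p_rep_cum[t] = cdf_rep(T - t + 1): the probability that an infection on day t
## has already been reported by the final observation day T. Model 3 multiplies the
## latent infection curves by this factor to match the observed (back-projected)
## infections, which corrects the undercount of recent infections.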
##------ estimation ------
stan_data <- list(
  T = T,
  domestic_onset = df_cases$domestic_onset,
  imported_onset = df_cases$imported_onset,
  domestic_report = df_cases$domestic_report,
  imported_report = df_cases$imported_report,
  p_gt_rev = p_gt_rev,
  p_inc_rev = p_inc_rev,
  p_rep_rev = p_rep_rev,
  p_rep_cum = p_rep_cum
)
stanmodel <- stan_model(file='model/model3.stan')
fit <- sampling(stanmodel,
                data = stan_data,
                iter = 6000,
                warmup = 1000,
                thin = 5,
                pars = c('Rt', 's_smooth', 'domestic_infect_latent', 'imported_infect_latent', 'cases_infect_latent', 'domestic_infect', 'imported_infect'),
                seed = 1234)
write.table(data.frame(summary(fit)$summary), file='output/fit-summary-model3.csv', sep=',', quote=TRUE, col.names=NA)
save.image(file = 'output/result-model3.RData')
##------ plot ------
ms <- rstan::extract(fit)
qua <- apply(ms$Rt, 2, quantile, probs=c(2.5, 25, 50, 75, 97.5)/100)
d_plot <- data.frame(day = df_cases$date[-T], t(qua), check.names = FALSE) %>%
  filter(between(day, as.Date('2020-02-01'), as.Date('2020-05-03')))
qua <- apply(ms$cases_infect_latent, 2, quantile, probs=c(2.5, 25, 50, 75, 97.5)/100)
d_plot2 <- data.frame(day = df_cases$date, t(qua), check.names = FALSE) %>%
  filter(between(day, as.Date('2020-02-01'), as.Date('2020-05-03')))
scaling <- 130
p <- ggplot() +
  geom_bar(data=d_plot2, aes(x=day, y=`50%`/scaling), fill='#FAAB18', stat='identity', width=0.7) +
  geom_errorbar(data=d_plot2, aes(x=day, ymin=`2.5%`/scaling, ymax=`97.5%`/scaling), width=0.5, alpha=0.4) +
  geom_ribbon(data=d_plot, aes(x=day, ymin=`2.5%`, ymax=`97.5%`), fill='#1380A1', alpha=0.4) +
  geom_line(data=d_plot, aes(x=day, y=`50%`), color='#1380A1') +
  labs(x='\nDate', y='Effective reproduction number\n') +
  theme(text=element_text(size=12), axis.text=element_text(size=10), legend.position='none') +
  scale_x_date(date_labels='%m/%d', date_breaks='10 day', expand=c(0, 0)) +
  scale_y_continuous(limits=c(0, 5.6), expand=c(0, 0),
                     sec.axis=sec_axis(~ . * scaling, breaks=c(200, 400, 600), name='Cases\n')) +
  geom_hline(yintercept=1, linetype='dashed', color='#1380A1', size=0.7)
ggsave(p, file = 'output/fig-model3.png', dpi=300, w=5.6, h=4)
MatsuuraKentaro (author) commented on May 19, 2020:

Hi Andrei,

Thank you very much for your comments.

> Your algorithm of smoothing leads to the uniform error s_smooth over the whole period of time and does not group the cases of infections. This leads to the rather large uncertainty of Rt at the beginning of the outbreak.

What does "group" mean? The error bars at the beginning of the outbreak on slide #96 of moid.pdf seem to be proportional to the variance of the Poisson distribution, and I feel they are too narrow there. My smoothing method, on the other hand, incorporates the uncertainty of the incubation period (i.e. uncertainty along the time axis), so its error bars are wider.

Also, my smoothing method has the same form as a first-order trend model in a state-space model, and there is room to change it. For example, a stronger second-order trend model could be used:

domestic_infect_from_onset_raw[3:T] ~ normal(2*domestic_infect_from_onset_raw[2:(T-1)] - domestic_infect_from_onset_raw[1:(T-2)], s_smooth);

That means "domestic_infect_from_onset_raw[t] - domestic_infect_from_onset_raw[t-1] ~ normal(domestic_infect_from_onset_raw[t-1] - domestic_infect_from_onset_raw[t-2], s_smooth)" for t = 3, 4, ..., T.
However, strong smoothing produces optimistic (overly narrow) credible intervals. Since these estimates feed into decision-making, we adopted the weaker smoothing: we should be pessimistic and avoid understating the credible intervals.
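To see why the second-order model smooths more strongly: the first-order prior makes the level a random walk, while the second-order prior makes the slope a random walk, giving locally linear, visually smoother paths. A quick R simulation (illustrative only) of both processes with the same innovation scale shows the difference:

set.seed(3)
T <- 100; s <- 0.1
first  <- cumsum(rnorm(T, 0, s))          # x[t] ~ normal(x[t-1], s): random-walk level
second <- cumsum(cumsum(rnorm(T, 0, s)))  # x[t] ~ normal(2*x[t-1] - x[t-2], s): random-walk slope
matplot(cbind(first, second), type = 'l', lty = 1, xlab = 't', ylab = 'simulated trend')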

The Poisson distribution in Stan accepts only integer response variables, but it can be extended to continuous values using the gamma function. The normalizing constant then depends on the parameter; however, since it is close to 1, the continuous Poisson distribution in the Stan code above works as an approximation. See:
https://stats.stackexchange.com/questions/310676/continuous-generalization-of-the-negative-binomial-distribution/311927
and, for examples of use:
http://ac.inf.elte.hu/Vol_039_2013/137_39.pdf
In addition, Sungmok Jung also uses the continuous Poisson distribution in In [49] of the MLE version:

return(-(-Cs+dt.backproj.T$domestic_delay[t+1]*log(Cs)-lgamma(dt.backproj.T$domestic_delay[t+1]+1)))}

The Poisson distribution in OpenBUGS is also implemented as a continuous distribution via the gamma function, but I can't remember the URL of the implementation (Pascal code).
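The claim that the normalizing constant is close to 1 is easy to check numerically (an illustrative R snippet, not part of the model code):

## integrate the unnormalized continuous-Poisson density over y >= 0
dens <- function(y, lam) exp(y * log(lam) - lam - lgamma(y + 1))
norm_const <- function(lam)
  integrate(dens, lower = 0, upper = lam + 30 * sqrt(lam) + 10, lam = lam)$value
sapply(c(1, 5, 20), norm_const)  # all within a few percent of 1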

I agree about the difficulty of nowcasting. I also had to truncate the estimated results for the last week. However, looking at the distribution of reporting delays, I think truncating two weeks is overkill; data from one to two weeks ago can still be used effectively, so it would not be a bad idea to show that period as a way of conveying the uncertainty.
Also, if the smoothing method is a second-order trend model, Rt will not necessarily have a broad credible interval centered on 1, because Rt is then computed taking recent trends in the case counts into account. Still, I think we should avoid situations where the results depend heavily on the choice of trend model. After all, nowcasting is a difficult problem.

Best regards,
Kentaro
