# dirkschumacher/disease_video.R

## disease_video.R
# LICENSE MIT
# Data from RKI with special Terms

library(dplyr)
library(readr)
library(tidyr)

# go to https://survstat.rki.de/Content/Query/Create.aspx
# Selected calendar weeks for rows and diseases for columns.
# had to manually edit, because not a valid csv
# Remove first line
# saved with UTF8 encoding

# Load the manually cleaned SurvStat export. The file is tab-separated even
# though it carries a .csv extension.
data <- read_tsv(file = "Data.csv")

# Replace every missing cell by 0 (presumably: no cases reported that week).
data <- replace(data, is.na(data), 0)

# Reshape to long format: every column except isoweek becomes a
# (disease, count) pair, giving one row per week and disease.
long_data <- data %>%
  gather(key = disease, value = count, -isoweek)

# Rank the diseases by their total case count over the 2016 calendar weeks
# and keep the nine with the highest totals.
top_diseases <- long_data %>%
  # isoweek labels are compared as strings against "2016-KW01".."2016-KW52"
  dplyr::filter(isoweek >= "2016-KW01" & isoweek <= "2016-KW52") %>%
  dplyr::group_by(disease) %>%
  dplyr::summarise(count = sum(count, na.rm = TRUE)) %>%
  dplyr::arrange(dplyr::desc(count)) %>%
  head(n = 9)

# Lookup table from the German disease names used in the data (vector names)
# to the English labels shown in the plot (vector values).
disease_translation <- setNames(
  c(
    "Norovirus-Gastroenteritis",
    "Campylobacter-Enteritis",
    "Influenza",
    "Chickenpox",
    "Rotavirus-Gastroenteritis",
    "Pertussis",
    "Salmonellosis",
    "Borreliosis",
    "Tuberculosis"
  ),
  c(
    "Norovirus-Gastroenteritis",
    "Campylobacter-Enteritis",
    "Influenza",
    "Windpocken",
    "Rotavirus-Gastroenteritis",
    "Keuchhusten",
    "Salmonellose",
    "Borreliose",
    "Tuberkulose"
  )
)

# Build the plotting data: weekly counts from 2005 onwards for the diseases in
# top_diseases, with English labels and the helper columns used by the plot.
plot_data <- long_data %>%
  semi_join(top_diseases, by = "disease") %>%
  filter(isoweek >= "2005-KW01") %>%
  # Translate the German names. A disease without an entry in
  # disease_translation keeps its original name instead of turning into NA
  # (which would otherwise produce an NA facet). unname() drops the lookup
  # names so the column is a plain character vector.
  mutate(disease = coalesce(unname(disease_translation[disease]), disease)) %>%
  # Order the factor levels (and thus the facets) by descending mean count.
  mutate(disease = forcats::fct_reorder(factor(disease), count,
                                        function(x) -1 * mean(x))) %>%
  mutate(
    # isoweek looks like "2016-KW01": last two characters are the week,
    # first four the year.
    week = as.integer(stringr::str_sub(isoweek, start = -2)),
    year = stringr::str_sub(isoweek, end = 4),
    # One line per disease and year.
    line_group = paste0(disease, "#", year),
    # Frame label without the "KW" marker, e.g. "2016-01".
    iso_week = stringr::str_replace(isoweek, "KW", "")
  ) %>%
  # Keep calendar weeks 1-52 only (this drops week 53 of long ISO years).
  filter(week >= 1, week <= 52)

# For every disease and every week, fit a model to all observations of that
# disease up to and including that week. Each fit contributes one fitted curve,
# tagged with fit_week so it can be shown in the matching animation frame.
library(broom)
gam_smooth <- plot_data %>%
  group_by(disease, isoweek) %>%
  do({
    di <- head(.$disease, 1)
    isoweek_limit <- head(.$isoweek, 1)
    # All rows of this disease observed up to the current week.
    fit_data <- filter(plot_data, disease == di, isoweek <= isoweek_limit)
    if (nrow(fit_data) <= 10) {
      # Not enough observations yet: contribute no rows for this week.
      data.frame()
    } else {
      # Poisson GAM with a cyclic ("cc") smooth over the calendar week.
      fit <- mgcv::gam(count ~ s(week, bs = "cc"), family = poisson,
                       data = fit_data)
      data.frame(
        fit_week = head(.$iso_week, 1),
        iso_week = fit_data$iso_week,
        week = fit_data$week,
        disease = di,
        # Fitted values on the response scale. Taken directly from predict()
        # because broom's tidy() for plain numeric vectors is deprecated.
        x = as.numeric(predict(fit, type = "response")),
        # Keep labels as character on every R version so the per-group
        # results bind without factor-level mismatches.
        stringsAsFactors = FALSE
      )
    }
  }) %>%
  # The grouping is only needed for do(); drop it so the mutate below runs
  # once over the whole table.
  ungroup() %>%
  mutate(fit_week = as.character(fit_week),
         line_group = paste0(disease, "#", fit_week))

# now animate it
library(ggplot2)
library(gganimate)
library(ggthemes)

# Plot definition for the legacy gganimate() API. The `frame` aesthetic names
# the column whose values define the animation frames; `cumulative = TRUE` is
# the gganimate flag for keeping rows of earlier frames visible.
p <- ggplot(plot_data, aes(x = week)) +
  # Observed counts, one frame per iso_week.
  geom_point(
    aes(y = count, frame = iso_week, group = disease),
    color = "red", alpha = 0.6
  ) +
  # Faint history lines, one per disease and year (line_group).
  geom_line(
    aes(y = count, frame = iso_week, cumulative = TRUE, group = line_group),
    color = "red", alpha = 0.1, size = 1.2
  ) +
  facet_wrap(~disease, scales = "free_y") +
  ggthemes::theme_fivethirtyeight() +
  # Dashed curve: fitted values from gam_smooth for the fit of that frame.
  geom_line(
    data = gam_smooth,
    aes(y = x, x = week, frame = fit_week, group = line_group),
    color = "red", linetype = "dashed", alpha = 0.8
  ) +
  # Applied after the complete theme so these settings override it.
  theme(axis.title = element_text(),
        strip.text = element_text(hjust = 0, size = rel(1), face = "bold")) +
  ylab("Reported cases (different y scales)") +
  xlab("Calendar ISO week") +
  # NOTE(review): the trailing " - " presumably precedes the frame label that
  # gganimate appends to the title -- confirm.
  ggtitle("The top 9 reported infectious diseases in Germany from 2005 to 2017 - ") +
  expand_limits(y = 0)
p

# Render the animation to video.mp4; `saver` is left at its default.
gganimate(p, interval = .05, ani.width = 900, ani.height = 600,
          filename = "video.mp4")
	# LICENSE MIT
	# Data from RKI with special Terms

	library(dplyr)
	library(readr)
	library(tidyr)

	# go to https://survstat.rki.de/Content/Query/Create.aspx
	# Selected calendar weeks for rows and diseases for columns.
	# had to manually edit, because not a valid csv
	# Remove first line
	# saved with UTF8 encoding

	# Load the manually cleaned SurvStat export. The file is tab-separated even
	# though it carries a .csv extension.
	data <- read_tsv(file = "Data.csv")

	# Replace every missing cell by 0 (presumably: no cases reported that week).
	data <- replace(data, is.na(data), 0)

	# Reshape to long format: every column except isoweek becomes a
	# (disease, count) pair, giving one row per week and disease.
	long_data <- data %>%
	  gather(key = disease, value = count, -isoweek)

	# Rank the diseases by their total case count over the 2016 calendar weeks
	# and keep the nine with the highest totals.
	top_diseases <- long_data %>%
	  # isoweek labels are compared as strings against "2016-KW01".."2016-KW52"
	  dplyr::filter(isoweek >= "2016-KW01" & isoweek <= "2016-KW52") %>%
	  dplyr::group_by(disease) %>%
	  dplyr::summarise(count = sum(count, na.rm = TRUE)) %>%
	  dplyr::arrange(dplyr::desc(count)) %>%
	  head(n = 9)

	# Lookup table from the German disease names used in the data (vector names)
	# to the English labels shown in the plot (vector values).
	disease_translation <- setNames(
	  c(
	    "Norovirus-Gastroenteritis",
	    "Campylobacter-Enteritis",
	    "Influenza",
	    "Chickenpox",
	    "Rotavirus-Gastroenteritis",
	    "Pertussis",
	    "Salmonellosis",
	    "Borreliosis",
	    "Tuberculosis"
	  ),
	  c(
	    "Norovirus-Gastroenteritis",
	    "Campylobacter-Enteritis",
	    "Influenza",
	    "Windpocken",
	    "Rotavirus-Gastroenteritis",
	    "Keuchhusten",
	    "Salmonellose",
	    "Borreliose",
	    "Tuberkulose"
	  )
	)

	# Build the plotting data: weekly counts from 2005 onwards for the diseases in
	# top_diseases, with English labels and the helper columns used by the plot.
	plot_data <- long_data %>%
	  semi_join(top_diseases, by = "disease") %>%
	  filter(isoweek >= "2005-KW01") %>%
	  # Translate the German names. A disease without an entry in
	  # disease_translation keeps its original name instead of turning into NA
	  # (which would otherwise produce an NA facet). unname() drops the lookup
	  # names so the column is a plain character vector.
	  mutate(disease = coalesce(unname(disease_translation[disease]), disease)) %>%
	  # Order the factor levels (and thus the facets) by descending mean count.
	  mutate(disease = forcats::fct_reorder(factor(disease), count,
	                                        function(x) -1 * mean(x))) %>%
	  mutate(
	    # isoweek looks like "2016-KW01": last two characters are the week,
	    # first four the year.
	    week = as.integer(stringr::str_sub(isoweek, start = -2)),
	    year = stringr::str_sub(isoweek, end = 4),
	    # One line per disease and year.
	    line_group = paste0(disease, "#", year),
	    # Frame label without the "KW" marker, e.g. "2016-01".
	    iso_week = stringr::str_replace(isoweek, "KW", "")
	  ) %>%
	  # Keep calendar weeks 1-52 only (this drops week 53 of long ISO years).
	  filter(week >= 1, week <= 52)

	# For every disease and every week, fit a model to all observations of that
	# disease up to and including that week. Each fit contributes one fitted curve,
	# tagged with fit_week so it can be shown in the matching animation frame.
	library(broom)
	gam_smooth <- plot_data %>%
	  group_by(disease, isoweek) %>%
	  do({
	    di <- head(.$disease, 1)
	    isoweek_limit <- head(.$isoweek, 1)
	    # All rows of this disease observed up to the current week.
	    fit_data <- filter(plot_data, disease == di, isoweek <= isoweek_limit)
	    if (nrow(fit_data) <= 10) {
	      # Not enough observations yet: contribute no rows for this week.
	      data.frame()
	    } else {
	      # Poisson GAM with a cyclic ("cc") smooth over the calendar week.
	      fit <- mgcv::gam(count ~ s(week, bs = "cc"), family = poisson,
	                       data = fit_data)
	      data.frame(
	        fit_week = head(.$iso_week, 1),
	        iso_week = fit_data$iso_week,
	        week = fit_data$week,
	        disease = di,
	        # Fitted values on the response scale. Taken directly from predict()
	        # because broom's tidy() for plain numeric vectors is deprecated.
	        x = as.numeric(predict(fit, type = "response")),
	        # Keep labels as character on every R version so the per-group
	        # results bind without factor-level mismatches.
	        stringsAsFactors = FALSE
	      )
	    }
	  }) %>%
	  # The grouping is only needed for do(); drop it so the mutate below runs
	  # once over the whole table.
	  ungroup() %>%
	  mutate(fit_week = as.character(fit_week),
	         line_group = paste0(disease, "#", fit_week))

	# now animate it
	library(ggplot2)
	library(gganimate)
	library(ggthemes)

	# Plot definition for the legacy gganimate() API. The `frame` aesthetic names
	# the column whose values define the animation frames; `cumulative = TRUE` is
	# the gganimate flag for keeping rows of earlier frames visible.
	p <- ggplot(plot_data, aes(x = week)) +
	  # Observed counts, one frame per iso_week.
	  geom_point(
	    aes(y = count, frame = iso_week, group = disease),
	    color = "red", alpha = 0.6
	  ) +
	  # Faint history lines, one per disease and year (line_group).
	  geom_line(
	    aes(y = count, frame = iso_week, cumulative = TRUE, group = line_group),
	    color = "red", alpha = 0.1, size = 1.2
	  ) +
	  facet_wrap(~disease, scales = "free_y") +
	  ggthemes::theme_fivethirtyeight() +
	  # Dashed curve: fitted values from gam_smooth for the fit of that frame.
	  geom_line(
	    data = gam_smooth,
	    aes(y = x, x = week, frame = fit_week, group = line_group),
	    color = "red", linetype = "dashed", alpha = 0.8
	  ) +
	  # Applied after the complete theme so these settings override it.
	  theme(axis.title = element_text(),
	        strip.text = element_text(hjust = 0, size = rel(1), face = "bold")) +
	  ylab("Reported cases (different y scales)") +
	  xlab("Calendar ISO week") +
	  # NOTE(review): the trailing " - " presumably precedes the frame label that
	  # gganimate appends to the title -- confirm.
	  ggtitle("The top 9 reported infectious diseases in Germany from 2005 to 2017 - ") +
	  expand_limits(y = 0)
	p

	# Render the animation to video.mp4; `saver` is left at its default.
	gganimate(p, interval = .05, ani.width = 900, ani.height = 600,
	          filename = "video.mp4")