Skip to content

Instantly share code, notes, and snippets.

@dirkschumacher
Created August 4, 2017 10:23
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dirkschumacher/4be7daf6ad9f0d4b5e97c793b346507d to your computer and use it in GitHub Desktop.
Save dirkschumacher/4be7daf6ad9f0d4b5e97c793b346507d to your computer and use it in GitHub Desktop.
# LICENSE MIT
# Data from RKI with special Terms
library(dplyr)
library(readr)
library(tidyr)
# go to https://survstat.rki.de/Content/Query/Create.aspx
# Selected calendar weeks for rows and diseases for columns.
# had to manually edit, because not a valid csv
# Remvoe first line
# saved with UTF8 encoding
data <- readr::read_tsv("Data.csv")
data[is.na(data)] <- 0
# bring it to a long format
long_data <- gather(data, disease, count, -isoweek)
# select the top 9 in 2016
top_diseases <- long_data %>%
filter(isoweek >= "2016-KW01", isoweek <= "2016-KW52") %>%
group_by(disease) %>%
summarise(count = sum(count, na.rm = TRUE)) %>%
arrange(desc(count)) %>%
head(9)
# translate to english
disease_translation <- c(
"Norovirus-Gastroenteritis" = "Norovirus-Gastroenteritis",
"Campylobacter-Enteritis" = "Campylobacter-Enteritis",
"Influenza" = "Influenza",
"Windpocken" = "Chickenpox",
"Rotavirus-Gastroenteritis" = "Rotavirus-Gastroenteritis",
"Keuchhusten" = "Pertussis",
"Salmonellose" = "Salmonellosis",
"Borreliose" = "Borreliosis",
"Tuberkulose" = "Tuberculosis"
)
# generate the data for plotting
plot_data <- long_data %>%
semi_join(top_diseases, by = "disease") %>%
filter(isoweek >= "2005-KW01") %>%
mutate(disease = disease_translation[disease]) %>%
mutate(disease = forcats::fct_reorder(factor(disease), count, function(x) -1 * mean(x))) %>%
mutate(week = as.integer(stringr::str_sub(isoweek, start = -2)),
year = stringr::str_sub(isoweek, end = 4),
line_group = paste0(disease, "#", year),
iso_week = stringr::str_replace(isoweek, "KW", "")) %>%
filter(week >= 1, week <= 52)
# we compute a model fit for each week and disease
library(broom)
gam_smooth <- plot_data %>%
group_by(disease, isoweek) %>%
do({
di <- head(.$disease, 1)
isoweek_limit <- head(.$isoweek, 1)
fit_data <- filter(plot_data, disease == di, isoweek <= isoweek_limit)
if (nrow(fit_data) <= 10) {
data.frame()
} else {
fit <- mgcv::gam(count ~ s(week, bs = "cc"), family = poisson, data = fit_data)
cbind(data.frame(fit_week = head(.$iso_week, 1),
iso_week = fit_data$iso_week,
week = fit_data$week,
disease = di), broom::tidy(predict(fit, type = "response")))
}
}) %>%
mutate(line_group = paste0(disease, "#", fit_week)) %>%
mutate(fit_week = as.character(fit_week),
line_group = as.character(line_group))
# now animate it
library(ggplot2)
library(gganimate)
library(ggthemes)
p <- ggplot(plot_data, aes(x = week)) +
geom_point(aes(y = count, frame = iso_week, group = disease), color = "red", alpha = 0.6) +
geom_line(aes(y = count, frame = iso_week, cumulative = TRUE, group = line_group), color = "red", alpha = 0.1, size = 1.2) +
facet_wrap(~disease, scales = "free_y") +
ggthemes::theme_fivethirtyeight() +
geom_line(data = gam_smooth, aes(y = x, x = week,
frame = fit_week,
group = line_group), color = "red", linetype = "dashed", alpha = 0.8) +
theme(axis.title = element_text(),
strip.text = element_text(hjust = 0, size = rel(1), face = "bold")) +
ylab("Reported cases (different y scales)") +
xlab("Calendar ISO week") +
ggtitle("The top 9 reported infectious diseases in Germany from 2005 to 2017 - ") +
expand_limits(y = 0)
p
gganimate(p, interval = .05, ani.width = 900, ani.height = 600, filename = "video.mp4", saver = )
@hrbrmstr
Copy link

hrbrmstr commented Aug 4, 2017

You can use stringi to avoid manual editing:

library(stringi)
library(tidyverse)

lines <- stri_read_lines("~/data/survstat/Data.csv")

stri_split_fixed(lines[2], "\t")[[1]] %>% 
  stri_replace_all_regex('"', "") %>% 
  tidy_names() -> header
header[1:2] <- c("isowk", "col2")

xdf <- read_tsv(paste0(lines[3:length(lines)], collapse="\n"), col_names = header)

xdf[is.na(xdf)] <- 0

select(xdf, -col2) %>% 
  gather(disease, count, -isowk)

## # A tibble: 61,273 x 3
##       isowk                                        disease count
##       <chr>                                          <chr> <dbl>
##  1 2001-w01 (e) Acinetobacter-Infektion oder –Kolonisation     0
##  2 2001-w02 (e) Acinetobacter-Infektion oder –Kolonisation     0
##  3 2001-w03 (e) Acinetobacter-Infektion oder –Kolonisation     0
##  4 2001-w04 (e) Acinetobacter-Infektion oder –Kolonisation     0
##  5 2001-w05 (e) Acinetobacter-Infektion oder –Kolonisation     0
##  6 2001-w06 (e) Acinetobacter-Infektion oder –Kolonisation     0
##  7 2001-w07 (e) Acinetobacter-Infektion oder –Kolonisation     0
##  8 2001-w08 (e) Acinetobacter-Infektion oder –Kolonisation     0
##  9 2001-w09 (e) Acinetobacter-Infektion oder –Kolonisation     0
## 10 2001-w10 (e) Acinetobacter-Infektion oder –Kolonisation     0
## # ... with 61,263 more rows

@knbknb
Copy link

knbknb commented Aug 4, 2017

I had to convert the output file with iconv

# I did this on the linux bash command line
if(! file.exists("survstat/data.tsv")){
        system("iconv -f UTF16 -t UTF8 survstat/Data.csv > survstat/data.tsv")
}

By the way, I'm German, but I've used an English-locale Browser to grab the data. Then Robert-Koch-Institute, RKI, gave me everything in English. No translation was necessary.
These are RKI's terms/translations.

              disease  count
                   <chr>  <dbl>
1 Noroviral gastroenteritis 125945
2        Campylobacteriosis  82145
3                 Influenza  80910
4 Rotavirus gastroenteritis  25953
5                Chickenpox  25504
6                 Pertussis  22160
7             Salmonellosis  14723
8              Lyme disease   8521
9              Tuberculosis   6158

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment