Skip to content

Instantly share code, notes, and snippets.

@jakeybob
Created June 7, 2021 20:22
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jakeybob/902ff979cd8b7c5cabd8b0628a2b70ff to your computer and use it in GitHub Desktop.
Save jakeybob/902ff979cd8b7c5cabd8b0628a2b70ff to your computer and use it in GitHub Desktop.
Scotland standardised age/sex interpolated populations
library(tidyverse)
library(janitor)
library(lubridate)
library(zoo)
library(ckanr)
# generates age/sex population lookup for Scotland based on NRS mid-year
# population estimates/projections, interpolated linearly to day level
# (July 2nd used as mid-year reference date).
#
# output is aggregated to age band and sex level, with data being
# a) population of that age/sex breakdown on each date,
# b) national population on each date, and
# c) standardised age/sex population on each date (standardised to most recent year in the NRS estimates opendata)
# SETUP ####
# output file will be clipped to these dates
min_date <- dmy("01/06/2013")
max_date <- dmy("01/01/2023")
ckanr_setup(url = "https://www.opendata.nhs.scot/")
res_est <- resource_show(id = "27a72cc8-d6d8-430c-8b4f-3109a9ceadb1") # pop estimates
res_proj <- resource_show(id = "0876fc67-05e6-4e87-bc30-c4b0756fff04") # pop projections
# GET DATA ####
# national population estimates (retrospective)
data_est <- ckan_fetch(x=res_est$url) %>%
clean_names() %>%
filter(hb == "S92000003",
sex != "All") %>%
select(year, sex, contains("age"), -contains("ages"))
# national population projections (for years with no estimates available)
data_proj <- ckan_fetch(x=res_proj$url) %>%
clean_names() %>%
filter(hb == "S92000003",
sex != "All") %>%
select(year, sex, contains("age"), -contains("ages")) %>%
filter(year %in% unique(data_est$year) == FALSE)
# combine and format estimate and projections
data <- data_est %>%
bind_rows(data_proj) %>%
distinct() %>%
mutate(sex = tolower(sex)) %>%
arrange(year, sex) %>%
pivot_longer(starts_with("age"), names_to = "age", values_to = "pop") %>%
mutate(age = as.integer(str_extract(age, "\\d+"))) %>%
mutate(date = dmy(paste0("02/07/", year))) %>%
select(-year) %>%
mutate(age_band = case_when(age < 20 ~ "<20",
age <= 29 ~ "20-29",
age <= 39 ~ "30-39",
age <= 49 ~ "40-49",
age <= 59 ~ "50-59",
age <= 69 ~ "60-69",
age <= 79 ~ "70-79",
age >= 80 ~ "80+",
TRUE ~ NA_character_)) %>%
group_by(sex, date, age_band) %>%
summarise(pop = sum(pop)) %>%
group_by(date) %>%
mutate(pop_scot = sum(pop))
# STANDARDISE ####
# use most recent NRS population estimate (i.e. not projection) as the year to standardise to
standard_pop <- data %>%
ungroup() %>%
filter(date == dmy(paste0("02/07/", max(data_est$year)))) %>%
mutate(factor = pop/sum(pop)) %>%
select(-pop)
data <- data %>%
left_join(standard_pop %>% select(-date, -pop_scot)) %>%
mutate(pop_std = factor*pop_scot) %>% select(-factor) %>% ungroup()
# create df of all date/sex/age combinations, join on the population data,
# then linearly interpolate between all the mid-year points
df <- crossing(date = seq.Date(from = dmy(paste0("02/07/", min(data_est$year, data_proj$year))),
to = dmy(paste0("02/07/", max(data_est$year, data_proj$year))),
by = "day"),
sex = unique(data$sex),
age_band = unique(data$age_band)) %>%
left_join(data) %>%
group_by(sex, age_band) %>%
arrange(sex, age_band, date) %>%
mutate(across(starts_with("pop"), ~na.approx(.))) %>%
ungroup() %>%
select(date, sex, age_band, starts_with("pop"))
# OUTPUT ####
write_rds(df %>% filter(date >= min_date, date <= max_date),
"std_pop_daily.rds", compress = "gz")
# # quick plot test
# df %>%
# filter(date >= min_date, date <= max_date) %>%
# ggplot(aes(x = date, y = pop_std, colour = sex)) +
# geom_line() +
# facet_wrap(~age_band)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment