Last active
September 14, 2022 21:06
-
-
Save TimTaylor/4187a7a49cd9dc29846e223a6254c920 to your computer and use it in GitHub Desktop.
pop categories to ages - bootstrap approach
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
url <- "https://raw.githubusercontent.com/TimTaylor/census_pop_2021/main/output/census-2021-england-and-wales-total-population.csv" | |
(dat <- read.csv(url)) | |
# add start and end columns | |
start <- sub("\\[([0-9]+), .+)", "\\1", dat$age_category) | |
end <- sub(".+, (.+))", "\\1", dat$age_category) | |
dat$start <- as.double(start) | |
dat$end <- as.double(end) | |
# function to bootstrap with a single category | |
bootstrap_category <- function( | |
interval_start, | |
interval_end, | |
count, | |
reps = 1, | |
max_end = 100 | |
) { | |
if (!is.infinite(interval_end)) { | |
ages <- interval_start:(interval_end-1) | |
weights <- 1 / (interval_end - interval_start) | |
weights <- rep_len(weights, length(ages)) | |
} else { | |
# linear tail off to zero | |
if (max_end <= interval_start) | |
max_end <- interval_start + 1 | |
interval_end <- max_end | |
ages <- interval_start:(interval_end-1) | |
n <- interval_end - interval_start | |
m <- 200/(n*(n+1)) | |
weights <- m * ( (interval_end) - ages ) / 100 | |
} | |
`row.names<-`( | |
rmultinom(n=reps, size = count, prob = weights), | |
ages | |
) | |
} | |
# function to bootstrap across all categories | |
bootstrap <- function(start_vector, end_vector, count_vector, n = 1, max_end = 100) { | |
out <- .mapply( | |
bootstrap_category, | |
dots = list( | |
interval_start = dat$start, | |
interval_end = dat$end, | |
count = dat$value | |
), | |
MoreArgs = list( | |
reps = n, | |
max_end = max_end | |
) | |
) | |
do.call(rbind, out) | |
} | |
# example with 5 different bootstraps | |
# note - rownames are ages | |
out=bootstrap(dat$start, dat$end, dat$value, n=5) | |
head(out) | |
tail(out) | |
# quick (memory main issue as returns as matrix) | |
# million bootstraps | |
system.time( | |
out <- bootstrap(dat$start, dat$end, dat$value, n=1000000) | |
) | |
# if only one sample needed | |
# single bootstrap | |
system.time( | |
out <- bootstrap(dat$start, dat$end, dat$value, n=1) | |
) | |
Most time will be spent in re-categorisation face
# with the recategorisation
# million bootstraps
library(ympes)
system.time({
out <- bootstrap(dat$start, dat$end, dat$value, n=1000000)
out2 <- apply(out, 2, imp_aggregate_age_counts_c)
})
#> user system elapsed
#> 22.397 0.362 22.809
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
with timings - note single run negligible so would need microbenchmark
Created on 2022-09-14 with reprex v2.0.2