Skip to content

Instantly share code, notes, and snippets.

@TimTaylor
Last active September 14, 2022 21:06
Show Gist options
  • Save TimTaylor/4187a7a49cd9dc29846e223a6254c920 to your computer and use it in GitHub Desktop.
Save TimTaylor/4187a7a49cd9dc29846e223a6254c920 to your computer and use it in GitHub Desktop.
pop categories to ages - bootstrap approach
url <- "https://raw.githubusercontent.com/TimTaylor/census_pop_2021/main/output/census-2021-england-and-wales-total-population.csv"
(dat <- read.csv(url))
# add start and end columns
start <- sub("\\[([0-9]+), .+)", "\\1", dat$age_category)
end <- sub(".+, (.+))", "\\1", dat$age_category)
dat$start <- as.double(start)
dat$end <- as.double(end)
# function to bootstrap with a single category
bootstrap_category <- function(
interval_start,
interval_end,
count,
reps = 1,
max_end = 100
) {
if (!is.infinite(interval_end)) {
ages <- interval_start:(interval_end-1)
weights <- 1 / (interval_end - interval_start)
weights <- rep_len(weights, length(ages))
} else {
# linear tail off to zero
if (max_end <= interval_start)
max_end <- interval_start + 1
interval_end <- max_end
ages <- interval_start:(interval_end-1)
n <- interval_end - interval_start
m <- 200/(n*(n+1))
weights <- m * ( (interval_end) - ages ) / 100
}
`row.names<-`(
rmultinom(n=reps, size = count, prob = weights),
ages
)
}
# function to bootstrap across all categories
bootstrap <- function(start_vector, end_vector, count_vector, n = 1, max_end = 100) {
out <- .mapply(
bootstrap_category,
dots = list(
interval_start = dat$start,
interval_end = dat$end,
count = dat$value
),
MoreArgs = list(
reps = n,
max_end = max_end
)
)
do.call(rbind, out)
}
# example with 5 different bootstraps
# note - rownames are ages
out=bootstrap(dat$start, dat$end, dat$value, n=5)
head(out)
tail(out)
# quick (memory main issue as returns as matrix)
# million bootstraps
system.time(
out <- bootstrap(dat$start, dat$end, dat$value, n=1000000)
)
# if only one sample needed
# single bootstrap
system.time(
out <- bootstrap(dat$start, dat$end, dat$value, n=1)
)
@TimTaylor
Copy link
Author

Most time will be spent in re-categorisation face

# with the recategorisation
# million bootstraps
library(ympes)
system.time({
    out <- bootstrap(dat$start, dat$end, dat$value, n=1000000)
    out2 <- apply(out, 2, imp_aggregate_age_counts_c)
})
#>   user  system elapsed 
#> 22.397   0.362  22.809 

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment