Skip to content

Instantly share code, notes, and snippets.

@TimTaylor
Last active September 14, 2022 21:06
Show Gist options
  • Save TimTaylor/4187a7a49cd9dc29846e223a6254c920 to your computer and use it in GitHub Desktop.
Save TimTaylor/4187a7a49cd9dc29846e223a6254c920 to your computer and use it in GitHub Desktop.
pop categories to ages - bootstrap approach
url <- "https://raw.githubusercontent.com/TimTaylor/census_pop_2021/main/output/census-2021-england-and-wales-total-population.csv"
(dat <- read.csv(url))
# add start and end columns
start <- sub("\\[([0-9]+), .+)", "\\1", dat$age_category)
end <- sub(".+, (.+))", "\\1", dat$age_category)
dat$start <- as.double(start)
dat$end <- as.double(end)
# function to bootstrap with a single category
bootstrap_category <- function(
interval_start,
interval_end,
count,
reps = 1,
max_end = 100
) {
if (!is.infinite(interval_end)) {
ages <- interval_start:(interval_end-1)
weights <- 1 / (interval_end - interval_start)
weights <- rep_len(weights, length(ages))
} else {
# linear tail off to zero
if (max_end <= interval_start)
max_end <- interval_start + 1
interval_end <- max_end
ages <- interval_start:(interval_end-1)
n <- interval_end - interval_start
m <- 200/(n*(n+1))
weights <- m * ( (interval_end) - ages ) / 100
}
`row.names<-`(
rmultinom(n=reps, size = count, prob = weights),
ages
)
}
# function to bootstrap across all categories
bootstrap <- function(start_vector, end_vector, count_vector, n = 1, max_end = 100) {
out <- .mapply(
bootstrap_category,
dots = list(
interval_start = dat$start,
interval_end = dat$end,
count = dat$value
),
MoreArgs = list(
reps = n,
max_end = max_end
)
)
do.call(rbind, out)
}
# example with 5 different bootstraps
# note - rownames are ages
out=bootstrap(dat$start, dat$end, dat$value, n=5)
head(out)
tail(out)
# quick (memory main issue as returns as matrix)
# million bootstraps
system.time(
out <- bootstrap(dat$start, dat$end, dat$value, n=1000000)
)
# if only one sample needed
# single bootstrap
system.time(
out <- bootstrap(dat$start, dat$end, dat$value, n=1)
)
@TimTaylor
Copy link
Author

with timings - note single run negligible so would need microbenchmark

url <- "https://raw.githubusercontent.com/TimTaylor/census_pop_2021/main/output/census-2021-england-and-wales-total-population.csv"
(dat <- read.csv(url))
#>    area_code         area_name age_category   value
#> 1  K04000001 England and Wales       [0, 5) 3232100
#> 2  K04000001 England and Wales      [5, 10) 3524600
#> 3  K04000001 England and Wales     [10, 15) 3595900
#> 4  K04000001 England and Wales     [15, 20) 3394700
#> 5  K04000001 England and Wales     [20, 25) 3602100
#> 6  K04000001 England and Wales     [25, 30) 3901800
#> 7  K04000001 England and Wales     [30, 35) 4148800
#> 8  K04000001 England and Wales     [35, 40) 3981600
#> 9  K04000001 England and Wales     [40, 45) 3755700
#> 10 K04000001 England and Wales     [45, 50) 3788700
#> 11 K04000001 England and Wales     [50, 55) 4123400
#> 12 K04000001 England and Wales     [55, 60) 4029000
#> 13 K04000001 England and Wales     [60, 65) 3455700
#> 14 K04000001 England and Wales     [65, 70) 2945100
#> 15 K04000001 England and Wales     [70, 75) 2978000
#> 16 K04000001 England and Wales     [75, 80) 2170300
#> 17 K04000001 England and Wales     [80, 85) 1517000
#> 18 K04000001 England and Wales     [85, 90)  925100
#> 19 K04000001 England and Wales    [90, Inf)  527900

# add start and end columns
start <- sub("\\[([0-9]+), .+)", "\\1", dat$age_category)
end <- sub(".+, (.+))", "\\1", dat$age_category)
dat$start <- as.double(start)
dat$end <- as.double(end)

# function to bootstrap with a single category
bootstrap_category <- function(
    interval_start,
    interval_end,
    count,
    reps = 1,
    max_end = 100
) {

    if (!is.infinite(interval_end)) {
        ages <- interval_start:(interval_end-1)
        weights <- 1 / (interval_end - interval_start)
        weights <- rep_len(weights, length(ages))
    } else {
        if (max_end <= interval_start)
            max_end <- interval_start + 1
        interval_end <- max_end
        ages <- interval_start:(interval_end-1)
        n <- interval_end - interval_start
        m <- 200/(n*(n+1))
        weights <- m * ( (interval_end) - ages ) / 100
    }

    `row.names<-`(
        rmultinom(n=reps, size = count, prob = weights),
        ages
    )
}

# function to bootstrap across all categories
bootstrap <- function(start_vector, end_vector, count_vector, n = 1, max_end = 100) {
    out <- .mapply(
        bootstrap_category,
        dots = list(
            interval_start = dat$start,
            interval_end = dat$end,
            count = dat$value
        ),
        MoreArgs = list(
            reps = n,
            max_end = max_end
        )
    )
    do.call(rbind, out)
}

# example with 5 different bootstraps
# note - rownames are ages
out=bootstrap(dat$start, dat$end, dat$value, n=5)
head(out)
#>     [,1]   [,2]   [,3]   [,4]   [,5]
#> 0 648169 646570 647638 646412 646464
#> 1 645856 645881 645488 646528 645407
#> 2 645694 646339 646655 646774 646797
#> 3 645466 646503 646203 646849 645984
#> 4 646915 646807 646116 645537 647448
#> 5 704035 704529 705090 705070 704904
tail(out)
#>     [,1]  [,2]  [,3]  [,4]  [,5]
#> 94 57574 57363 57283 57538 57390
#> 95 48076 47868 48290 48157 47702
#> 96 38530 38416 38492 38409 38657
#> 97 28807 28922 28703 29197 28668
#> 98 19332 19442 19178 19068 19299
#> 99  9585  9620  9437  9575  9555

# quick (memory main issue as returns as matrix)
# million bootstraps
system.time(
    out <- bootstrap(dat$start, dat$end, dat$value, n=1000000)
)
#>    user  system elapsed 
#>   6.217   0.242   6.472

# if only one sample needed
# single bootstrap
system.time(
    out <- bootstrap(dat$start, dat$end, dat$value, n=1)
)
#>    user  system elapsed 
#>       0       0       0

Created on 2022-09-14 with reprex v2.0.2

@TimTaylor
Copy link
Author

Most time will be spent in re-categorisation face

# with the recategorisation
# million bootstraps
library(ympes)
system.time({
    out <- bootstrap(dat$start, dat$end, dat$value, n=1000000)
    out2 <- apply(out, 2, imp_aggregate_age_counts_c)
})
#>   user  system elapsed 
#> 22.397   0.362  22.809 

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment