library(tidyverse)
# 4 data sets
# survey
n <- 100
create_survey <- function(n, year, id = 1:n){
tibble(
id = id,
year = year,
province = sample(1:9, size = n, replace = TRUE),
age = sample(18:65, size = n, replace = TRUE),
sex = sample(c("m", "f"), size = n, replace = TRUE)
)
}
create_survey(n = 10, year = 2004)
#> # A tibble: 10 × 5
#> id year province age sex
#> <int> <dbl> <int> <int> <chr>
#> 1 1 2004 4 28 m
#> 2 2 2004 2 46 f
#> 3 3 2004 6 22 m
#> 4 4 2004 7 22 f
#> 5 5 2004 7 51 f
#> 6 6 2004 4 62 f
#> 7 7 2004 1 62 f
#> 8 8 2004 7 37 f
#> 9 9 2004 2 23 m
#> 10 10 2004 9 38 m
survey <- bind_rows(
create_survey(n = 10, year = 2005),
create_survey(n = 10, year = 2007),
create_survey(n = 10, year = 2008),
create_survey(n = 10, year = 2009)
)
survey
#> # A tibble: 40 × 5
#> id year province age sex
#> <int> <dbl> <int> <int> <chr>
#> 1 1 2005 8 46 m
#> 2 2 2005 9 42 m
#> 3 3 2005 8 39 f
#> 4 4 2005 7 41 m
#> 5 5 2005 6 45 m
#> 6 6 2005 6 52 f
#> 7 7 2005 5 19 m
#> 8 8 2005 3 44 f
#> 9 9 2005 2 53 f
#> 10 10 2005 9 44 f
#> # ℹ 30 more rows
ca_postcode <- function(size){
}
child_age <- function(size){
sample(c(NA, 0:18),
size = size,
replace = TRUE)
}
# tax
create_tax <- function(n, year){
tibble(
id = 1:n,
year = year,
postal_code = sample(LETTERS, size = n, replace = TRUE),
child1_ages = child_age(size = n),
child2_ages = child_age(size = n),
income = abs(rnorm(n = n, mean = 70000, sd = 50000)),
child_benefit_amount = sample(c(NA, 6400), size = n, replace = TRUE)
)
}
tax <- map_dfr(.x = 2004:2019,
.f = function(year) create_tax(10, year = year))
tax
#> # A tibble: 160 × 7
#> id year postal_code child1_ages child2_ages income child_benefit_amount
#> <int> <int> <chr> <int> <int> <dbl> <dbl>
#> 1 1 2004 L 1 5 21786. NA
#> 2 2 2004 Q 11 10 46479. 6400
#> 3 3 2004 Z 11 8 50658. 6400
#> 4 4 2004 G 0 0 161264. 6400
#> 5 5 2004 J 0 0 126375. NA
#> 6 6 2004 A 15 3 81539. NA
#> 7 7 2004 E NA 15 130778. 6400
#> 8 8 2004 F 15 10 81270. NA
#> 9 9 2004 C 9 17 22745. 6400
#> 10 10 2004 R 14 7 18867. 6400
#> # ℹ 150 more rows
sample_date <- function(start, end, size){
sample(
seq(as.Date(start),
as.Date(end),
by = "days"),
replace = TRUE,
size = size
)
}
icd_codes <- glue::glue("F{30:39}")
# hospital discharges
create_hospital <- function(n, id = 1:n){
tibble(
id = id,
date = sample_date("2004-01-01", "2009-12-31", size = n),
icd = sample(icd_codes, size = n, replace = TRUE)
)
}
survey <- bind_rows(
create_survey(n = 10, year = 2005),
create_survey(n = 10, year = 2007),
create_survey(n = 10, year = 2008),
create_survey(n = 10, year = 2009)
)
tax <- map_dfr(.x = 2004:2009,
.f = function(year) create_tax(10, year = year))
hospital <- bind_rows(
create_hospital(n = 3, id = 4),
create_hospital(n = 3, id = 12),
create_hospital(n = 1, id = 2),
create_hospital(n = 9, id = 9)
)
survey
#> # A tibble: 40 × 5
#> id year province age sex
#> <int> <dbl> <int> <int> <chr>
#> 1 1 2005 2 42 f
#> 2 2 2005 8 36 f
#> 3 3 2005 6 46 m
#> 4 4 2005 8 57 m
#> 5 5 2005 4 32 f
#> 6 6 2005 7 60 f
#> 7 7 2005 1 57 f
#> 8 8 2005 1 65 m
#> 9 9 2005 1 35 f
#> 10 10 2005 7 62 f
#> # ℹ 30 more rows
tax
#> # A tibble: 60 × 7
#> id year postal_code child1_ages child2_ages income child_benefit_amount
#> <int> <int> <chr> <int> <int> <dbl> <dbl>
#> 1 1 2004 V 6 3 77636. NA
#> 2 2 2004 R 15 1 31974. NA
#> 3 3 2004 D NA 10 21312. NA
#> 4 4 2004 H 9 10 91251. NA
#> 5 5 2004 X 3 7 60838. 6400
#> 6 6 2004 P 15 1 84698. 6400
#> 7 7 2004 I 18 7 85907. 6400
#> 8 8 2004 X 15 9 125375. NA
#> 9 9 2004 A 12 10 193400. NA
#> 10 10 2004 P 8 0 125572. 6400
#> # ℹ 50 more rows
hospital
#> # A tibble: 16 × 3
#> id date icd
#> <dbl> <date> <glue>
#> 1 4 2009-12-25 F31
#> 2 4 2004-01-25 F32
#> 3 4 2009-04-12 F38
#> 4 12 2008-05-02 F37
#> 5 12 2008-11-25 F31
#> 6 12 2009-03-24 F33
#> 7 2 2005-11-05 F31
#> 8 9 2004-11-11 F30
#> 9 9 2009-09-24 F37
#> 10 9 2004-12-18 F31
#> 11 9 2009-07-01 F31
#> 12 9 2009-08-03 F36
#> 13 9 2006-01-01 F32
#> 14 9 2009-01-12 F30
#> 15 9 2009-12-06 F32
#> 16 9 2009-01-06 F33
tax_survey <- tax %>%
mutate(year_p1 = year + 1,
.after = year) %>%
left_join(y = survey,
by = c("id",
"year_p1" = "year"))
tax_survey_hospital <- tax_survey %>%
left_join(hospital,
by = "id",
relationship = "many-to-many") %>%
filter(!is.na(icd)) %>%
filter(!is.na(age))
income_age_summary <- tax_survey_hospital %>%
group_by(year) %>%
summarise(
across(.cols = c(income, age),
.fns = list(mean = mean,
med = median,
sd = sd,
iqr = IQR))
)
income_age_summary %>%
pivot_longer(
cols = -year,
names_to = "variable",
values_to = "stat"
)
#> # A tibble: 32 × 3
#> year variable stat
#> <int> <chr> <dbl>
#> 1 2004 income_mean 157410.
#> 2 2004 income_med 193400.
#> 3 2004 income_sd 58111.
#> 4 2004 income_iqr 102149.
#> 5 2004 age_mean 40.2
#> 6 2004 age_med 35
#> 7 2004 age_sd 9.61
#> 8 2004 age_iqr 1
#> 9 2006 income_mean 91190.
#> 10 2006 income_med 103871.
#> # ℹ 22 more rows
# ambulatory care
Created on 2023-10-16 with reprex v2.0.2