njtierney/demo-join.md

## demo-join.md

      
    Raw
  

              demo-join.md
            
          
    library(tidyverse)

# 4 data sets
# survey
n <- 100
create_survey <- function(n, year, id = 1:n){
  tibble(
  id = id,
  year = year,
  province = sample(1:9, size = n, replace = TRUE),
  age = sample(18:65, size = n, replace = TRUE),
  sex = sample(c("m", "f"), size = n, replace = TRUE)
  )
}

create_survey(n = 10, year = 2004)
#> # A tibble: 10 × 5
#>       id  year province   age sex  
#>    <int> <dbl>    <int> <int> <chr>
#>  1     1  2004        4    28 m    
#>  2     2  2004        2    46 f    
#>  3     3  2004        6    22 m    
#>  4     4  2004        7    22 f    
#>  5     5  2004        7    51 f    
#>  6     6  2004        4    62 f    
#>  7     7  2004        1    62 f    
#>  8     8  2004        7    37 f    
#>  9     9  2004        2    23 m    
#> 10    10  2004        9    38 m

survey <- bind_rows(
  create_survey(n = 10, year = 2005),
  create_survey(n = 10, year = 2007),
  create_survey(n = 10, year = 2008),
  create_survey(n = 10, year = 2009)
)
  
survey
#> # A tibble: 40 × 5
#>       id  year province   age sex  
#>    <int> <dbl>    <int> <int> <chr>
#>  1     1  2005        8    46 m    
#>  2     2  2005        9    42 m    
#>  3     3  2005        8    39 f    
#>  4     4  2005        7    41 m    
#>  5     5  2005        6    45 m    
#>  6     6  2005        6    52 f    
#>  7     7  2005        5    19 m    
#>  8     8  2005        3    44 f    
#>  9     9  2005        2    53 f    
#> 10    10  2005        9    44 f    
#> # ℹ 30 more rows
ca_postcode <- function(size){
  
}

child_age <- function(size){
  sample(c(NA, 0:18), 
         size = size, 
         replace = TRUE)
}

# tax
create_tax <- function(n, year){
  tibble(
  id = 1:n,
  year = year,
  postal_code = sample(LETTERS, size = n, replace = TRUE),
  child1_ages = child_age(size = n),
  child2_ages = child_age(size = n),
  income = abs(rnorm(n = n, mean = 70000, sd = 50000)),
  child_benefit_amount = sample(c(NA, 6400), size = n, replace = TRUE)
) 
}

tax <- map_dfr(.x = 2004:2019,
        .f = function(year) create_tax(10, year = year))

tax
#> # A tibble: 160 × 7
#>       id  year postal_code child1_ages child2_ages  income child_benefit_amount
#>    <int> <int> <chr>             <int>       <int>   <dbl>                <dbl>
#>  1     1  2004 L                     1           5  21786.                   NA
#>  2     2  2004 Q                    11          10  46479.                 6400
#>  3     3  2004 Z                    11           8  50658.                 6400
#>  4     4  2004 G                     0           0 161264.                 6400
#>  5     5  2004 J                     0           0 126375.                   NA
#>  6     6  2004 A                    15           3  81539.                   NA
#>  7     7  2004 E                    NA          15 130778.                 6400
#>  8     8  2004 F                    15          10  81270.                   NA
#>  9     9  2004 C                     9          17  22745.                 6400
#> 10    10  2004 R                    14           7  18867.                 6400
#> # ℹ 150 more rows

sample_date <- function(start, end, size){
  sample(
  seq(as.Date(start), 
      as.Date(end),
      by = "days"),
  replace = TRUE,
  size = size
  )
}

icd_codes <- glue::glue("F{30:39}")

# hospital discharges
create_hospital <- function(n, id = 1:n){
  tibble(
  id = id,
  date = sample_date("2004-01-01", "2009-12-31", size = n),
  icd = sample(icd_codes, size = n, replace = TRUE)
)
}

survey <- bind_rows(
  create_survey(n = 10, year = 2005),
  create_survey(n = 10, year = 2007),
  create_survey(n = 10, year = 2008),
  create_survey(n = 10, year = 2009)
)

tax <- map_dfr(.x = 2004:2009,
               .f = function(year) create_tax(10, year = year))

hospital <- bind_rows(
  create_hospital(n = 3, id = 4),
  create_hospital(n = 3, id = 12),
  create_hospital(n = 1, id = 2),
  create_hospital(n = 9, id = 9)
)

survey
#> # A tibble: 40 × 5
#>       id  year province   age sex  
#>    <int> <dbl>    <int> <int> <chr>
#>  1     1  2005        2    42 f    
#>  2     2  2005        8    36 f    
#>  3     3  2005        6    46 m    
#>  4     4  2005        8    57 m    
#>  5     5  2005        4    32 f    
#>  6     6  2005        7    60 f    
#>  7     7  2005        1    57 f    
#>  8     8  2005        1    65 m    
#>  9     9  2005        1    35 f    
#> 10    10  2005        7    62 f    
#> # ℹ 30 more rows
tax
#> # A tibble: 60 × 7
#>       id  year postal_code child1_ages child2_ages  income child_benefit_amount
#>    <int> <int> <chr>             <int>       <int>   <dbl>                <dbl>
#>  1     1  2004 V                     6           3  77636.                   NA
#>  2     2  2004 R                    15           1  31974.                   NA
#>  3     3  2004 D                    NA          10  21312.                   NA
#>  4     4  2004 H                     9          10  91251.                   NA
#>  5     5  2004 X                     3           7  60838.                 6400
#>  6     6  2004 P                    15           1  84698.                 6400
#>  7     7  2004 I                    18           7  85907.                 6400
#>  8     8  2004 X                    15           9 125375.                   NA
#>  9     9  2004 A                    12          10 193400.                   NA
#> 10    10  2004 P                     8           0 125572.                 6400
#> # ℹ 50 more rows
hospital
#> # A tibble: 16 × 3
#>       id date       icd   
#>    <dbl> <date>     <glue>
#>  1     4 2009-12-25 F31   
#>  2     4 2004-01-25 F32   
#>  3     4 2009-04-12 F38   
#>  4    12 2008-05-02 F37   
#>  5    12 2008-11-25 F31   
#>  6    12 2009-03-24 F33   
#>  7     2 2005-11-05 F31   
#>  8     9 2004-11-11 F30   
#>  9     9 2009-09-24 F37   
#> 10     9 2004-12-18 F31   
#> 11     9 2009-07-01 F31   
#> 12     9 2009-08-03 F36   
#> 13     9 2006-01-01 F32   
#> 14     9 2009-01-12 F30   
#> 15     9 2009-12-06 F32   
#> 16     9 2009-01-06 F33

tax_survey <- tax %>% 
  mutate(year_p1 = year + 1,
         .after = year) %>% 
  left_join(y = survey,
            by = c("id",
                   "year_p1" = "year"))

tax_survey_hospital <- tax_survey %>% 
  left_join(hospital, 
            by = "id",
            relationship = "many-to-many")  %>% 
  filter(!is.na(icd)) %>% 
  filter(!is.na(age))

income_age_summary <- tax_survey_hospital %>% 
  group_by(year) %>% 
  summarise(
    across(.cols = c(income, age),
           .fns = list(mean = mean,
                       med = median,
                       sd = sd,
                       iqr = IQR))
  )

income_age_summary %>% 
  pivot_longer(
    cols = -year,
    names_to = "variable",
    values_to = "stat"
  )
#> # A tibble: 32 × 3
#>     year variable         stat
#>    <int> <chr>           <dbl>
#>  1  2004 income_mean 157410.  
#>  2  2004 income_med  193400.  
#>  3  2004 income_sd    58111.  
#>  4  2004 income_iqr  102149.  
#>  5  2004 age_mean        40.2 
#>  6  2004 age_med         35   
#>  7  2004 age_sd           9.61
#>  8  2004 age_iqr          1   
#>  9  2006 income_mean  91190.  
#> 10  2006 income_med  103871.  
#> # ℹ 22 more rows

# ambulatory care
^{Created on 2023-10-16 with reprex v2.0.2}