Created
May 22, 2023 09:43
-
-
Save JosepER/4bd2127e2bf3731e64cdf262aefe778a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Summer workshop - Data preparation in R -------- | |
# COPY TO LISSY: ---------------------------------------------------------- | |
library(tidyverse) | |
library(magrittr) | |
# install.packages("Hmisc") | |
# ** my functions --------------------------------------------------------- | |
is.all.na.or.zero <- function(x){ | |
return(length(x[!is.na(x) & x != 0]) == 0) | |
} | |
bottom_code_with_iqr_pfile <- function(file, file_name, variable, times = 3, variable_level = NULL, weight = NULL){ | |
assertthat::assert_that(variable %in% names(file), | |
msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'.")) | |
var_ <- file[[variable]] | |
if(!is.all.na.or.zero(var_)){ # need to define function | |
if(is.null(weight) & variable_level %in% c("person", "p")){ | |
weight_var <- "pwgt" | |
}else if(is.null(weight) & variable_level %in% c("household", "h")){ | |
weight_var <- "hwgt" | |
}else{ | |
weight_var <- weight | |
} | |
assertthat::assert_that(all(var_ >= 0 | is.na(var_)), | |
msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values.")) | |
assertthat::assert_that(weight_var %in% names(file), | |
msg = glue::glue("'{weight_var}' could not be found in '{file_name}'.")) | |
if(variable_level %in% c("household", "h")){ | |
assertthat::assert_that("relation" %in% names(file), | |
msg = glue::glue("'relation' could not be found in '{file_name}'.")) | |
} | |
log_var <- dplyr::if_else(var_ > 0, | |
true = log(var_), | |
false = 0, | |
missing = NA_real_) | |
index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files | |
if(variable_level == "household"){ | |
index_hh_head <- !is.na(file[["relation"]]) & (file[["relation"]] == 1000 | file[["relation"]] == "[1000]head") | |
log_var_for_iqr_computation <- log_var[index_valid_weights & index_hh_head] | |
weights_for_iqr_computation <- file[index_valid_weights & index_hh_head, weight_var, drop = TRUE] | |
}else{ | |
log_var_for_iqr_computation <- log_var[index_valid_weights] | |
weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE] | |
} | |
if(!is.all.na.or.zero(log_var_for_iqr_computation)){ | |
log_third_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation, | |
weights = weights_for_iqr_computation, | |
probs = 0.75) | |
log_first_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation, | |
weights = weights_for_iqr_computation, | |
probs = 0.25) | |
log_times_iqr <- (log_third_quartile - log_first_quartile) * times | |
var_ <- dplyr::if_else(log(var_) < (log_first_quartile - log_times_iqr), | |
true = exp(log_first_quartile - log_times_iqr), | |
false = var_) | |
} | |
file[[variable]] <- var_ | |
} | |
return(file) | |
} | |
top_code_with_iqr_pfile <- function(file, file_name, variable, times = 3, variable_level = NULL, weight = NULL){ | |
assertthat::assert_that(variable %in% names(file), | |
msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'.")) | |
var_ <- file[[variable]] | |
if(!is.all.na.or.zero(var_)){ | |
if(is.null(variable_level)){ | |
variable_level <- check_variable_level(variable) | |
}else{ | |
assertthat::assert_that(variable_level %in% c("person", "household", "p", "h"), | |
msg = "Argument 'variable_level' can only take 'person', 'p', 'household' or 'h' as values.") | |
} | |
if(is.null(weight) & variable_level %in% c("person", "p")){ | |
weight_var <- "pwgt" | |
}else if(is.null(weight) & variable_level %in% c("household", "h")){ | |
weight_var <- "hwgt" | |
}else{ | |
weight_var <- weight | |
} | |
assertthat::assert_that(all(var_ >= 0 | is.na(var_)), | |
msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values.")) | |
assertthat::assert_that(weight_var %in% names(file), | |
msg = glue::glue("'{weight_var}' could not be found in '{file_name}'.")) | |
if(variable_level %in% c("household", "h")){ | |
assertthat::assert_that("relation" %in% names(file), | |
msg = glue::glue("'relation' could not be found in '{file_name}'.")) | |
} | |
log_var <- dplyr::if_else(var_ > 0, | |
true = log(var_), | |
false = 0, | |
missing = NA_real_) | |
index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files | |
if(variable_level == "household"){ | |
index_hh_head <- !is.na(file[["relation"]]) & (file[["relation"]] == 1000 | file[["relation"]] == "[1000]head") | |
log_var_for_iqr_computation <- log_var[index_valid_weights & index_hh_head] | |
weights_for_iqr_computation <- file[index_valid_weights & index_hh_head, weight_var, drop = TRUE] | |
}else{ | |
log_var_for_iqr_computation <- log_var[index_valid_weights] | |
weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE] | |
} | |
if(!is.all.na.or.zero(log_var_for_iqr_computation)){ | |
log_third_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation, | |
weights = weights_for_iqr_computation, | |
probs = 0.75) | |
log_first_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation, | |
weights = weights_for_iqr_computation, | |
probs = 0.25) | |
log_times_iqr <- (log_third_quartile - log_first_quartile) * times | |
var_ <- dplyr::if_else(log(var_) > (log_third_quartile + log_times_iqr), | |
true = exp(log_third_quartile + log_times_iqr), | |
false = var_) | |
} | |
file[[variable]] <- var_ | |
} | |
return(file) | |
} | |
# ** read files into R ---------------------------------------------------- | |
# **** read h-level files ------------------------------------------------- | |
# to read a single file: | |
de16h <- read.LIS("de16h") | |
class(de16h) | |
# to read multiple files at once: | |
names_files_h <- c("at16h", "ca17h", "de16h", "nl17h") | |
files_h <- purrr::map(.x = names_files_h, | |
~read.LIS(.x, vars = c("hid", "dhi", "nhhmem", "hwgt"))) | |
names(files_h) <- names_files_h | |
class(files_h) | |
names(files_h) | |
class(files_h[["at16h"]]) | |
# **** read p-level files ------------------------------------------------- | |
names_files_p <- c("at16p", "ca17p", "de16p", "nl17p") | |
files_p <- purrr::map(.x = names_files_p, | |
~read.LIS(.x, vars = c("hid", "pid", "pi11", "age", "relation", "pwgt"))) | |
# **** merge h and p-level files ------------------------------------------ | |
merged_files <- purrr::map2(.x = files_p, | |
.y = files_h, | |
.f = ~left_join(.x, .y, by = "hid")) | |
names(merged_files) <- c("at16", "ca17", "de16", "nl17") | |
# Checking the data ------------------------------------------------------- | |
# It is always important to always check descriptive statistics and basic | |
# characteristics of the loaded datasets. | |
merged_files %>% | |
bind_rows(.id = "file") %>% | |
group_by(file) %>% | |
summarize(min_dhi = min(dhi, na.rm = TRUE), | |
max_dhi = max(dhi, na.rm = TRUE), | |
nas_dhi = sum(is.na(dhi)), | |
`0s_dhi` = sum(dhi == 0, na.rm = TRUE), | |
min_pi11 = min(pi11, na.rm = TRUE), | |
max_pi11 = max(pi11, na.rm = TRUE), | |
nas_pi11 = sum(is.na(pi11)), | |
`0s_pi11` = sum(pi11 == 0, na.rm = TRUE)) | |
# Data preparation -------------------------------------------------------- | |
# ** recode negative values to 0 ------------------------------------------ | |
# for 'dhi' | |
merged_files %>% | |
bind_rows(.id = "file") %>% | |
group_by(file) %>% | |
summarize(min_dhi = min(dhi, na.rm = TRUE), | |
`0s_dhi` = sum(dhi == 0, na.rm = TRUE), | |
nas_dhi = sum(is.na(dhi))) | |
merged_files %<>% | |
purrr::map(~mutate(.x, dhi = if_else(dhi < 0 & !is.na(dhi) , | |
true = 0, | |
false = as.numeric(dhi) ))) | |
merged_files %>% | |
bind_rows(.id = "file") %>% | |
group_by(file) %>% | |
summarize(min_dhi = min(dhi, na.rm = TRUE), | |
`0s_dhi` = sum(dhi == 0, na.rm = TRUE), | |
nas_dhi = sum(is.na(dhi))) | |
# ** recode negative values and 0s to NA ---------------------------------- | |
# for 'pi11' | |
merged_files %>% | |
bind_rows(.id = "file") %>% | |
group_by(file) %>% | |
summarize(min_pi11 = min(pi11, na.rm = TRUE), | |
`0s_pi11` = sum(pi11 == 0, na.rm = TRUE), | |
nas_pi11 = sum(is.na(pi11))) | |
merged_files %<>% | |
purrr::map(~mutate(.x, pi11 = if_else( (pi11 > 0) & !is.na(pi11), | |
true = pi11, | |
false = NA_real_, | |
missing = NA_real_))) | |
merged_files %>% | |
bind_rows(.id = "file") %>% | |
group_by(file) %>% | |
summarize(min_pi11 = min(pi11, na.rm = TRUE), | |
`0s_pi11` = sum(pi11 == 0, na.rm = TRUE), | |
nas_pi11 = sum(is.na(pi11))) | |
# ** top and bottom code with IQR ----------------------------------------- | |
# for 'dhi' | |
# on a single file it would look like this: | |
nl17 <- merged_files[["nl17"]] | |
nl17$dhi_original <- nl17$dhi | |
nl17 <- top_code_with_iqr_pfile(file = nl17, | |
file_name = "nl17", | |
variable = "dhi", | |
variable_level = "household", | |
times = 3, | |
weight = "hwgt") | |
nl17 %>% | |
summarize(max_dhi_before = max(dhi_original, na.rm = TRUE), | |
max_dhi_after = max(dhi, na.rm = TRUE)) | |
# on multiple files: | |
# before top and bottom coding | |
merged_files %>% | |
bind_rows(.id = "file") %>% | |
group_by(file) %>% | |
summarize(min_dhi = min(dhi, na.rm = TRUE), | |
max_dhi = max(dhi, na.rm = TRUE)) | |
merged_files <- purrr::imap(merged_files, | |
~top_code_with_iqr_pfile(file = .x, | |
file_name = .y, | |
variable = "dhi", | |
variable_level = "household", | |
times = 3, | |
weight = "hwgt")) | |
merged_files <- purrr::imap(merged_files, | |
~bottom_code_with_iqr_pfile(file = .x, | |
file_name = .y, | |
variable = "dhi", | |
variable_level = "household", | |
times = 3, | |
weight = "hwgt")) | |
# after top and bottom coding | |
merged_files %>% | |
bind_rows(.id = "file") %>% | |
group_by(file) %>% | |
summarize(min_dhi = min(dhi, na.rm = TRUE), | |
max_dhi = max(dhi, na.rm = TRUE)) | |
# we can do the same for 'pi11' | |
merged_files <- purrr::imap(merged_files, | |
~top_code_with_iqr_pfile(file = .x, | |
file_name = .y, | |
variable = "pi11", | |
variable_level = "person", | |
times = 3, | |
weight = "pwgt")) # <- p-level variables should be weighted using 'pwgt' | |
merged_files <- purrr::imap(merged_files, | |
~bottom_code_with_iqr_pfile(file = .x, | |
file_name = .y, | |
variable = "pi11", | |
variable_level = "person", | |
times = 3, | |
weight = "pwgt")) # <- p-level variables should be weighted using 'pwgt' | |
# # ** bind rows for all datasets ----------------------------------------- | |
# from a list of data.frames to a single data.frame | |
merged_files <- bind_rows(merged_files, .id = "dataset") | |
# ** equivalise by hh size ------------------------------------------------ | |
merged_files %<>% | |
mutate(dhi_sqrt = dhi/(nhhmem^0.5), | |
dhi_pcapita = dhi/(nhhmem)) | |
# ** adjust by cpi and ppp ------------------------------------------------ | |
# can be found here: | |
# https://www.lisdatacenter.org/resources/ppp-deflators/ | |
# reminder for LWS datasets: the reference year for income variables | |
# might not be the dataset year. | |
lisppp <- tibble::tribble( | |
~dataset, ~lisppp, | |
#--|--|---- | |
"at16", 0.82040614, | |
"ca17", 1.298456, | |
"de16", 0.787293827, | |
"nl17", 0.860976 | |
) | |
merged_files %<>% | |
left_join(lisppp, by = "dataset") | |
merged_files %<>% | |
mutate(dhi_ppp = dhi_sqrt/lisppp, | |
pi11_ppp = pi11/lisppp) | |
# ** restrict age for pi11 ------------------------------------------------ | |
merged_files %<>% | |
mutate(pi11_ppp = if_else(between(age, 16, 64), | |
true = pi11_ppp, | |
false = NA_real_)) | |
# Compute estimates ------------------------------------------------------- | |
# more on this in the next session. | |
merged_files %>% | |
group_by(dataset) %>% | |
summarise(mean_dhi = Hmisc::wtd.mean(dhi_ppp, na.rm = TRUE), | |
median_dhi = Hmisc::wtd.quantile(dhi_ppp, weights = hwgt, probs = 0.5, na.rm = TRUE), | |
mean_pi11 = Hmisc::wtd.mean(pi11_ppp, na.rm = TRUE), | |
median_pi11 = Hmisc::wtd.quantile(pi11_ppp, weights = hwgt, probs = 0.5, na.rm = TRUE)) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment