Skip to content

Instantly share code, notes, and snippets.

@JosepER
Created May 22, 2023 09:43
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JosepER/4bd2127e2bf3731e64cdf262aefe778a to your computer and use it in GitHub Desktop.
Save JosepER/4bd2127e2bf3731e64cdf262aefe778a to your computer and use it in GitHub Desktop.
# Summer workshop - Data preparation in R --------
# COPY TO LISSY: ----------------------------------------------------------
library(tidyverse)
library(magrittr)
# install.packages("Hmisc")
# ** my functions ---------------------------------------------------------
is.all.na.or.zero <- function(x){
return(length(x[!is.na(x) & x != 0]) == 0)
}
bottom_code_with_iqr_pfile <- function(file, file_name, variable, times = 3, variable_level = NULL, weight = NULL){
assertthat::assert_that(variable %in% names(file),
msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'."))
var_ <- file[[variable]]
if(!is.all.na.or.zero(var_)){ # need to define function
if(is.null(weight) & variable_level %in% c("person", "p")){
weight_var <- "pwgt"
}else if(is.null(weight) & variable_level %in% c("household", "h")){
weight_var <- "hwgt"
}else{
weight_var <- weight
}
assertthat::assert_that(all(var_ >= 0 | is.na(var_)),
msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values."))
assertthat::assert_that(weight_var %in% names(file),
msg = glue::glue("'{weight_var}' could not be found in '{file_name}'."))
if(variable_level %in% c("household", "h")){
assertthat::assert_that("relation" %in% names(file),
msg = glue::glue("'relation' could not be found in '{file_name}'."))
}
log_var <- dplyr::if_else(var_ > 0,
true = log(var_),
false = 0,
missing = NA_real_)
index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files
if(variable_level == "household"){
index_hh_head <- !is.na(file[["relation"]]) & (file[["relation"]] == 1000 | file[["relation"]] == "[1000]head")
log_var_for_iqr_computation <- log_var[index_valid_weights & index_hh_head]
weights_for_iqr_computation <- file[index_valid_weights & index_hh_head, weight_var, drop = TRUE]
}else{
log_var_for_iqr_computation <- log_var[index_valid_weights]
weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE]
}
if(!is.all.na.or.zero(log_var_for_iqr_computation)){
log_third_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation,
weights = weights_for_iqr_computation,
probs = 0.75)
log_first_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation,
weights = weights_for_iqr_computation,
probs = 0.25)
log_times_iqr <- (log_third_quartile - log_first_quartile) * times
var_ <- dplyr::if_else(log(var_) < (log_first_quartile - log_times_iqr),
true = exp(log_first_quartile - log_times_iqr),
false = var_)
}
file[[variable]] <- var_
}
return(file)
}
top_code_with_iqr_pfile <- function(file, file_name, variable, times = 3, variable_level = NULL, weight = NULL){
assertthat::assert_that(variable %in% names(file),
msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'."))
var_ <- file[[variable]]
if(!is.all.na.or.zero(var_)){
if(is.null(variable_level)){
variable_level <- check_variable_level(variable)
}else{
assertthat::assert_that(variable_level %in% c("person", "household", "p", "h"),
msg = "Argument 'variable_level' can only take 'person', 'p', 'household' or 'h' as values.")
}
if(is.null(weight) & variable_level %in% c("person", "p")){
weight_var <- "pwgt"
}else if(is.null(weight) & variable_level %in% c("household", "h")){
weight_var <- "hwgt"
}else{
weight_var <- weight
}
assertthat::assert_that(all(var_ >= 0 | is.na(var_)),
msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values."))
assertthat::assert_that(weight_var %in% names(file),
msg = glue::glue("'{weight_var}' could not be found in '{file_name}'."))
if(variable_level %in% c("household", "h")){
assertthat::assert_that("relation" %in% names(file),
msg = glue::glue("'relation' could not be found in '{file_name}'."))
}
log_var <- dplyr::if_else(var_ > 0,
true = log(var_),
false = 0,
missing = NA_real_)
index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files
if(variable_level == "household"){
index_hh_head <- !is.na(file[["relation"]]) & (file[["relation"]] == 1000 | file[["relation"]] == "[1000]head")
log_var_for_iqr_computation <- log_var[index_valid_weights & index_hh_head]
weights_for_iqr_computation <- file[index_valid_weights & index_hh_head, weight_var, drop = TRUE]
}else{
log_var_for_iqr_computation <- log_var[index_valid_weights]
weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE]
}
if(!is.all.na.or.zero(log_var_for_iqr_computation)){
log_third_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation,
weights = weights_for_iqr_computation,
probs = 0.75)
log_first_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation,
weights = weights_for_iqr_computation,
probs = 0.25)
log_times_iqr <- (log_third_quartile - log_first_quartile) * times
var_ <- dplyr::if_else(log(var_) > (log_third_quartile + log_times_iqr),
true = exp(log_third_quartile + log_times_iqr),
false = var_)
}
file[[variable]] <- var_
}
return(file)
}
# ** read files into R ----------------------------------------------------
# **** read h-level files -------------------------------------------------
# to read a single file:
de16h <- read.LIS("de16h")
class(de16h)
# to read multiple files at once:
names_files_h <- c("at16h", "ca17h", "de16h", "nl17h")
files_h <- purrr::map(.x = names_files_h,
~read.LIS(.x, vars = c("hid", "dhi", "nhhmem", "hwgt")))
names(files_h) <- names_files_h
class(files_h)
names(files_h)
class(files_h[["at16h"]])
# **** read p-level files -------------------------------------------------
names_files_p <- c("at16p", "ca17p", "de16p", "nl17p")
files_p <- purrr::map(.x = names_files_p,
~read.LIS(.x, vars = c("hid", "pid", "pi11", "age", "relation", "pwgt")))
# **** merge h and p-level files ------------------------------------------
merged_files <- purrr::map2(.x = files_p,
.y = files_h,
.f = ~left_join(.x, .y, by = "hid"))
names(merged_files) <- c("at16", "ca17", "de16", "nl17")
# Checking the data -------------------------------------------------------
# It is always important to always check descriptive statistics and basic
# characteristics of the loaded datasets.
merged_files %>%
bind_rows(.id = "file") %>%
group_by(file) %>%
summarize(min_dhi = min(dhi, na.rm = TRUE),
max_dhi = max(dhi, na.rm = TRUE),
nas_dhi = sum(is.na(dhi)),
`0s_dhi` = sum(dhi == 0, na.rm = TRUE),
min_pi11 = min(pi11, na.rm = TRUE),
max_pi11 = max(pi11, na.rm = TRUE),
nas_pi11 = sum(is.na(pi11)),
`0s_pi11` = sum(pi11 == 0, na.rm = TRUE))
# Data preparation --------------------------------------------------------
# ** recode negative values to 0 ------------------------------------------
# for 'dhi'
merged_files %>%
bind_rows(.id = "file") %>%
group_by(file) %>%
summarize(min_dhi = min(dhi, na.rm = TRUE),
`0s_dhi` = sum(dhi == 0, na.rm = TRUE),
nas_dhi = sum(is.na(dhi)))
merged_files %<>%
purrr::map(~mutate(.x, dhi = if_else(dhi < 0 & !is.na(dhi) ,
true = 0,
false = as.numeric(dhi) )))
merged_files %>%
bind_rows(.id = "file") %>%
group_by(file) %>%
summarize(min_dhi = min(dhi, na.rm = TRUE),
`0s_dhi` = sum(dhi == 0, na.rm = TRUE),
nas_dhi = sum(is.na(dhi)))
# ** recode negative values and 0s to NA ----------------------------------
# for 'pi11'
merged_files %>%
bind_rows(.id = "file") %>%
group_by(file) %>%
summarize(min_pi11 = min(pi11, na.rm = TRUE),
`0s_pi11` = sum(pi11 == 0, na.rm = TRUE),
nas_pi11 = sum(is.na(pi11)))
merged_files %<>%
purrr::map(~mutate(.x, pi11 = if_else( (pi11 > 0) & !is.na(pi11),
true = pi11,
false = NA_real_,
missing = NA_real_)))
merged_files %>%
bind_rows(.id = "file") %>%
group_by(file) %>%
summarize(min_pi11 = min(pi11, na.rm = TRUE),
`0s_pi11` = sum(pi11 == 0, na.rm = TRUE),
nas_pi11 = sum(is.na(pi11)))
# ** top and bottom code with IQR -----------------------------------------
# for 'dhi'
# on a single file it would look like this:
nl17 <- merged_files[["nl17"]]
nl17$dhi_original <- nl17$dhi
nl17 <- top_code_with_iqr_pfile(file = nl17,
file_name = "nl17",
variable = "dhi",
variable_level = "household",
times = 3,
weight = "hwgt")
nl17 %>%
summarize(max_dhi_before = max(dhi_original, na.rm = TRUE),
max_dhi_after = max(dhi, na.rm = TRUE))
# on multiple files:
# before top and bottom coding
merged_files %>%
bind_rows(.id = "file") %>%
group_by(file) %>%
summarize(min_dhi = min(dhi, na.rm = TRUE),
max_dhi = max(dhi, na.rm = TRUE))
merged_files <- purrr::imap(merged_files,
~top_code_with_iqr_pfile(file = .x,
file_name = .y,
variable = "dhi",
variable_level = "household",
times = 3,
weight = "hwgt"))
merged_files <- purrr::imap(merged_files,
~bottom_code_with_iqr_pfile(file = .x,
file_name = .y,
variable = "dhi",
variable_level = "household",
times = 3,
weight = "hwgt"))
# after top and bottom coding
merged_files %>%
bind_rows(.id = "file") %>%
group_by(file) %>%
summarize(min_dhi = min(dhi, na.rm = TRUE),
max_dhi = max(dhi, na.rm = TRUE))
# we can do the same for 'pi11'
merged_files <- purrr::imap(merged_files,
~top_code_with_iqr_pfile(file = .x,
file_name = .y,
variable = "pi11",
variable_level = "person",
times = 3,
weight = "pwgt")) # <- p-level variables should be weighted using 'pwgt'
merged_files <- purrr::imap(merged_files,
~bottom_code_with_iqr_pfile(file = .x,
file_name = .y,
variable = "pi11",
variable_level = "person",
times = 3,
weight = "pwgt")) # <- p-level variables should be weighted using 'pwgt'
# # ** bind rows for all datasets -----------------------------------------
# from a list of data.frames to a single data.frame
merged_files <- bind_rows(merged_files, .id = "dataset")
# ** equivalise by hh size ------------------------------------------------
merged_files %<>%
mutate(dhi_sqrt = dhi/(nhhmem^0.5),
dhi_pcapita = dhi/(nhhmem))
# ** adjust by cpi and ppp ------------------------------------------------
# can be found here:
# https://www.lisdatacenter.org/resources/ppp-deflators/
# reminder for LWS datasets: the reference year for income variables
# might not be the dataset year.
lisppp <- tibble::tribble(
~dataset, ~lisppp,
#--|--|----
"at16", 0.82040614,
"ca17", 1.298456,
"de16", 0.787293827,
"nl17", 0.860976
)
merged_files %<>%
left_join(lisppp, by = "dataset")
merged_files %<>%
mutate(dhi_ppp = dhi_sqrt/lisppp,
pi11_ppp = pi11/lisppp)
# ** restrict age for pi11 ------------------------------------------------
merged_files %<>%
mutate(pi11_ppp = if_else(between(age, 16, 64),
true = pi11_ppp,
false = NA_real_))
# Compute estimates -------------------------------------------------------
# more on this in the next session.
merged_files %>%
group_by(dataset) %>%
summarise(mean_dhi = Hmisc::wtd.mean(dhi_ppp, na.rm = TRUE),
median_dhi = Hmisc::wtd.quantile(dhi_ppp, weights = hwgt, probs = 0.5, na.rm = TRUE),
mean_pi11 = Hmisc::wtd.mean(pi11_ppp, na.rm = TRUE),
median_pi11 = Hmisc::wtd.quantile(pi11_ppp, weights = hwgt, probs = 0.5, na.rm = TRUE))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment