JosepER/session_day_1.R

## session_day_1.R

# Summer workshop - Data preparation in R --------


# COPY TO LISSY: ----------------------------------------------------------

library(tidyverse)
library(magrittr)
# install.packages("Hmisc")

# ** my functions ---------------------------------------------------------

is.all.na.or.zero <- function(x){

  return(length(x[!is.na(x) & x != 0]) == 0)

}

bottom_code_with_iqr_pfile <- function(file, file_name, variable, times = 3, variable_level = NULL, weight = NULL){

  assertthat::assert_that(variable %in% names(file),
                          msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'."))

  var_ <- file[[variable]]

  if(!is.all.na.or.zero(var_)){ # need to define function

    if(is.null(weight) & variable_level %in% c("person", "p")){

      weight_var <- "pwgt"

    }else if(is.null(weight) & variable_level %in% c("household", "h")){

      weight_var <- "hwgt"

    }else{

      weight_var <- weight

    }

    assertthat::assert_that(all(var_ >= 0 | is.na(var_)),
                            msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values."))

    assertthat::assert_that(weight_var %in% names(file),
                            msg = glue::glue("'{weight_var}' could not be found in '{file_name}'."))

    if(variable_level %in% c("household", "h")){

      assertthat::assert_that("relation" %in% names(file),
                              msg = glue::glue("'relation' could not be found in '{file_name}'."))

    }

    log_var <- dplyr::if_else(var_ > 0,
                              true = log(var_),
                              false = 0,
                              missing = NA_real_)

    index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files


    if(variable_level == "household"){

      index_hh_head <- !is.na(file[["relation"]]) & (file[["relation"]] == 1000 | file[["relation"]] == "[1000]head")

      log_var_for_iqr_computation <- log_var[index_valid_weights & index_hh_head]
      weights_for_iqr_computation <- file[index_valid_weights & index_hh_head, weight_var, drop = TRUE]

    }else{

      log_var_for_iqr_computation <- log_var[index_valid_weights]
      weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE]

    }


    if(!is.all.na.or.zero(log_var_for_iqr_computation)){

      log_third_quartile <-  Hmisc::wtd.quantile(log_var_for_iqr_computation,
                                         weights = weights_for_iqr_computation,
                                         probs = 0.75)

      log_first_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation,
                                         weights = weights_for_iqr_computation,
                                         probs = 0.25)

      log_times_iqr <- (log_third_quartile - log_first_quartile) * times

      var_ <- dplyr::if_else(log(var_) < (log_first_quartile - log_times_iqr),
                             true = exp(log_first_quartile - log_times_iqr),
                             false = var_)

    }

    file[[variable]] <- var_

  }

  return(file)

}

top_code_with_iqr_pfile <- function(file, file_name, variable, times = 3, variable_level = NULL, weight = NULL){

  assertthat::assert_that(variable %in% names(file),
                          msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'."))

  var_ <- file[[variable]]

  if(!is.all.na.or.zero(var_)){

    if(is.null(variable_level)){

      variable_level <- check_variable_level(variable)

    }else{

      assertthat::assert_that(variable_level %in% c("person", "household", "p", "h"),
                              msg = "Argument 'variable_level' can only take 'person', 'p', 'household' or 'h' as values.")
    }


    if(is.null(weight) & variable_level %in% c("person", "p")){

      weight_var <- "pwgt"

    }else if(is.null(weight) & variable_level %in% c("household", "h")){

      weight_var <- "hwgt"

    }else{

      weight_var <- weight

    }

    assertthat::assert_that(all(var_ >= 0 | is.na(var_)),
                            msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values."))

    assertthat::assert_that(weight_var %in% names(file),
                            msg = glue::glue("'{weight_var}' could not be found in '{file_name}'."))

    if(variable_level %in% c("household", "h")){

      assertthat::assert_that("relation" %in% names(file),
                              msg = glue::glue("'relation' could not be found in '{file_name}'."))

    }

    log_var <- dplyr::if_else(var_ > 0,
                              true = log(var_),
                              false = 0,
                              missing = NA_real_)

    index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files


    if(variable_level == "household"){

      index_hh_head <- !is.na(file[["relation"]]) & (file[["relation"]] == 1000 | file[["relation"]] == "[1000]head")

      log_var_for_iqr_computation <- log_var[index_valid_weights & index_hh_head]
      weights_for_iqr_computation <- file[index_valid_weights & index_hh_head, weight_var, drop = TRUE]

    }else{

      log_var_for_iqr_computation <- log_var[index_valid_weights]
      weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE]

    }


    if(!is.all.na.or.zero(log_var_for_iqr_computation)){

      log_third_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation,
                                         weights = weights_for_iqr_computation,
                                         probs = 0.75)

      log_first_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation,
                                         weights = weights_for_iqr_computation,
                                         probs = 0.25)

      log_times_iqr <- (log_third_quartile - log_first_quartile) * times

      var_ <- dplyr::if_else(log(var_) > (log_third_quartile + log_times_iqr),
                             true = exp(log_third_quartile + log_times_iqr),
                             false = var_)

    }

    file[[variable]] <- var_

  }

  return(file)

}


# ** read files into R ----------------------------------------------------


# **** read h-level files -------------------------------------------------

# to read a single file:

de16h <- read.LIS("de16h")

class(de16h)

# to read multiple files at once:

names_files_h <- c("at16h", "ca17h", "de16h", "nl17h")

files_h <- purrr::map(.x = names_files_h,
                       ~read.LIS(.x, vars = c("hid", "dhi", "nhhmem", "hwgt")))

names(files_h) <- names_files_h

class(files_h)

names(files_h)

class(files_h[["at16h"]])

# **** read p-level files -------------------------------------------------

names_files_p <- c("at16p", "ca17p", "de16p", "nl17p")

files_p <- purrr::map(.x = names_files_p,
                       ~read.LIS(.x, vars = c("hid", "pid", "pi11", "age", "relation", "pwgt")))


# **** merge h and p-level files ------------------------------------------

merged_files <- purrr::map2(.x = files_p,
                            .y = files_h,
                            .f = ~left_join(.x, .y, by = "hid"))

names(merged_files) <- c("at16", "ca17", "de16", "nl17")


# Checking the data -------------------------------------------------------

# It is always important to always check descriptive statistics and basic
#   characteristics of the loaded datasets.

merged_files %>%
  bind_rows(.id = "file") %>%
  group_by(file) %>%
  summarize(min_dhi = min(dhi, na.rm = TRUE),
            max_dhi = max(dhi, na.rm = TRUE),
            nas_dhi = sum(is.na(dhi)),
            `0s_dhi` = sum(dhi == 0, na.rm = TRUE),
            min_pi11 = min(pi11, na.rm = TRUE),
            max_pi11 = max(pi11, na.rm = TRUE),
            nas_pi11 = sum(is.na(pi11)),
            `0s_pi11` = sum(pi11 == 0, na.rm = TRUE))


# Data preparation --------------------------------------------------------

# ** recode negative values to 0 ------------------------------------------
# for 'dhi'

merged_files %>%
  bind_rows(.id = "file") %>%
  group_by(file) %>%
  summarize(min_dhi = min(dhi, na.rm = TRUE),
            `0s_dhi` = sum(dhi == 0, na.rm = TRUE),
            nas_dhi = sum(is.na(dhi)))

merged_files %<>%
  purrr::map(~mutate(.x, dhi = if_else(dhi < 0 & !is.na(dhi) ,
                                       true = 0,
                                       false = as.numeric(dhi) )))


merged_files %>%
  bind_rows(.id = "file") %>%
  group_by(file) %>%
  summarize(min_dhi = min(dhi, na.rm = TRUE),
            `0s_dhi` = sum(dhi == 0, na.rm = TRUE),
            nas_dhi = sum(is.na(dhi)))


# ** recode negative values and 0s to NA ----------------------------------
# for 'pi11'

merged_files %>%
  bind_rows(.id = "file") %>%
  group_by(file) %>%
  summarize(min_pi11 = min(pi11, na.rm = TRUE),
            `0s_pi11` = sum(pi11 == 0, na.rm = TRUE),
            nas_pi11 = sum(is.na(pi11)))

merged_files %<>%
  purrr::map(~mutate(.x, pi11 = if_else( (pi11 > 0) & !is.na(pi11),
                                         true = pi11,
                                         false = NA_real_,
                                         missing = NA_real_)))


merged_files %>%
  bind_rows(.id = "file") %>%
  group_by(file) %>%
  summarize(min_pi11 = min(pi11, na.rm = TRUE),
            `0s_pi11` = sum(pi11 == 0, na.rm = TRUE),
            nas_pi11 = sum(is.na(pi11)))


# ** top and bottom code with IQR -----------------------------------------

# for 'dhi'
# on a single file it would look like this:
nl17 <- merged_files[["nl17"]]

nl17$dhi_original <- nl17$dhi

nl17 <- top_code_with_iqr_pfile(file = nl17,
                        file_name = "nl17",
                        variable = "dhi",
                        variable_level = "household",
                        times = 3,
                        weight = "hwgt")

nl17 %>%
  summarize(max_dhi_before = max(dhi_original, na.rm = TRUE),
            max_dhi_after = max(dhi, na.rm = TRUE))


# on multiple files:
# before top and bottom coding

merged_files %>%
  bind_rows(.id = "file") %>%
  group_by(file) %>%
  summarize(min_dhi = min(dhi, na.rm = TRUE),
            max_dhi =  max(dhi, na.rm = TRUE))

merged_files <- purrr::imap(merged_files,
                            ~top_code_with_iqr_pfile(file = .x,
                                                     file_name = .y,
                                                     variable = "dhi",
                                                     variable_level = "household",
                                                     times = 3,
                                                     weight = "hwgt"))

merged_files <- purrr::imap(merged_files,
                            ~bottom_code_with_iqr_pfile(file = .x,
                                                     file_name = .y,
                                                     variable = "dhi",
                                                     variable_level = "household",
                                                     times = 3,
                                                     weight = "hwgt"))

# after top and bottom coding
merged_files %>%
  bind_rows(.id = "file") %>%
  group_by(file) %>%
  summarize(min_dhi = min(dhi, na.rm = TRUE),
            max_dhi =  max(dhi, na.rm = TRUE))


# we can do the same for 'pi11'

merged_files <- purrr::imap(merged_files,
                            ~top_code_with_iqr_pfile(file = .x,
                                                     file_name = .y,
                                                     variable = "pi11",
                                                     variable_level = "person",
                                                     times = 3,
                                                     weight = "pwgt")) # <- p-level variables should be weighted using 'pwgt'

merged_files <- purrr::imap(merged_files,
                            ~bottom_code_with_iqr_pfile(file = .x,
                                                        file_name = .y,
                                                        variable = "pi11",
                                                        variable_level = "person",
                                                        times = 3,
                                                        weight = "pwgt"))  # <- p-level variables should be weighted using 'pwgt'


# # ** bind rows for all datasets -----------------------------------------

# from a list of data.frames to a single data.frame

merged_files <- bind_rows(merged_files, .id = "dataset")

# ** equivalise by hh size ------------------------------------------------

merged_files %<>%
  mutate(dhi_sqrt = dhi/(nhhmem^0.5),
         dhi_pcapita = dhi/(nhhmem))


# ** adjust by cpi and ppp ------------------------------------------------
# can be found here:
# https://www.lisdatacenter.org/resources/ppp-deflators/

# reminder for LWS datasets: the reference year for income variables
#   might not be the dataset year.

lisppp <- tibble::tribble(
  ~dataset, ~lisppp,
  #--|--|----
  "at16", 0.82040614,
  "ca17", 1.298456,
  "de16", 0.787293827,
  "nl17", 0.860976
  )

merged_files %<>%
  left_join(lisppp, by = "dataset")

merged_files %<>%
  mutate(dhi_ppp = dhi_sqrt/lisppp,
         pi11_ppp = pi11/lisppp)


# ** restrict age for pi11 ------------------------------------------------

merged_files %<>%
  mutate(pi11_ppp = if_else(between(age, 16, 64),
                            true = pi11_ppp,
                            false = NA_real_))


# Compute estimates -------------------------------------------------------
# more on this in the next session.

merged_files %>%
  group_by(dataset) %>%
  summarise(mean_dhi = Hmisc::wtd.mean(dhi_ppp, na.rm = TRUE),
            median_dhi = Hmisc::wtd.quantile(dhi_ppp, weights = hwgt, probs = 0.5, na.rm = TRUE),
            mean_pi11 = Hmisc::wtd.mean(pi11_ppp, na.rm = TRUE),
            median_pi11 = Hmisc::wtd.quantile(pi11_ppp, weights = hwgt, probs = 0.5, na.rm = TRUE))

	# Summer workshop - Data preparation in R --------


	# COPY TO LISSY: ----------------------------------------------------------

	library(tidyverse)
	library(magrittr)
	# install.packages("Hmisc")

	# ** my functions ---------------------------------------------------------

	is.all.na.or.zero <- function(x){

	return(length(x[!is.na(x) & x != 0]) == 0)

	}

	bottom_code_with_iqr_pfile <- function(file, file_name, variable, times = 3, variable_level = NULL, weight = NULL){

	assertthat::assert_that(variable %in% names(file),
	msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'."))

	var_ <- file[[variable]]

	if(!is.all.na.or.zero(var_)){ # need to define function

	if(is.null(weight) & variable_level %in% c("person", "p")){

	weight_var <- "pwgt"

	}else if(is.null(weight) & variable_level %in% c("household", "h")){

	weight_var <- "hwgt"

	}else{

	weight_var <- weight

	}

	assertthat::assert_that(all(var_ >= 0 \| is.na(var_)),
	msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values."))

	assertthat::assert_that(weight_var %in% names(file),
	msg = glue::glue("'{weight_var}' could not be found in '{file_name}'."))

	if(variable_level %in% c("household", "h")){

	assertthat::assert_that("relation" %in% names(file),
	msg = glue::glue("'relation' could not be found in '{file_name}'."))

	}

	log_var <- dplyr::if_else(var_ > 0,
	true = log(var_),
	false = 0,
	missing = NA_real_)

	index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files


	if(variable_level == "household"){

	index_hh_head <- !is.na(file[["relation"]]) & (file[["relation"]] == 1000 \| file[["relation"]] == "[1000]head")

	log_var_for_iqr_computation <- log_var[index_valid_weights & index_hh_head]
	weights_for_iqr_computation <- file[index_valid_weights & index_hh_head, weight_var, drop = TRUE]

	}else{

	log_var_for_iqr_computation <- log_var[index_valid_weights]
	weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE]

	}


	if(!is.all.na.or.zero(log_var_for_iqr_computation)){

	log_third_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation,
	weights = weights_for_iqr_computation,
	probs = 0.75)

	log_first_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation,
	weights = weights_for_iqr_computation,
	probs = 0.25)

	log_times_iqr <- (log_third_quartile - log_first_quartile) * times

	var_ <- dplyr::if_else(log(var_) < (log_first_quartile - log_times_iqr),
	true = exp(log_first_quartile - log_times_iqr),
	false = var_)

	}

	file[[variable]] <- var_

	}

	return(file)

	}

	top_code_with_iqr_pfile <- function(file, file_name, variable, times = 3, variable_level = NULL, weight = NULL){

	assertthat::assert_that(variable %in% names(file),
	msg = glue::glue("Variable '{variable}' could not be found in '{file_name}'."))

	var_ <- file[[variable]]

	if(!is.all.na.or.zero(var_)){

	if(is.null(variable_level)){

	variable_level <- check_variable_level(variable)

	}else{

	assertthat::assert_that(variable_level %in% c("person", "household", "p", "h"),
	msg = "Argument 'variable_level' can only take 'person', 'p', 'household' or 'h' as values.")
	}


	if(is.null(weight) & variable_level %in% c("person", "p")){

	weight_var <- "pwgt"

	}else if(is.null(weight) & variable_level %in% c("household", "h")){

	weight_var <- "hwgt"

	}else{

	weight_var <- weight

	}

	assertthat::assert_that(all(var_ >= 0 \| is.na(var_)),
	msg = glue::glue("Error in '{file_name}'. The variable where top coding with log IQR is applied can not have negative values."))

	assertthat::assert_that(weight_var %in% names(file),
	msg = glue::glue("'{weight_var}' could not be found in '{file_name}'."))

	if(variable_level %in% c("household", "h")){

	assertthat::assert_that("relation" %in% names(file),
	msg = glue::glue("'relation' could not be found in '{file_name}'."))

	}

	log_var <- dplyr::if_else(var_ > 0,
	true = log(var_),
	false = 0,
	missing = NA_real_)

	index_valid_weights <- !is.na(file[[weight_var]]) # this shouldn't be happening, but it happens for some LIS and LWS Japan files


	if(variable_level == "household"){

	index_hh_head <- !is.na(file[["relation"]]) & (file[["relation"]] == 1000 \| file[["relation"]] == "[1000]head")

	log_var_for_iqr_computation <- log_var[index_valid_weights & index_hh_head]
	weights_for_iqr_computation <- file[index_valid_weights & index_hh_head, weight_var, drop = TRUE]

	}else{

	log_var_for_iqr_computation <- log_var[index_valid_weights]
	weights_for_iqr_computation <- file[index_valid_weights, weight_var, drop = TRUE]

	}


	if(!is.all.na.or.zero(log_var_for_iqr_computation)){

	log_third_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation,
	weights = weights_for_iqr_computation,
	probs = 0.75)

	log_first_quartile <- Hmisc::wtd.quantile(log_var_for_iqr_computation,
	weights = weights_for_iqr_computation,
	probs = 0.25)

	log_times_iqr <- (log_third_quartile - log_first_quartile) * times

	var_ <- dplyr::if_else(log(var_) > (log_third_quartile + log_times_iqr),
	true = exp(log_third_quartile + log_times_iqr),
	false = var_)

	}

	file[[variable]] <- var_

	}

	return(file)

	}



	# ** read files into R ----------------------------------------------------


	# **** read h-level files -------------------------------------------------

	# to read a single file:

	de16h <- read.LIS("de16h")

	class(de16h)

	# to read multiple files at once:

	names_files_h <- c("at16h", "ca17h", "de16h", "nl17h")

	files_h <- purrr::map(.x = names_files_h,
	~read.LIS(.x, vars = c("hid", "dhi", "nhhmem", "hwgt")))

	names(files_h) <- names_files_h

	class(files_h)

	names(files_h)

	class(files_h[["at16h"]])

	# **** read p-level files -------------------------------------------------

	names_files_p <- c("at16p", "ca17p", "de16p", "nl17p")

	files_p <- purrr::map(.x = names_files_p,
	~read.LIS(.x, vars = c("hid", "pid", "pi11", "age", "relation", "pwgt")))


	# **** merge h and p-level files ------------------------------------------

	merged_files <- purrr::map2(.x = files_p,
	.y = files_h,
	.f = ~left_join(.x, .y, by = "hid"))

	names(merged_files) <- c("at16", "ca17", "de16", "nl17")



	# Checking the data -------------------------------------------------------

	# It is always important to always check descriptive statistics and basic
	# characteristics of the loaded datasets.

	merged_files %>%
	bind_rows(.id = "file") %>%
	group_by(file) %>%
	summarize(min_dhi = min(dhi, na.rm = TRUE),
	max_dhi = max(dhi, na.rm = TRUE),
	nas_dhi = sum(is.na(dhi)),
	`0s_dhi` = sum(dhi == 0, na.rm = TRUE),
	min_pi11 = min(pi11, na.rm = TRUE),
	max_pi11 = max(pi11, na.rm = TRUE),
	nas_pi11 = sum(is.na(pi11)),
	`0s_pi11` = sum(pi11 == 0, na.rm = TRUE))


	# Data preparation --------------------------------------------------------

	# ** recode negative values to 0 ------------------------------------------
	# for 'dhi'

	merged_files %>%
	bind_rows(.id = "file") %>%
	group_by(file) %>%
	summarize(min_dhi = min(dhi, na.rm = TRUE),
	`0s_dhi` = sum(dhi == 0, na.rm = TRUE),
	nas_dhi = sum(is.na(dhi)))

	merged_files %<>%
	purrr::map(~mutate(.x, dhi = if_else(dhi < 0 & !is.na(dhi) ,
	true = 0,
	false = as.numeric(dhi) )))


	merged_files %>%
	bind_rows(.id = "file") %>%
	group_by(file) %>%
	summarize(min_dhi = min(dhi, na.rm = TRUE),
	`0s_dhi` = sum(dhi == 0, na.rm = TRUE),
	nas_dhi = sum(is.na(dhi)))


	# ** recode negative values and 0s to NA ----------------------------------
	# for 'pi11'

	merged_files %>%
	bind_rows(.id = "file") %>%
	group_by(file) %>%
	summarize(min_pi11 = min(pi11, na.rm = TRUE),
	`0s_pi11` = sum(pi11 == 0, na.rm = TRUE),
	nas_pi11 = sum(is.na(pi11)))

	merged_files %<>%
	purrr::map(~mutate(.x, pi11 = if_else( (pi11 > 0) & !is.na(pi11),
	true = pi11,
	false = NA_real_,
	missing = NA_real_)))


	merged_files %>%
	bind_rows(.id = "file") %>%
	group_by(file) %>%
	summarize(min_pi11 = min(pi11, na.rm = TRUE),
	`0s_pi11` = sum(pi11 == 0, na.rm = TRUE),
	nas_pi11 = sum(is.na(pi11)))



	# ** top and bottom code with IQR -----------------------------------------

	# for 'dhi'
	# on a single file it would look like this:
	nl17 <- merged_files[["nl17"]]

	nl17$dhi_original <- nl17$dhi

	nl17 <- top_code_with_iqr_pfile(file = nl17,
	file_name = "nl17",
	variable = "dhi",
	variable_level = "household",
	times = 3,
	weight = "hwgt")

	nl17 %>%
	summarize(max_dhi_before = max(dhi_original, na.rm = TRUE),
	max_dhi_after = max(dhi, na.rm = TRUE))


	# on multiple files:
	# before top and bottom coding

	merged_files %>%
	bind_rows(.id = "file") %>%
	group_by(file) %>%
	summarize(min_dhi = min(dhi, na.rm = TRUE),
	max_dhi = max(dhi, na.rm = TRUE))

	merged_files <- purrr::imap(merged_files,
	~top_code_with_iqr_pfile(file = .x,
	file_name = .y,
	variable = "dhi",
	variable_level = "household",
	times = 3,
	weight = "hwgt"))

	merged_files <- purrr::imap(merged_files,
	~bottom_code_with_iqr_pfile(file = .x,
	file_name = .y,
	variable = "dhi",
	variable_level = "household",
	times = 3,
	weight = "hwgt"))

	# after top and bottom coding
	merged_files %>%
	bind_rows(.id = "file") %>%
	group_by(file) %>%
	summarize(min_dhi = min(dhi, na.rm = TRUE),
	max_dhi = max(dhi, na.rm = TRUE))


	# we can do the same for 'pi11'

	merged_files <- purrr::imap(merged_files,
	~top_code_with_iqr_pfile(file = .x,
	file_name = .y,
	variable = "pi11",
	variable_level = "person",
	times = 3,
	weight = "pwgt")) # <- p-level variables should be weighted using 'pwgt'

	merged_files <- purrr::imap(merged_files,
	~bottom_code_with_iqr_pfile(file = .x,
	file_name = .y,
	variable = "pi11",
	variable_level = "person",
	times = 3,
	weight = "pwgt")) # <- p-level variables should be weighted using 'pwgt'



	# # ** bind rows for all datasets -----------------------------------------

	# from a list of data.frames to a single data.frame

	merged_files <- bind_rows(merged_files, .id = "dataset")

	# ** equivalise by hh size ------------------------------------------------

	merged_files %<>%
	mutate(dhi_sqrt = dhi/(nhhmem^0.5),
	dhi_pcapita = dhi/(nhhmem))


	# ** adjust by cpi and ppp ------------------------------------------------
	# can be found here:
	# https://www.lisdatacenter.org/resources/ppp-deflators/

	# reminder for LWS datasets: the reference year for income variables
	# might not be the dataset year.

	lisppp <- tibble::tribble(
	~dataset, ~lisppp,
	#--\|--\|----
	"at16", 0.82040614,
	"ca17", 1.298456,
	"de16", 0.787293827,
	"nl17", 0.860976
	)

	merged_files %<>%
	left_join(lisppp, by = "dataset")

	merged_files %<>%
	mutate(dhi_ppp = dhi_sqrt/lisppp,
	pi11_ppp = pi11/lisppp)


	# ** restrict age for pi11 ------------------------------------------------

	merged_files %<>%
	mutate(pi11_ppp = if_else(between(age, 16, 64),
	true = pi11_ppp,
	false = NA_real_))


	# Compute estimates -------------------------------------------------------
	# more on this in the next session.

	merged_files %>%
	group_by(dataset) %>%
	summarise(mean_dhi = Hmisc::wtd.mean(dhi_ppp, na.rm = TRUE),
	median_dhi = Hmisc::wtd.quantile(dhi_ppp, weights = hwgt, probs = 0.5, na.rm = TRUE),
	mean_pi11 = Hmisc::wtd.mean(pi11_ppp, na.rm = TRUE),
	median_pi11 = Hmisc::wtd.quantile(pi11_ppp, weights = hwgt, probs = 0.5, na.rm = TRUE))