JosepER/example_full_na_ratio_computation.R

## example_full_na_ratio_computation.R
library(tidyverse)
library(haven)

#' Download the OECD detailed National Accounts
#'
#' @description
#' Uses the API to the OECD tables to download the National Accounts.
#'
#' Downloads all variables for all years for the selected countries.
#'
#' @details
#' The OECD API takes countries with ISO3. The function uses the METIS 'country'
#'   table to convert ISO2 to ISO3.
#'
#' @param countries A string with the list of countries for which the
#'   National Accounts should be downloaded. Country names should be in ISO2
#'   or ISO3. If NULL (default), the National Accounts for all countries in
#'   METIS datasets will be downloaded.
#' @param start_year An integer with the year from which the funciton should
#'   start reading the National Accounts. By default it starts at 1967.
#'
#' @return A tibble with 5 columns: country, variable, sector, year, value.
download_oecd_detailed <- function(countries = NULL, start_year = 1967){

  countries <- toupper(countries)

  assertthat::assert_that(all(stringr::str_length(countries) %in% c(2, 3)),
                          msg = "Values in 'countries' need to be country codes in ISO2 or ISO3.")

  if(any(stringr::str_length(countries) == 2)){
    countries <- convert_iso2_to_iso3(countries)
  }

  variables <- c("NFB2G_B3GR", "NFB2G_B3GP", "NFB2GP", "NFB2GR",
                 "NFB3GR", "NFB3GP", "NFB3GR",
                 "NFD11P", "NFD12R", "NFD11R", "NFD1P", "NFD11P", "NFD12P", "NFD1R", "NFD12R",
                 "NFD4R", "NFD4P",
                 "NFD41P", "NFD41R", "NFD42R", "NFD42P", "NFD43P", "NFD43R", "NFD44R",
                 "NFD44P", "NFD45P", "NFD45R", "NFD421P", "NFD421R", "NFD441R", "NFD441P",
                 "NFD442R", "NFD422R", "NFD442P", "NFD422P", "NFD443P", "NFD443R",
                 "NFD51P", "NFD51R", "NFD5P", "NFD5R", "NFD59P", "NFD59R",
                 "NFD61P", "NFD61R", "NFD611P", "NFD611R", "NFD6P", "NFD612R", "NFD612P",
                 "NFD62P", "NFD62R", "NFD63P", "NFD63R", "NFD6R", "NFD631P", "NFD631R",
                 "NFD6111P", "NFD6111R", "NFD6121P", "NFD6121R", "NFD612HP", "NFD612HR",
                 "NFD611HP", "NFD611HR", "NFD632P", "NFD632R", "NFD61SCP", "NFD61SCR",
                 "NFB5GR", "NFB5GP",
                 "NFD71P", "NFD71R", "NFD72R", "NFD72P", "NFD74P", "NFD74R", "NFD7P",
                 "NFD7R", "NFD75R", "NFD75P", "NFD76P", "NFD76R", "NFD751R", "NFD751P",
                 "NFB6GR", "NFB6GP",
                 "NFB7GP",
                 "NFP31P", "NFP3P", "NFP32P",
                 "NFD8P", "NFD8R",
                 "NFK1MP", "NFK1P", "NFK1R",
                 "NFTINTR" # instead of "NFD41GR"
  )

  downloaded_df <- OECD::get_dataset("SNA_TABLE14A",
                                     filter = list(countries,
                                                   variables,
                                                   c("S1", "S14", "S14_S15")),
                                     start_time = start_year)

  check_duplicates_downloaded_oecd_detailed(downloaded_df)

  # subset columns
  return(dplyr::select(downloaded_df, country = LOCATION, variable = TRANSACT,
                       sector = SECTOR, year = Time, value = ObsValue))

}

#' Check for duplicates in downloaded OECD National Accounts
#'
#' @description
#' Throws an error if there are duplicates in the downloaded OECD data.
#'
#' @param downloaded_df A data.frame type of object with the downloaded data.
#'
#' @return NULL if there are no duplicates. Throws an error otherwise.
check_duplicates_downloaded_oecd_detailed <- function(downloaded_df){

  nrow_df <-  nrow(dplyr::select(downloaded_df, LOCATION, TRANSACT, SECTOR, Time))

  nrow_unique_df <- nrow(unique(dplyr::select(downloaded_df, LOCATION, TRANSACT, SECTOR, Time)))

  assertthat::assert_that(nrow_df == nrow_unique_df,
                          msg = "The number of rows in the downloaded dataset is not the same as the number of unique combinations of 'LOCATION', 'TRANSACT', 'SECTOR' and 'Time'. There might be duplicated rows in the downloaded dataset.")

  return(invisible(NULL))
}


#' Compute Column from Formula
#'
#' @description
#' The compute_column_from_formula function is a lower-level function that computes a new column in a data.frame based on the given formula.
#'
#' @param data A data.frame containing the input data.
#' @param formula A formula for computing the desired new column.
#'
#' @return A data.frame with the new column added.
#'
#' @note This is a lower-level function and is typically not used directly. Instead, use the compute_columns_from_formulas function to compute multiple columns at once.
compute_column_from_formula <- function(data, formula) {
  # Check if the input is a data.frame
  if (!is.data.frame(data)) {
    stop("The first argument should be a data.frame")
  }

  # Check if the input is a formula
  if (!rlang::is_formula(formula)) {
    stop("The second argument should be a formula")
  }

  # Extract the left-hand and right-hand sides of the formula
  lhs <- as.character(formula[[2]])
  rhs <- formula[[3]]

  # Evaluate the right-hand side expression
  new_column_values <- eval(rhs, data)

  # Add the new column to the data.frame with the name from the left-hand side of the formula
  data[[lhs]] <- new_column_values

  # Return the updated data.frame
  return(data)
}


#' Compute Columns from Formulas
#'
#' @description
#' The higher-level function of compute_column_from_formula. The compute_columns_from_formulas function computes new columns in a data.frame based on the given list of formulas.
#'
#' @param data A data.frame containing the input data.
#' @param formulas A list of formulas for computing the desired new columns.
#'
#' @return A data.frame with the new columns added.
compute_columns_from_formulas <- function(data, formulas) {
  # Check if the input is a data.frame
  if (!is.data.frame(data)) {
    stop("The first argument should be a data.frame")
  }

  # Check if the input is a list of formulas
  if (!is.list(formulas) || !all(sapply(formulas, rlang::is_formula))) {
    stop("The second argument should be a list of formulas")
  }

  # Iterate through the list of formulas and add a new column for each formula
  for (formula in formulas) {
    data <- compute_column_from_formula(data, formula)
  }

  # Return the updated data.frame
  return(data)
}

#' Compute estimates from Microdata
#'
#' @description
#' The compute_estimates_from_microdata function computes estimates for specified variables from microdata using provided formulas and weights.
#'
#' @param data A data.frame containing the microdata.
#' @param formulas A list of formulas for computing the desired variables.
#' @param weights A character vector of variable names for weights to be used with each corresponding formula.
#'
#' @return A tibble containing the computed estimates.
compute_estimates_from_microdata <- function(data, formulas, weights){

  df <- compute_columns_from_formulas(data, formulas)

  # weighted sum of new variables
  new_vars <- purrr::map_chr(formulas, ~as.character(.x[[2]]))

  tibble::enframe(purrr::map2_dbl(set_names(new_vars), weights, .f = function(var, weights, data){
    sum(df[[var]] * df[[weights]])/1000000
  }, data),
  name = "indicator")

}

#' Compute National Account Ratios
#'
#' @description
#' The compute_national_account_ratios function computes the ratios between estimates from microdata and national accounts data.
#'
#' @param estimates_from_microdata_df A tibble containing the computed estimates from the estimates_from_microdata function.
#' @param national_accounts_table A data.frame containing national accounts data.
#'
#' @return A numeric vector with the computed ratios.
compute_national_account_ratios <- function(estimates_from_microdata_df, national_accounts_table){

  national_accounts_table$variable <- stringr::str_remove(national_accounts_table$variable, "^NF" )

  national_accounts_table_wide <- national_accounts_table %>%
    pivot_wider(id_cols = "country", names_from = "variable", values_from = "value")

  national_accounts_estimates <- purrr::map_dbl(estimates_from_microdata_df[["indicator"]],
                                                .f = function(formula_str, df){

                                                  formula_expr <- parse(text = formula_str)

                                                  # Evaluate the formula expression within the context of the data.frame
                                                  new_column <- eval(formula_expr, envir = df)

                                                  return(new_column)

                                                },
                                                national_accounts_table_wide)

  return(estimates_from_microdata_df[["value"]]/national_accounts_estimates)

}


# Read data ---------------------------------------------------------------

# You can download the files form here: https://www.lisdatacenter.org/resources/self-teaching/
it14ip <- haven::read_dta("it14ip.dta", col_select = c("hid", "ppopwgt", "relation"))
it14ih <- haven::read_dta("it14ih.dta", col_select = c("hid", "hi11", "hi13", "hi12", "hicapital"))


it14i <- left_join(it14ip, it14ih, by = "hid")

ita_na_df <- download_oecd_detailed(countries = "ita", start_year = 2014)


# Data management ---------------------------------------------------------

# keep only 2016 and S14 for this example
ita_na_df <- ita_na_df %>%
  filter(year == 2014, sector == "S14") %>%
  mutate(value = as.numeric(value))

it14i <- it14i %>%
  filter(relation == 1000)


# Compute the estimates form microdata ------------------------------------

it14i_estimates <- compute_estimates_from_microdata(data = it14i,
                                            formulas = list(D11P ~ hi11 + hi13,
                                                            `B3GR+D41R+D42R+D45R` ~ hi12 + hicapital),
                                            weights = c("ppopwgt", "ppopwgt"))


# Compute ratio with national accounts ------------------------------------

compute_national_account_ratios(it14i_estimates, ita_na_df)
	library(tidyverse)
	library(haven)

	#' Download the OECD detailed National Accounts
	#'
	#' @description
	#' Uses the API to the OECD tables to download the National Accounts.
	#'
	#' Downloads all variables for all years for the selected countries.
	#'
	#' @details
	#' The OECD API takes countries with ISO3. The function uses the METIS 'country'
	#' table to convert ISO2 to ISO3.
	#'
	#' @param countries A string with the list of countries for which the
	#' National Accounts should be downloaded. Country names should be in ISO2
	#' or ISO3. If NULL (default), the National Accounts for all countries in
	#' METIS datasets will be downloaded.
	#' @param start_year An integer with the year from which the funciton should
	#' start reading the National Accounts. By default it starts at 1967.
	#'
	#' @return A tibble with 5 columns: country, variable, sector, year, value.
	download_oecd_detailed <- function(countries = NULL, start_year = 1967){

	countries <- toupper(countries)

	assertthat::assert_that(all(stringr::str_length(countries) %in% c(2, 3)),
	msg = "Values in 'countries' need to be country codes in ISO2 or ISO3.")

	if(any(stringr::str_length(countries) == 2)){
	countries <- convert_iso2_to_iso3(countries)
	}

	variables <- c("NFB2G_B3GR", "NFB2G_B3GP", "NFB2GP", "NFB2GR",
	"NFB3GR", "NFB3GP", "NFB3GR",
	"NFD11P", "NFD12R", "NFD11R", "NFD1P", "NFD11P", "NFD12P", "NFD1R", "NFD12R",
	"NFD4R", "NFD4P",
	"NFD41P", "NFD41R", "NFD42R", "NFD42P", "NFD43P", "NFD43R", "NFD44R",
	"NFD44P", "NFD45P", "NFD45R", "NFD421P", "NFD421R", "NFD441R", "NFD441P",
	"NFD442R", "NFD422R", "NFD442P", "NFD422P", "NFD443P", "NFD443R",
	"NFD51P", "NFD51R", "NFD5P", "NFD5R", "NFD59P", "NFD59R",
	"NFD61P", "NFD61R", "NFD611P", "NFD611R", "NFD6P", "NFD612R", "NFD612P",
	"NFD62P", "NFD62R", "NFD63P", "NFD63R", "NFD6R", "NFD631P", "NFD631R",
	"NFD6111P", "NFD6111R", "NFD6121P", "NFD6121R", "NFD612HP", "NFD612HR",
	"NFD611HP", "NFD611HR", "NFD632P", "NFD632R", "NFD61SCP", "NFD61SCR",
	"NFB5GR", "NFB5GP",
	"NFD71P", "NFD71R", "NFD72R", "NFD72P", "NFD74P", "NFD74R", "NFD7P",
	"NFD7R", "NFD75R", "NFD75P", "NFD76P", "NFD76R", "NFD751R", "NFD751P",
	"NFB6GR", "NFB6GP",
	"NFB7GP",
	"NFP31P", "NFP3P", "NFP32P",
	"NFD8P", "NFD8R",
	"NFK1MP", "NFK1P", "NFK1R",
	"NFTINTR" # instead of "NFD41GR"
	)

	downloaded_df <- OECD::get_dataset("SNA_TABLE14A",
	filter = list(countries,
	variables,
	c("S1", "S14", "S14_S15")),
	start_time = start_year)

	check_duplicates_downloaded_oecd_detailed(downloaded_df)

	# subset columns
	return(dplyr::select(downloaded_df, country = LOCATION, variable = TRANSACT,
	sector = SECTOR, year = Time, value = ObsValue))

	}

	#' Check for duplicates in downloaded OECD National Accounts
	#'
	#' @description
	#' Throws an error if there are duplicates in the downloaded OECD data.
	#'
	#' @param downloaded_df A data.frame type of object with the downloaded data.
	#'
	#' @return NULL if there are no duplicates. Throws an error otherwise.
	check_duplicates_downloaded_oecd_detailed <- function(downloaded_df){

	nrow_df <- nrow(dplyr::select(downloaded_df, LOCATION, TRANSACT, SECTOR, Time))

	nrow_unique_df <- nrow(unique(dplyr::select(downloaded_df, LOCATION, TRANSACT, SECTOR, Time)))

	assertthat::assert_that(nrow_df == nrow_unique_df,
	msg = "The number of rows in the downloaded dataset is not the same as the number of unique combinations of 'LOCATION', 'TRANSACT', 'SECTOR' and 'Time'. There might be duplicated rows in the downloaded dataset.")

	return(invisible(NULL))
	}


	#' Compute Column from Formula
	#'
	#' @description
	#' The compute_column_from_formula function is a lower-level function that computes a new column in a data.frame based on the given formula.
	#'
	#' @param data A data.frame containing the input data.
	#' @param formula A formula for computing the desired new column.
	#'
	#' @return A data.frame with the new column added.
	#'
	#' @note This is a lower-level function and is typically not used directly. Instead, use the compute_columns_from_formulas function to compute multiple columns at once.
	compute_column_from_formula <- function(data, formula) {
	# Check if the input is a data.frame
	if (!is.data.frame(data)) {
	stop("The first argument should be a data.frame")
	}

	# Check if the input is a formula
	if (!rlang::is_formula(formula)) {
	stop("The second argument should be a formula")
	}

	# Extract the left-hand and right-hand sides of the formula
	lhs <- as.character(formula[[2]])
	rhs <- formula[[3]]

	# Evaluate the right-hand side expression
	new_column_values <- eval(rhs, data)

	# Add the new column to the data.frame with the name from the left-hand side of the formula
	data[[lhs]] <- new_column_values

	# Return the updated data.frame
	return(data)
	}


	#' Compute Columns from Formulas
	#'
	#' @description
	#' The higher-level function of compute_column_from_formula. The compute_columns_from_formulas function computes new columns in a data.frame based on the given list of formulas.
	#'
	#' @param data A data.frame containing the input data.
	#' @param formulas A list of formulas for computing the desired new columns.
	#'
	#' @return A data.frame with the new columns added.
	compute_columns_from_formulas <- function(data, formulas) {
	# Check if the input is a data.frame
	if (!is.data.frame(data)) {
	stop("The first argument should be a data.frame")
	}

	# Check if the input is a list of formulas
	if (!is.list(formulas) \|\| !all(sapply(formulas, rlang::is_formula))) {
	stop("The second argument should be a list of formulas")
	}

	# Iterate through the list of formulas and add a new column for each formula
	for (formula in formulas) {
	data <- compute_column_from_formula(data, formula)
	}

	# Return the updated data.frame
	return(data)
	}

	#' Compute estimates from Microdata
	#'
	#' @description
	#' The compute_estimates_from_microdata function computes estimates for specified variables from microdata using provided formulas and weights.
	#'
	#' @param data A data.frame containing the microdata.
	#' @param formulas A list of formulas for computing the desired variables.
	#' @param weights A character vector of variable names for weights to be used with each corresponding formula.
	#'
	#' @return A tibble containing the computed estimates.
	compute_estimates_from_microdata <- function(data, formulas, weights){

	df <- compute_columns_from_formulas(data, formulas)

	# weighted sum of new variables
	new_vars <- purrr::map_chr(formulas, ~as.character(.x[[2]]))

	tibble::enframe(purrr::map2_dbl(set_names(new_vars), weights, .f = function(var, weights, data){
	sum(df[[var]] * df[[weights]])/1000000
	}, data),
	name = "indicator")

	}

	#' Compute National Account Ratios
	#'
	#' @description
	#' The compute_national_account_ratios function computes the ratios between estimates from microdata and national accounts data.
	#'
	#' @param estimates_from_microdata_df A tibble containing the computed estimates from the estimates_from_microdata function.
	#' @param national_accounts_table A data.frame containing national accounts data.
	#'
	#' @return A numeric vector with the computed ratios.
	compute_national_account_ratios <- function(estimates_from_microdata_df, national_accounts_table){

	national_accounts_table$variable <- stringr::str_remove(national_accounts_table$variable, "^NF" )

	national_accounts_table_wide <- national_accounts_table %>%
	pivot_wider(id_cols = "country", names_from = "variable", values_from = "value")

	national_accounts_estimates <- purrr::map_dbl(estimates_from_microdata_df[["indicator"]],
	.f = function(formula_str, df){

	formula_expr <- parse(text = formula_str)

	# Evaluate the formula expression within the context of the data.frame
	new_column <- eval(formula_expr, envir = df)

	return(new_column)

	},
	national_accounts_table_wide)

	return(estimates_from_microdata_df[["value"]]/national_accounts_estimates)

	}



	# Read data ---------------------------------------------------------------

	# You can download the files form here: https://www.lisdatacenter.org/resources/self-teaching/
	it14ip <- haven::read_dta("it14ip.dta", col_select = c("hid", "ppopwgt", "relation"))
	it14ih <- haven::read_dta("it14ih.dta", col_select = c("hid", "hi11", "hi13", "hi12", "hicapital"))


	it14i <- left_join(it14ip, it14ih, by = "hid")

	ita_na_df <- download_oecd_detailed(countries = "ita", start_year = 2014)


	# Data management ---------------------------------------------------------

	# keep only 2016 and S14 for this example
	ita_na_df <- ita_na_df %>%
	filter(year == 2014, sector == "S14") %>%
	mutate(value = as.numeric(value))

	it14i <- it14i %>%
	filter(relation == 1000)


	# Compute the estimates form microdata ------------------------------------

	it14i_estimates <- compute_estimates_from_microdata(data = it14i,
	formulas = list(D11P ~ hi11 + hi13,
	`B3GR+D41R+D42R+D45R` ~ hi12 + hicapital),
	weights = c("ppopwgt", "ppopwgt"))


	# Compute ratio with national accounts ------------------------------------

	compute_national_account_ratios(it14i_estimates, ita_na_df)