graebnerc/#S10 - Solutions

## #S10 - Solutions
Example solutions for the recap session on exploratory data analysis (spring semester 2023).

## further_tasks.R
here::i_am("R/further_tasks.R")
library(here)
library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(data.table)

# Read in data---------------
base_data <- fread(here("data/tidy/full_data_prepared.csv")) %>%
  as_tibble() %>%
  select(country, year, gdp_pc)

# Compute country groups---------------
incomegroups <- base_data %>%
  filter(year==1990, !is.na(gdp_pc)) %>%
  mutate(
    threshold=quantile(gdp_pc, 0.8),
    country_class = ifelse(
      test = gdp_pc>threshold,
      yes = "Rich countries",
      no = "Poor countries")
    ) %>%
  select(country, country_class)

base_data_incomegroups <- left_join(
  base_data, incomegroups, by=c("country")) %>%
  filter(!is.na(country_class), year>=1990, year<=2019)

# Add data on population and CO2 emissions------------
download_data <- FALSE # Set to true to download data
if (download_data){
  further_wb_data <- WDI::WDI(
    indicator = c(
      "co2"="EN.ATM.CO2E.KT",
      "population"="SP.POP.TOTL"),
    start = 1990, end = 2019) %>%
    select(-c("iso2c", "iso3c"))

  fwrite(further_wb_data, file = here("data/tidy/wb_co2_pop.csv"))
} else{
  further_wb_data <- fread(here("data/tidy/wb_co2_pop.csv"))
}

# Merge data-----------------
data_final <- left_join(
  x = base_data_incomegroups,
  y = further_wb_data,
  by=c("country", "year"))

# Plot on CO2 emissions and GDP per capita------------
gdp_có2 <- data_final %>%
  mutate(co2_pc = co2/population) %>%
  summarise(
    gdp_pc = mean(gdp_pc, na.rm = TRUE),
    co2_pc = mean(co2_pc, na.rm = TRUE),
    .by = c("country", "country_class")) %>%
  ggplot(
    data = .,
    mapping = aes(x=gdp_pc, y=co2_pc, color=country_class)
    ) +
  geom_point() +
  scale_x_continuous(
    labels = scales::number_format(scale = 0.001, suffix = "k")
    ) +
  scale_y_continuous(
    labels = scales::number_format(scale = 1000)
    ) +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "GDP and CO2 emissions",
    caption = "Data: World Bank. Plot shows averages over 1990-2019.",
    x = "GDP per capita (PPP, 2017$)",
    y = "CO2 (TNT equivalents)") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )
gdp_có2

# Shares in emissions and population-------------

share_data <- data_final %>%
  dplyr::summarize(
    co2_group = sum(co2, na.rm=TRUE),
    population_group = sum(population, na.rm=TRUE),
    .by = c("country_class", "year")
  ) %>%
  dplyr::mutate(
    co2_world = sum(co2_group),
    population_world = sum(population_group),
    .by = "year"
  ) %>%
  dplyr::mutate(
    share_co2 = co2_group/co2_world,
    share_pop = population_group/population_world
  )

share_plot_co2 <- ggplot(
  data = share_data,
  mapping = aes(x=year, y=share_co2, color=country_class)) +
  geom_point() + geom_line() +
  scale_y_continuous(
    labels = scales::percent_format(scale = 100),
    limits = c(0, 1), expand = expansion()
    ) +
  scale_x_continuous(
    breaks = c(seq(1990, 2015, 5), 2019)
    ) +
  labs(
    title = "Shares of CO2 emissions",
    caption = "Data: World Bank.",
    y = "Share of world aggregate") +
  scale_color_brewer(palette = "Set1") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )
share_plot_co2

share_plot_pop <- ggplot(
  data = share_data,
  mapping = aes(x=year, y=share_pop, color=country_class)) +
  geom_point() + geom_line() +
  scale_x_continuous(
    breaks = c(seq(1990, 2015, 5), 2019)
    ) +
  scale_y_continuous(
    labels = scales::percent_format(scale = 100)
    ) +
  labs(
    title = "Shares of population",
    caption = "Data: World Bank.",
    y = "Share of world aggregate") +
  scale_color_brewer(palette = "Set1") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    legend.title = element_blank(),
    axis.title.x = element_blank(),
    plot.title = element_text(hjust = 0.5)
    )
share_plot_pop

# Final plot-----------------
full_plot <- ggarrange(
  gdp_có2, share_plot_co2, share_plot_pop, ncol = 3,
  common.legend = TRUE, legend = "bottom")

ggsave(
  plot = full_plot,
  filename = here("output/further_task.pdf"),
  width = 10, height = 3.5)

## inclass_exercise.R
here::i_am("R/inclass_exercise.R.R")
library(here)
library(data.table)
library(dplyr)
library(ggplot2)
library(data.table)

# Import data----------------
wb_gdp_pc_raw <- fread(
  file = here("data/raw/API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_5359165.csv"),
  header = TRUE) %>%
  as_tibble()

wb_child_mort_raw <- fread(
  file = here("data/raw/API_SH.DYN.MORT_DS2_en_csv_v2_5358988.csv"),
  header = TRUE) %>%
  as_tibble()

# SWIID: https://doi.org/10.7910/DVN/LM4OWF
gini_data <- fread(here("data/raw/swiid9_4_summary.csv")) %>%
  as_tibble() %>%
  select(c("country", "year", "gini_disp"))

# Alternative: download World Bank Data using the WDI package:

# wb_data <- WDI::WDI(
#   indicator = c(
#     "gdp_pc"="NY.GDP.PCAP.PP.KD",
#     "child_mort"="SH.DYN.MORT")
# )
# fwrite(wb_data, here("data/raw/wb_data.csv"))

# Prepare world bank data:--------------
# The wrangling steps for the World Bank data are not necessary if you
#  download the data using the WDI package

wb_gdp_pc <- wb_gdp_pc_raw %>%
  select(-c("Country Code", "Indicator Name", "Indicator Code", "V67")) %>%
  tidyr::pivot_longer(
    cols = -`Country Name`,
    names_to = "year",
    values_to = "gdp_pc")

wb_child_mort <- wb_child_mort_raw %>%
  select(-c("Country Code", "Indicator Name", "Indicator Code", "V67")) %>%
  tidyr::pivot_longer(
    cols = -`Country Name`,
    names_to = "year",
    values_to = "child_mort")

wb_data <- inner_join(
  x = wb_gdp_pc,
  y = wb_child_mort,
  by = c("Country Name", "year")) %>%
  dplyr::mutate(
    year = as.double(year)
    )

# Merge and save data sets-----
full_data <- inner_join(
  x = gini_data, y = wb_data,
  by=c("country" = "Country Name", "year"))

fwrite(full_data, here("data/tidy/full_data_prepared.csv"))

# Summarize data-------------
full_data_prepared <- full_data %>%
  dplyr::filter(year>=2010, year<=2020) %>%
  dplyr::summarise(
    gdp_pc = mean(gdp_pc, na.rm = TRUE),
    child_mort = mean(child_mort, na.rm = TRUE),
    gini_disp = mean(gini_disp, na.rm = TRUE),
    .by = "country"
  )

# Alternative using across:
full_data_prepared <- full_data %>%
  dplyr::filter(year>=2010, year<=2020) %>%
  dplyr::summarise(across(
    .cols = where(is.numeric),
    .fns = ~ mean(.x, na.rm = TRUE)),
    .by = "country"
    )

# Make the scatter plots----------
# Note: the color mapping could also be specified in ggplot(), but this would
#  make the creation of the bonus plots below more difficult

gini_mortality <- ggplot(
  data = full_data_prepared,
  mapping = aes(
    y = log(gini_disp),
    x = log(child_mort))) +
  geom_point(mapping = aes(color=country)) +
  scale_color_viridis_d() +
  labs(
    title = "Inequality and child mortality",
    x = "Child mortality (log)",
    y = "Gini (disposable income, log)",
    caption = "Data: SWIID and World Bank.") +
  theme_bw() +
  theme(legend.position = "none")
gini_mortality

gini_income <- ggplot(
  data = full_data_prepared,
  mapping = aes(
    y = log(gini_disp),
    x = log(gdp_pc))) +
  geom_point(mapping = aes(color=country)) +
  scale_color_viridis_d() +
  labs(
    title = "Inequality and income",
    x = "GDP per capita (PPP, 2017$)",
    y = "Gini (disposable income)",
    caption = "Data: SWIID and World Bank.") +
  theme_bw() +
  theme(legend.position = "none")
gini_income

mortality_income <- ggplot(
  data = full_data_prepared,
  mapping = aes(
    y = log(child_mort),
    x = log(gdp_pc))) +
  geom_point(mapping = aes(color=country)) +
  scale_color_viridis_d() +
  labs(
    title = "Income and child mortality",
    x = "GDP per capita (PPP, 2017$)",
    y = "Child mortality (log)",
    caption = "Data: World Bank.") +
  theme_bw() +
  theme(legend.position = "none")
mortality_income

# Bonus: joint plot using ggarrange():-----------
library(ggpubr)
plot_list <- ggarrange(
  mortality_income, gini_mortality, gini_income, ncol = 3)

ggsave(
  plot = plot_list,
  filename = here("output/scatter_plots.pdf"),
  width = 9, height = 3)

# Bonus: add a regression line-------------------

gini_mortality_line <- gini_mortality +
  geom_smooth(
    data = full_data_prepared,
    mapping = aes(
      y = log(gini_disp),
      x = log(child_mort))
)

gini_income_line <- gini_income +
  geom_smooth(
    data = full_data_prepared,
    mapping = aes(
      y = log(gini_disp),
      x = log(gdp_pc))
)

mortality_income_line <- mortality_income +
  geom_smooth(
    data = full_data_prepared,
    mapping = aes(
      y = log(child_mort),
      x = log(gdp_pc))
  )

line_plot_list <- ggarrange(
  mortality_income_line, gini_mortality_line, gini_income_line, ncol = 3)

ggsave(
  plot = line_plot_list,
  filename = here("output/scatter_plots_regs.pdf"),
  width = 9, height = 3)
	here::i_am("R/further_tasks.R")
	library(here)
	library(data.table)
	library(dplyr)
	library(tidyr)
	library(ggplot2)
	library(scales)
	library(data.table)

	# Read in data---------------
	base_data <- fread(here("data/tidy/full_data_prepared.csv")) %>%
	as_tibble() %>%
	select(country, year, gdp_pc)

	# Compute country groups---------------
	incomegroups <- base_data %>%
	filter(year==1990, !is.na(gdp_pc)) %>%
	mutate(
	threshold=quantile(gdp_pc, 0.8),
	country_class = ifelse(
	test = gdp_pc>threshold,
	yes = "Rich countries",
	no = "Poor countries")
	) %>%
	select(country, country_class)

	base_data_incomegroups <- left_join(
	base_data, incomegroups, by=c("country")) %>%
	filter(!is.na(country_class), year>=1990, year<=2019)

	# Add data on population and CO2 emissions------------
	download_data <- FALSE # Set to true to download data
	if (download_data){
	further_wb_data <- WDI::WDI(
	indicator = c(
	"co2"="EN.ATM.CO2E.KT",
	"population"="SP.POP.TOTL"),
	start = 1990, end = 2019) %>%
	select(-c("iso2c", "iso3c"))

	fwrite(further_wb_data, file = here("data/tidy/wb_co2_pop.csv"))
	} else{
	further_wb_data <- fread(here("data/tidy/wb_co2_pop.csv"))
	}

	# Merge data-----------------
	data_final <- left_join(
	x = base_data_incomegroups,
	y = further_wb_data,
	by=c("country", "year"))

	# Plot on CO2 emissions and GDP per capita------------
	gdp_có2 <- data_final %>%
	mutate(co2_pc = co2/population) %>%
	summarise(
	gdp_pc = mean(gdp_pc, na.rm = TRUE),
	co2_pc = mean(co2_pc, na.rm = TRUE),
	.by = c("country", "country_class")) %>%
	ggplot(
	data = .,
	mapping = aes(x=gdp_pc, y=co2_pc, color=country_class)
	) +
	geom_point() +
	scale_x_continuous(
	labels = scales::number_format(scale = 0.001, suffix = "k")
	) +
	scale_y_continuous(
	labels = scales::number_format(scale = 1000)
	) +
	scale_color_brewer(palette = "Set1") +
	labs(
	title = "GDP and CO2 emissions",
	caption = "Data: World Bank. Plot shows averages over 1990-2019.",
	x = "GDP per capita (PPP, 2017$)",
	y = "CO2 (TNT equivalents)") +
	theme_bw() +
	theme(
	legend.position = "bottom",
	legend.title = element_blank(),
	axis.title.x = element_blank(),
	plot.title = element_text(hjust = 0.5)
	)
	gdp_có2

	# Shares in emissions and population-------------

	share_data <- data_final %>%
	dplyr::summarize(
	co2_group = sum(co2, na.rm=TRUE),
	population_group = sum(population, na.rm=TRUE),
	.by = c("country_class", "year")
	) %>%
	dplyr::mutate(
	co2_world = sum(co2_group),
	population_world = sum(population_group),
	.by = "year"
	) %>%
	dplyr::mutate(
	share_co2 = co2_group/co2_world,
	share_pop = population_group/population_world
	)

	share_plot_co2 <- ggplot(
	data = share_data,
	mapping = aes(x=year, y=share_co2, color=country_class)) +
	geom_point() + geom_line() +
	scale_y_continuous(
	labels = scales::percent_format(scale = 100),
	limits = c(0, 1), expand = expansion()
	) +
	scale_x_continuous(
	breaks = c(seq(1990, 2015, 5), 2019)
	) +
	labs(
	title = "Shares of CO2 emissions",
	caption = "Data: World Bank.",
	y = "Share of world aggregate") +
	scale_color_brewer(palette = "Set1") +
	theme_bw() +
	theme(
	legend.position = "bottom",
	legend.title = element_blank(),
	axis.title.x = element_blank(),
	plot.title = element_text(hjust = 0.5)
	)
	share_plot_co2

	share_plot_pop <- ggplot(
	data = share_data,
	mapping = aes(x=year, y=share_pop, color=country_class)) +
	geom_point() + geom_line() +
	scale_x_continuous(
	breaks = c(seq(1990, 2015, 5), 2019)
	) +
	scale_y_continuous(
	labels = scales::percent_format(scale = 100)
	) +
	labs(
	title = "Shares of population",
	caption = "Data: World Bank.",
	y = "Share of world aggregate") +
	scale_color_brewer(palette = "Set1") +
	theme_bw() +
	theme(
	legend.position = "bottom",
	legend.title = element_blank(),
	axis.title.x = element_blank(),
	plot.title = element_text(hjust = 0.5)
	)
	share_plot_pop

	# Final plot-----------------
	full_plot <- ggarrange(
	gdp_có2, share_plot_co2, share_plot_pop, ncol = 3,
	common.legend = TRUE, legend = "bottom")

	ggsave(
	plot = full_plot,
	filename = here("output/further_task.pdf"),
	width = 10, height = 3.5)
	here::i_am("R/inclass_exercise.R.R")
	library(here)
	library(data.table)
	library(dplyr)
	library(ggplot2)
	library(data.table)

	# Import data----------------
	wb_gdp_pc_raw <- fread(
	file = here("data/raw/API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_5359165.csv"),
	header = TRUE) %>%
	as_tibble()

	wb_child_mort_raw <- fread(
	file = here("data/raw/API_SH.DYN.MORT_DS2_en_csv_v2_5358988.csv"),
	header = TRUE) %>%
	as_tibble()

	# SWIID: https://doi.org/10.7910/DVN/LM4OWF
	gini_data <- fread(here("data/raw/swiid9_4_summary.csv")) %>%
	as_tibble() %>%
	select(c("country", "year", "gini_disp"))

	# Alternative: download World Bank Data using the WDI package:

	# wb_data <- WDI::WDI(
	# indicator = c(
	# "gdp_pc"="NY.GDP.PCAP.PP.KD",
	# "child_mort"="SH.DYN.MORT")
	# )
	# fwrite(wb_data, here("data/raw/wb_data.csv"))

	# Prepare world bank data:--------------
	# The wrangling steps for the World Bank data are not necessary if you
	# download the data using the WDI package

	wb_gdp_pc <- wb_gdp_pc_raw %>%
	select(-c("Country Code", "Indicator Name", "Indicator Code", "V67")) %>%
	tidyr::pivot_longer(
	cols = -`Country Name`,
	names_to = "year",
	values_to = "gdp_pc")

	wb_child_mort <- wb_child_mort_raw %>%
	select(-c("Country Code", "Indicator Name", "Indicator Code", "V67")) %>%
	tidyr::pivot_longer(
	cols = -`Country Name`,
	names_to = "year",
	values_to = "child_mort")

	wb_data <- inner_join(
	x = wb_gdp_pc,
	y = wb_child_mort,
	by = c("Country Name", "year")) %>%
	dplyr::mutate(
	year = as.double(year)
	)

	# Merge and save data sets-----
	full_data <- inner_join(
	x = gini_data, y = wb_data,
	by=c("country" = "Country Name", "year"))

	fwrite(full_data, here("data/tidy/full_data_prepared.csv"))

	# Summarize data-------------
	full_data_prepared <- full_data %>%
	dplyr::filter(year>=2010, year<=2020) %>%
	dplyr::summarise(
	gdp_pc = mean(gdp_pc, na.rm = TRUE),
	child_mort = mean(child_mort, na.rm = TRUE),
	gini_disp = mean(gini_disp, na.rm = TRUE),
	.by = "country"
	)

	# Alternative using across:
	full_data_prepared <- full_data %>%
	dplyr::filter(year>=2010, year<=2020) %>%
	dplyr::summarise(across(
	.cols = where(is.numeric),
	.fns = ~ mean(.x, na.rm = TRUE)),
	.by = "country"
	)

	# Make the scatter plots----------
	# Note: the color mapping could also be specified in ggplot(), but this would
	# make the creation of the bonus plots below more difficult

	gini_mortality <- ggplot(
	data = full_data_prepared,
	mapping = aes(
	y = log(gini_disp),
	x = log(child_mort))) +
	geom_point(mapping = aes(color=country)) +
	scale_color_viridis_d() +
	labs(
	title = "Inequality and child mortality",
	x = "Child mortality (log)",
	y = "Gini (disposable income, log)",
	caption = "Data: SWIID and World Bank.") +
	theme_bw() +
	theme(legend.position = "none")
	gini_mortality

	gini_income <- ggplot(
	data = full_data_prepared,
	mapping = aes(
	y = log(gini_disp),
	x = log(gdp_pc))) +
	geom_point(mapping = aes(color=country)) +
	scale_color_viridis_d() +
	labs(
	title = "Inequality and income",
	x = "GDP per capita (PPP, 2017$)",
	y = "Gini (disposable income)",
	caption = "Data: SWIID and World Bank.") +
	theme_bw() +
	theme(legend.position = "none")
	gini_income

	mortality_income <- ggplot(
	data = full_data_prepared,
	mapping = aes(
	y = log(child_mort),
	x = log(gdp_pc))) +
	geom_point(mapping = aes(color=country)) +
	scale_color_viridis_d() +
	labs(
	title = "Income and child mortality",
	x = "GDP per capita (PPP, 2017$)",
	y = "Child mortality (log)",
	caption = "Data: World Bank.") +
	theme_bw() +
	theme(legend.position = "none")
	mortality_income

	# Bonus: joint plot using ggarrange():-----------
	library(ggpubr)
	plot_list <- ggarrange(
	mortality_income, gini_mortality, gini_income, ncol = 3)

	ggsave(
	plot = plot_list,
	filename = here("output/scatter_plots.pdf"),
	width = 9, height = 3)

	# Bonus: add a regression line-------------------

	gini_mortality_line <- gini_mortality +
	geom_smooth(
	data = full_data_prepared,
	mapping = aes(
	y = log(gini_disp),
	x = log(child_mort))
	)

	gini_income_line <- gini_income +
	geom_smooth(
	data = full_data_prepared,
	mapping = aes(
	y = log(gini_disp),
	x = log(gdp_pc))
	)

	mortality_income_line <- mortality_income +
	geom_smooth(
	data = full_data_prepared,
	mapping = aes(
	y = log(child_mort),
	x = log(gdp_pc))
	)

	line_plot_list <- ggarrange(
	mortality_income_line, gini_mortality_line, gini_income_line, ncol = 3)

	ggsave(
	plot = line_plot_list,
	filename = here("output/scatter_plots_regs.pdf"),
	width = 9, height = 3)