Skip to content

Instantly share code, notes, and snippets.

@graebnerc
Created April 28, 2023 07:35
Show Gist options
  • Save graebnerc/5d5ec7591a45d6cbad3a58ddf06fff6b to your computer and use it in GitHub Desktop.
Save graebnerc/5d5ec7591a45d6cbad3a58ddf06fff6b to your computer and use it in GitHub Desktop.
Example solutions for the recap session on exploratory data analysis (spring semester 2023).
Example solutions for the recap session on exploratory data analysis (spring semester 2023).
here::i_am("R/further_tasks.R")
library(here)
library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(data.table)
# Read in data---------------
base_data <- fread(here("data/tidy/full_data_prepared.csv")) %>%
as_tibble() %>%
select(country, year, gdp_pc)
# Compute country groups---------------
incomegroups <- base_data %>%
filter(year==1990, !is.na(gdp_pc)) %>%
mutate(
threshold=quantile(gdp_pc, 0.8),
country_class = ifelse(
test = gdp_pc>threshold,
yes = "Rich countries",
no = "Poor countries")
) %>%
select(country, country_class)
base_data_incomegroups <- left_join(
base_data, incomegroups, by=c("country")) %>%
filter(!is.na(country_class), year>=1990, year<=2019)
# Add data on population and CO2 emissions------------
download_data <- FALSE # Set to true to download data
if (download_data){
further_wb_data <- WDI::WDI(
indicator = c(
"co2"="EN.ATM.CO2E.KT",
"population"="SP.POP.TOTL"),
start = 1990, end = 2019) %>%
select(-c("iso2c", "iso3c"))
fwrite(further_wb_data, file = here("data/tidy/wb_co2_pop.csv"))
} else{
further_wb_data <- fread(here("data/tidy/wb_co2_pop.csv"))
}
# Merge data-----------------
data_final <- left_join(
x = base_data_incomegroups,
y = further_wb_data,
by=c("country", "year"))
# Plot on CO2 emissions and GDP per capita------------
gdp_có2 <- data_final %>%
mutate(co2_pc = co2/population) %>%
summarise(
gdp_pc = mean(gdp_pc, na.rm = TRUE),
co2_pc = mean(co2_pc, na.rm = TRUE),
.by = c("country", "country_class")) %>%
ggplot(
data = .,
mapping = aes(x=gdp_pc, y=co2_pc, color=country_class)
) +
geom_point() +
scale_x_continuous(
labels = scales::number_format(scale = 0.001, suffix = "k")
) +
scale_y_continuous(
labels = scales::number_format(scale = 1000)
) +
scale_color_brewer(palette = "Set1") +
labs(
title = "GDP and CO2 emissions",
caption = "Data: World Bank. Plot shows averages over 1990-2019.",
x = "GDP per capita (PPP, 2017$)",
y = "CO2 (TNT equivalents)") +
theme_bw() +
theme(
legend.position = "bottom",
legend.title = element_blank(),
axis.title.x = element_blank(),
plot.title = element_text(hjust = 0.5)
)
gdp_có2
# Shares in emissions and population-------------
share_data <- data_final %>%
dplyr::summarize(
co2_group = sum(co2, na.rm=TRUE),
population_group = sum(population, na.rm=TRUE),
.by = c("country_class", "year")
) %>%
dplyr::mutate(
co2_world = sum(co2_group),
population_world = sum(population_group),
.by = "year"
) %>%
dplyr::mutate(
share_co2 = co2_group/co2_world,
share_pop = population_group/population_world
)
share_plot_co2 <- ggplot(
data = share_data,
mapping = aes(x=year, y=share_co2, color=country_class)) +
geom_point() + geom_line() +
scale_y_continuous(
labels = scales::percent_format(scale = 100),
limits = c(0, 1), expand = expansion()
) +
scale_x_continuous(
breaks = c(seq(1990, 2015, 5), 2019)
) +
labs(
title = "Shares of CO2 emissions",
caption = "Data: World Bank.",
y = "Share of world aggregate") +
scale_color_brewer(palette = "Set1") +
theme_bw() +
theme(
legend.position = "bottom",
legend.title = element_blank(),
axis.title.x = element_blank(),
plot.title = element_text(hjust = 0.5)
)
share_plot_co2
share_plot_pop <- ggplot(
data = share_data,
mapping = aes(x=year, y=share_pop, color=country_class)) +
geom_point() + geom_line() +
scale_x_continuous(
breaks = c(seq(1990, 2015, 5), 2019)
) +
scale_y_continuous(
labels = scales::percent_format(scale = 100)
) +
labs(
title = "Shares of population",
caption = "Data: World Bank.",
y = "Share of world aggregate") +
scale_color_brewer(palette = "Set1") +
theme_bw() +
theme(
legend.position = "bottom",
legend.title = element_blank(),
axis.title.x = element_blank(),
plot.title = element_text(hjust = 0.5)
)
share_plot_pop
# Final plot-----------------
full_plot <- ggarrange(
gdp_có2, share_plot_co2, share_plot_pop, ncol = 3,
common.legend = TRUE, legend = "bottom")
ggsave(
plot = full_plot,
filename = here("output/further_task.pdf"),
width = 10, height = 3.5)
here::i_am("R/inclass_exercise.R.R")
library(here)
library(data.table)
library(dplyr)
library(ggplot2)
library(data.table)
# Import data----------------
wb_gdp_pc_raw <- fread(
file = here("data/raw/API_NY.GDP.PCAP.PP.KD_DS2_en_csv_v2_5359165.csv"),
header = TRUE) %>%
as_tibble()
wb_child_mort_raw <- fread(
file = here("data/raw/API_SH.DYN.MORT_DS2_en_csv_v2_5358988.csv"),
header = TRUE) %>%
as_tibble()
# SWIID: https://doi.org/10.7910/DVN/LM4OWF
gini_data <- fread(here("data/raw/swiid9_4_summary.csv")) %>%
as_tibble() %>%
select(c("country", "year", "gini_disp"))
# Alternative: download World Bank Data using the WDI package:
# wb_data <- WDI::WDI(
# indicator = c(
# "gdp_pc"="NY.GDP.PCAP.PP.KD",
# "child_mort"="SH.DYN.MORT")
# )
# fwrite(wb_data, here("data/raw/wb_data.csv"))
# Prepare world bank data:--------------
# The wrangling steps for the World Bank data are not necessary if you
# download the data using the WDI package
wb_gdp_pc <- wb_gdp_pc_raw %>%
select(-c("Country Code", "Indicator Name", "Indicator Code", "V67")) %>%
tidyr::pivot_longer(
cols = -`Country Name`,
names_to = "year",
values_to = "gdp_pc")
wb_child_mort <- wb_child_mort_raw %>%
select(-c("Country Code", "Indicator Name", "Indicator Code", "V67")) %>%
tidyr::pivot_longer(
cols = -`Country Name`,
names_to = "year",
values_to = "child_mort")
wb_data <- inner_join(
x = wb_gdp_pc,
y = wb_child_mort,
by = c("Country Name", "year")) %>%
dplyr::mutate(
year = as.double(year)
)
# Merge and save data sets-----
full_data <- inner_join(
x = gini_data, y = wb_data,
by=c("country" = "Country Name", "year"))
fwrite(full_data, here("data/tidy/full_data_prepared.csv"))
# Summarize data-------------
full_data_prepared <- full_data %>%
dplyr::filter(year>=2010, year<=2020) %>%
dplyr::summarise(
gdp_pc = mean(gdp_pc, na.rm = TRUE),
child_mort = mean(child_mort, na.rm = TRUE),
gini_disp = mean(gini_disp, na.rm = TRUE),
.by = "country"
)
# Alternative using across:
full_data_prepared <- full_data %>%
dplyr::filter(year>=2010, year<=2020) %>%
dplyr::summarise(across(
.cols = where(is.numeric),
.fns = ~ mean(.x, na.rm = TRUE)),
.by = "country"
)
# Make the scatter plots----------
# Note: the color mapping could also be specified in ggplot(), but this would
# make the creation of the bonus plots below more difficult
gini_mortality <- ggplot(
data = full_data_prepared,
mapping = aes(
y = log(gini_disp),
x = log(child_mort))) +
geom_point(mapping = aes(color=country)) +
scale_color_viridis_d() +
labs(
title = "Inequality and child mortality",
x = "Child mortality (log)",
y = "Gini (disposable income, log)",
caption = "Data: SWIID and World Bank.") +
theme_bw() +
theme(legend.position = "none")
gini_mortality
gini_income <- ggplot(
data = full_data_prepared,
mapping = aes(
y = log(gini_disp),
x = log(gdp_pc))) +
geom_point(mapping = aes(color=country)) +
scale_color_viridis_d() +
labs(
title = "Inequality and income",
x = "GDP per capita (PPP, 2017$)",
y = "Gini (disposable income)",
caption = "Data: SWIID and World Bank.") +
theme_bw() +
theme(legend.position = "none")
gini_income
mortality_income <- ggplot(
data = full_data_prepared,
mapping = aes(
y = log(child_mort),
x = log(gdp_pc))) +
geom_point(mapping = aes(color=country)) +
scale_color_viridis_d() +
labs(
title = "Income and child mortality",
x = "GDP per capita (PPP, 2017$)",
y = "Child mortality (log)",
caption = "Data: World Bank.") +
theme_bw() +
theme(legend.position = "none")
mortality_income
# Bonus: joint plot using ggarrange():-----------
library(ggpubr)
plot_list <- ggarrange(
mortality_income, gini_mortality, gini_income, ncol = 3)
ggsave(
plot = plot_list,
filename = here("output/scatter_plots.pdf"),
width = 9, height = 3)
# Bonus: add a regression line-------------------
gini_mortality_line <- gini_mortality +
geom_smooth(
data = full_data_prepared,
mapping = aes(
y = log(gini_disp),
x = log(child_mort))
)
gini_income_line <- gini_income +
geom_smooth(
data = full_data_prepared,
mapping = aes(
y = log(gini_disp),
x = log(gdp_pc))
)
mortality_income_line <- mortality_income +
geom_smooth(
data = full_data_prepared,
mapping = aes(
y = log(child_mort),
x = log(gdp_pc))
)
line_plot_list <- ggarrange(
mortality_income_line, gini_mortality_line, gini_income_line, ncol = 3)
ggsave(
plot = line_plot_list,
filename = here("output/scatter_plots_regs.pdf"),
width = 9, height = 3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment