Data for the blog post "The Evolution of a ggplot (Vol. 1)"
library(tidyverse) | |
df_students <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-07/student_teacher_ratio.csv") | |
df_world_tile <- readr::read_csv("https://gist.githubusercontent.com/maartenzam/787498bbc07ae06b637447dbd430ea0a/raw/9a9dafafb44d8990f85243a9c7ca349acd3a0d07/worldtilegrid.csv") %>% | |
mutate( | |
## Namibias two-digit country code is handled as `NA` - let us fix that | |
alpha.2 = if_else(name == "Namibia", "NA", alpha.2), | |
## We are going to split "Americas" into "North America" and "Sout America" | |
region = if_else(region == "Americas", sub.region, region), | |
region = if_else(region %in% c("Northern America", "Central America", "Caribbean"), | |
"North America", region), | |
region = if_else(region == "Southern America", "South America", region), | |
## to join both data sets, we need a id column | |
country_code = alpha.3 | |
) | |
df_ratios <- df_students %>% | |
## Let's keep only the most recent data per country | |
group_by(country, indicator) %>% | |
filter(year == max(year)) %>% | |
ungroup() %>% | |
# Create `NA`s for countries which do not have any data 2012-2018 | |
complete(indicator, nesting(country, country_code)) %>% | |
## Let's focus on primary education and keep only countries (coded by letters) | |
filter( | |
indicator == "Primary Education", | |
str_detect(country_code, "[A-Z]") | |
) %>% | |
## merge with world tile map data | |
full_join(df_world_tile) %>% | |
filter( | |
!is.na(region), | |
!is.na(indicator) | |
) %>% | |
group_by(region) %>% | |
mutate(student_ratio_region = median(student_ratio, na.rm = T)) %>% | |
ungroup() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment