guyabel/migrant_sankey.R

## migrant_sankey.R
library(tidyverse)
library(readxl)
library(tweenr)
library(ggforce)
library(gridExtra)
library(animation)
library(magick)

# download the migrant stock origin-destination xlsx file from the UN site
x <- tempfile(fileext = ".xlsx")
download.file(
  url = "https://www.un.org/development/desa/pd/sites/www.un.org.development.desa.pd/files/undesa_pd_2020_ims_stock_by_sex_destination_and_origin.xlsx",
  destfile = x, mode = "wb"
)

# read in data for all years
d <- read_excel(path = x, sheet = 2, range = "B11:N37062", na = "..", guess_max = 1e5)
d

# # read in data for all years, including "Other" origin for unknowns, not in sheet 2
# d <- tibble(year = seq(from = 2020, to = 1990, by = -5),
#             sheet = 3:9) %>%
#   mutate(m = map(.x = sheet, .f = ~read_excel(path = x, sheet = .x, skip = 10)))
# d

# codes for world bank income groups and "unknown" origin
c1 <- c(1503:1500, 2003)

# world bank income group data frame
d1 <- d %>%
  select(-2, -4) %>%
  rename(dest = 1,
         dest_code = 2,
         orig = 3,
         orig_code = 4) %>%
  filter(orig_code %in% c1,
         dest_code %in% c1) %>%
  pivot_longer(cols = -(1:4), names_to = "year", values_to = "stock") %>%
  mutate_if(is.character, ~str_remove(string = ., pattern = "-income countries")) %>%
  mutate(year = as.numeric(year),
         stock = stock/1e6) %>%
  select(-contains("code"))
d1

# # code for world bank income group data frame if read in data with "unknown" origin
# d0 <- d %>%
#   unnest(m) %>%
#   select(-sheet, -3, -5) %>%
#   rename(dest = 2, dest_code = 3)  %>%
#   pivot_longer(cols = -(1:3), names_to = "orig_code", values_to = "stock") %>%
#   filter(dest_code %in% c1,
#          orig_code %in% c1) %>%
#   mutate(orig_code = as.numeric(orig_code),
#          dest = str_remove(string = dest,
#                            pattern = "-income countries"))
#
# # origin names to join on to data frame
# c1 <- d0 %>%
#   distinct(dest, dest_code) %>%
#   set_names(nm = str_replace(string = names(.),
#                              pattern = "dest",
#                              replacement = "orig"))
#
# # join on origin names
# d1 <- d0 %>%
#   left_join(c1) %>%
#   replace_na(list(orig = "Unknown")) %>%
#   mutate(stock = stock/1e6) %>%
#   select(-contains("code"))

# tween data frame
d2 <- d1 %>%
  mutate(corridor = paste(orig, dest, sep = " -> ")) %>%
  select(corridor, year, stock) %>%
  mutate(ease = "linear") %>%
  tween_elements(time = "year", group = c("corridor"), ease = "ease", nframes = 50) %>%
  as_tibble() %>%
  separate(col = .group, into = c("orig", "dest"), sep = " -> ") %>%
  relocate(orig, dest)
d2

# format data for sankey plot using gather_set_data() and factors for plot ordering
d2 <- d2 %>%
  mutate(orig = factor(orig, levels = unique(d1$orig)),
         dest = factor(dest, levels = unique(d1$orig))) %>%
  arrange(orig, dest, .frame) %>%
  gather_set_data(x = 1:2) %>%
  mutate(x = fct_inorder(x))
d2

# set up data frame for position of labels; h for the horizontal justification, n for nudge away from center
p <- d2 %>%
  distinct(x, y) %>%
  mutate(h = as.numeric(x == "orig"),
         n = h * -0.1 + 0.05)
p

# plots within for loop
g <- NULL
for(f in unique(d2$.frame)){
  # year for title
  y <- d2 %>%
    filter(.frame == f) %>%
    pull(year) %>%
    unique() %>%
    `/`(5) %>%
    round() %>%
    `*`(5)

  # plot for each frame
  g[[f + 1]] <-
    ggplot(data = filter(d2, .frame == f),
           mapping = aes(x = x, id = id, value = stock, split = y)) +
    geom_parallel_sets(mapping = aes(fill = orig), axis.width = -0.05, alpha = 0.5) +
    geom_parallel_sets_axes(axis.width = 0.05, fill = "transparent", colour = "black") +
    geom_parallel_sets_labels(angle = 0, size = 3, lineheight = 0.8, hjust = p$h, position = position_nudge(x = p$n)) +
    guides(fill = FALSE) +
    scale_fill_brewer(palette = "Set1") +
    scale_x_discrete(labels = c(orig = "Place of Birth", dest = "Place of Residence")) +
    # uncomment ylim for fixed limit plot, 320 chosen after inspecting initial plots, use 350 when including "unknown" origin data
    # ylim(c(0, 320)) +
    theme_minimal() +
    theme(plot.subtitle = element_text(color = "darkgrey", size = 8)) +
    labs(x = "", y = "Migrants (millions)",
         title = paste("Migrant Population by World Bank Income Groups", y),
         subtitle = "Plot by @guyabelguyabel",
         caption = "Data from http://unmigration.org")
}

# create Grob for saving multiple plots to multi-page PDF
m <- marrangeGrob(g, nrow = 1, ncol = 1, top = NULL)

# save the Grob to PDF
ggsave(filename = "migrant_sankey.pdf", plot = m, height = 7, width = 5)

# set up frames for animation, pauses on first and last plot
ff <- c(rep(1, 5), 2:50, rep(51, 5))

# bring in image files from each page of PDF
pp <- image_read_pdf("migrant_sankey.pdf")
# image_info(pp)

# create animated version of images in the PDF
saveVideo(expr = {
  for(j in ff){
    img1 <- pp[j]
    par(mar = rep(0,4))
    plot(as.raster(img1))
  }},
  ani.width = 1500, ani.height = 2100, n = length(ff), loop = TRUE, interval = 1/10,
  ffmpeg = "C:/ffmpeg/bin/ffmpeg.exe",
  video.name = "migrant_sankey.mp4")
file.show("migrant_sankey.mp4")
	library(tidyverse)
	library(readxl)
	library(tweenr)
	library(ggforce)
	library(gridExtra)
	library(animation)
	library(magick)

	# download the migrant stock origin-destination xlsx file from the UN site
	x <- tempfile(fileext = ".xlsx")
	download.file(
	url = "https://www.un.org/development/desa/pd/sites/www.un.org.development.desa.pd/files/undesa_pd_2020_ims_stock_by_sex_destination_and_origin.xlsx",
	destfile = x, mode = "wb"
	)

	# read in data for all years
	d <- read_excel(path = x, sheet = 2, range = "B11:N37062", na = "..", guess_max = 1e5)
	d

	# # read in data for all years, including "Other" origin for unknowns, not in sheet 2
	# d <- tibble(year = seq(from = 2020, to = 1990, by = -5),
	# sheet = 3:9) %>%
	# mutate(m = map(.x = sheet, .f = ~read_excel(path = x, sheet = .x, skip = 10)))
	# d

	# codes for world bank income groups and "unknown" origin
	c1 <- c(1503:1500, 2003)

	# world bank income group data frame
	d1 <- d %>%
	select(-2, -4) %>%
	rename(dest = 1,
	dest_code = 2,
	orig = 3,
	orig_code = 4) %>%
	filter(orig_code %in% c1,
	dest_code %in% c1) %>%
	pivot_longer(cols = -(1:4), names_to = "year", values_to = "stock") %>%
	mutate_if(is.character, ~str_remove(string = ., pattern = "-income countries")) %>%
	mutate(year = as.numeric(year),
	stock = stock/1e6) %>%
	select(-contains("code"))
	d1

	# # code for world bank income group data frame if read in data with "unknown" origin
	# d0 <- d %>%
	# unnest(m) %>%
	# select(-sheet, -3, -5) %>%
	# rename(dest = 2, dest_code = 3) %>%
	# pivot_longer(cols = -(1:3), names_to = "orig_code", values_to = "stock") %>%
	# filter(dest_code %in% c1,
	# orig_code %in% c1) %>%
	# mutate(orig_code = as.numeric(orig_code),
	# dest = str_remove(string = dest,
	# pattern = "-income countries"))
	#
	# # origin names to join on to data frame
	# c1 <- d0 %>%
	# distinct(dest, dest_code) %>%
	# set_names(nm = str_replace(string = names(.),
	# pattern = "dest",
	# replacement = "orig"))
	#
	# # join on origin names
	# d1 <- d0 %>%
	# left_join(c1) %>%
	# replace_na(list(orig = "Unknown")) %>%
	# mutate(stock = stock/1e6) %>%
	# select(-contains("code"))

	# tween data frame
	d2 <- d1 %>%
	mutate(corridor = paste(orig, dest, sep = " -> ")) %>%
	select(corridor, year, stock) %>%
	mutate(ease = "linear") %>%
	tween_elements(time = "year", group = c("corridor"), ease = "ease", nframes = 50) %>%
	as_tibble() %>%
	separate(col = .group, into = c("orig", "dest"), sep = " -> ") %>%
	relocate(orig, dest)
	d2

	# format data for sankey plot using gather_set_data() and factors for plot ordering
	d2 <- d2 %>%
	mutate(orig = factor(orig, levels = unique(d1$orig)),
	dest = factor(dest, levels = unique(d1$orig))) %>%
	arrange(orig, dest, .frame) %>%
	gather_set_data(x = 1:2) %>%
	mutate(x = fct_inorder(x))
	d2

	# set up data frame for position of labels; h for the horizontal justification, n for nudge away from center
	p <- d2 %>%
	distinct(x, y) %>%
	mutate(h = as.numeric(x == "orig"),
	n = h * -0.1 + 0.05)
	p

	# plots within for loop
	g <- NULL
	for(f in unique(d2$.frame)){
	# year for title
	y <- d2 %>%
	filter(.frame == f) %>%
	pull(year) %>%
	unique() %>%
	`/`(5) %>%
	round() %>%
	`*`(5)

	# plot for each frame
	g[[f + 1]] <-
	ggplot(data = filter(d2, .frame == f),
	mapping = aes(x = x, id = id, value = stock, split = y)) +
	geom_parallel_sets(mapping = aes(fill = orig), axis.width = -0.05, alpha = 0.5) +
	geom_parallel_sets_axes(axis.width = 0.05, fill = "transparent", colour = "black") +
	geom_parallel_sets_labels(angle = 0, size = 3, lineheight = 0.8, hjust = p$h, position = position_nudge(x = p$n)) +
	guides(fill = FALSE) +
	scale_fill_brewer(palette = "Set1") +
	scale_x_discrete(labels = c(orig = "Place of Birth", dest = "Place of Residence")) +
	# uncomment ylim for fixed limit plot, 320 chosen after inspecting initial plots, use 350 when including "unknown" origin data
	# ylim(c(0, 320)) +
	theme_minimal() +
	theme(plot.subtitle = element_text(color = "darkgrey", size = 8)) +
	labs(x = "", y = "Migrants (millions)",
	title = paste("Migrant Population by World Bank Income Groups", y),
	subtitle = "Plot by @guyabelguyabel",
	caption = "Data from http://unmigration.org")
	}

	# create Grob for saving multiple plots to multi-page PDF
	m <- marrangeGrob(g, nrow = 1, ncol = 1, top = NULL)

	# save the Grob to PDF
	ggsave(filename = "migrant_sankey.pdf", plot = m, height = 7, width = 5)

	# set up frames for animation, pauses on first and last plot
	ff <- c(rep(1, 5), 2:50, rep(51, 5))

	# bring in image files from each page of PDF
	pp <- image_read_pdf("migrant_sankey.pdf")
	# image_info(pp)

	# create animated version of images in the PDF
	saveVideo(expr = {
	for(j in ff){
	img1 <- pp[j]
	par(mar = rep(0,4))
	plot(as.raster(img1))
	}},
	ani.width = 1500, ani.height = 2100, n = length(ff), loop = TRUE, interval = 1/10,
	ffmpeg = "C:/ffmpeg/bin/ffmpeg.exe",
	video.name = "migrant_sankey.mp4")
	file.show("migrant_sankey.mp4")