Instantly share code, notes, and snippets.

@tts /emigration.r
Last active Jun 13, 2016

Embed
What would you like to do?
Finnish emigration 1900-1945 as a sankey diagram
# See http://tuijasonkkila.fi/blog/2016/06/a-finnish-alien/
library(dplyr)
library(tidyr)
library(XLConnect)
# Fetch the source workbook into the working directory as data.xls.
# mode = "wb" writes the bytes unchanged: .xls is a binary format, and the
# default text mode can corrupt it on Windows.
download.file("http://www.siirtolaisuusinstituutti.fi/files/xls/maahan-_ja_maastamuutto/siirtolaisuus_1870-1945.xls",
              destfile = "data.xls", mode = "wb")
# Read the yearly rows, excluding the years before 1900 and the last column,
# Total (Yhteensä). The sheet has no header row in this range.
sheet <- readWorksheetFromFile("data.xls", sheet = "Taulukko", startRow = 35,
                               endRow = 80, endCol = 12, header = FALSE)
# Rename columns
names(sheet) <- c("Year", "Faraway", "Americas", "US", "Canada",
                  "Latin America", "Australia and New Zealand",
                  "Asia and Africa", "Sweden", "Soviet Union",
                  "Rest of Europe", "Unknown")
# Exclude the Faraway column
sheet <- sheet[, c(1, 3:12)]
# Clean data: ".. " and "- " are placeholder cells, not counts
sheet[sheet == ".. "] <- NA
sheet[sheet == "- "] <- NA
# Strip the UTF-8 non-breaking space bytes from the Americas figures so they
# parse as numbers
sheet$Americas <- gsub("\xc2\xa0", "", sheet$Americas)
# Trim every column as text. Assigning into data[] keeps the data frame and
# its column names (including those with spaces) intact, without the
# matrix round trip that apply() would force.
data <- sheet
data[] <- lapply(data, function(x) trimws(as.character(x)))
# Type conversion: all 11 remaining columns hold whole numbers
data[] <- lapply(data, as.integer)
# Aggregate over decades
# Each year is mapped to its decade label explicitly, so the labels cannot
# drift out of step with the aggregated rows. Spans (left-closed intervals):
#   "1900" = 1900-1910, "1910" = 1911-1920, "1920" = 1921-1930,
#   "1930" = 1931-1940, "1940" = 1941-1945
# Years outside 1900-1945 get NA here and are left out by aggregate().
decade <- cut(data$Year,
              breaks = c(1900, 1911, 1921, 1931, 1941, 1946),
              labels = c("1900", "1910", "1920", "1930", "1940"),
              right = FALSE)
# Sum every destination column per decade; Year itself is not summed
counts <- data[, setdiff(names(data), "Year"), drop = FALSE]
sums <- aggregate(counts, by = list(Decade = decade),
                  FUN = sum, na.rm = TRUE)
# Destination columns first, Decade (as character) last, so that the
# Americas:Unknown column range stays contiguous
sums$Decade <- as.character(sums$Decade)
sums <- sums[, c(names(counts), "Decade")]
# Reshape to long format: one row per (destination, decade) pair
sums_long <- gather(sums, to, value, Americas:Unknown)
# The decade is the origin side of each flow
sums_long$from <- sums_long$Decade
sums_long <- sums_long[, c("to", "from", "value")]
# Flow sizes as numeric
sums_long$value <- as.numeric(sums_long$value)
# All-character copy of the long table
sums_long_df <- data.frame(
  lapply(sums_long, as.character),
  stringsAsFactors = FALSE
)
# Define source and target nodes of the diagram: the decades first, then the
# destinations. Note: numbering has to start from 0
nodes <- data.frame(
  name = c(unique(sums_long_df$from), unique(sums_long_df$to)),
  stringsAsFactors = FALSE
)
# seq_len() also copes with an empty node table
nodes$id <- seq_len(nrow(nodes)) - 1
# and links between them: look up each endpoint's id by name in one
# vectorized pass; the result is a plain, ungrouped data frame
links <- data.frame(
  source = nodes$id[match(sums_long$from, nodes$name)],
  target = nodes$id[match(sums_long$to, nodes$name)],
  value = sums_long$value
)
# networkD3 is installed from its 'tooltipCon' branch; for background see
# https://github.com/christophergandrud/networkD3/issues/126
devtools::install_github(
  "christophergandrud/networkD3",
  ref = "tooltipCon"
)
library(networkD3)
# Build the sankey widget from the link and node tables, then display it
sn <- sankeyNetwork(
  Links = links,
  Nodes = nodes,
  Source = "source",
  Target = "target",
  Value = "value",
  NodeID = "name",
  fontSize = 12,
  nodeWidth = 30,
  height = 700,
  width = 700
)
sn
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment