Skip to content

Instantly share code, notes, and snippets.

@jebyrnes
Created May 27, 2020 19:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jebyrnes/79478c48a142cc2f22359d8aaf871c9b to your computer and use it in GitHub Desktop.
Save jebyrnes/79478c48a142cc2f22359d8aaf871c9b to your computer and use it in GitHub Desktop.
Get the ISO subdivision codes from Wikipedia
library(rvest)
library(dplyr)
library(purrr)
library(tidyr)
# Index page that lists one entry per country:
# https://en.wikipedia.org/wiki/ISO_3166-2
baseurl <- "https://en.wikipedia.org"
url <- "https://en.wikipedia.org/wiki/ISO_3166-2"

# Collect the hrefs found in the index table; these are the pages that
# get scraped further down.
links <- local({
  # Only the first table on the page is used.
  first_table <- html_nodes(read_html(url), "table")[1]
  anchors <- html_nodes(html_nodes(first_table, "tr"), "a")
  hrefs <- html_attr(anchors, "href")
  # Keep only the hrefs that mention ISO_3166.
  grep("ISO_3166", hrefs, value = TRUE)
})
# Some scraped tables have columns with two headers, which produces repeated
# column names that the later row-binding step does not accept.
#
# adf: a data frame (one scraped table).
#
# Returns `adf` untouched when its column names are already unique. Otherwise
# every column name gets its position appended (e.g. "Name_3"), columns that
# were called exactly "Code" are reset to plain "Code", and the first row is
# dropped (presumably the second header row that was read in as data).
fix_dup_names <- function(adf) {
  col_names <- names(adf)
  if (anyDuplicated(col_names) == 0L) {
    return(adf)
  }

  suffixed <- paste(col_names, seq_along(col_names), sep = "_")
  suffixed[col_names == "Code"] <- "Code"
  names(adf) <- suffixed

  adf[-1, ]
}
# Download one page and return all of its sortable wikitables stacked into a
# single tibble.
#
# a_link: href relative to `baseurl`, e.g. "/wiki/ISO_3166-2:US".
# quiet:  when FALSE (the default) the link is printed before fetching.
#
# Returns a tibble with the union of the tables' columns plus `country_code`,
# which is `a_link` with the "/wiki/ISO_3166-2:" prefix removed.
parse_one_page <- function(a_link, quiet = FALSE) {
  if (!quiet) {
    print(a_link)
  }

  page <- read_html(paste0(baseurl, a_link))
  table_nodes <- html_nodes(page, "body table.wikitable.sortable")
  page_tables <- map(html_table(table_nodes, fill = TRUE), fix_dup_names)

  # rbindlist is used instead of bind_rows to deal with mixed column classes
  stacked <- as_tibble(data.table::rbindlist(page_tables, fill = TRUE))

  mutate(stacked, country_code = gsub("\\/wiki\\/ISO_3166-2\\:", "", a_link))
}
# Turn one scraped country table into long format.
#
# adf:   tibble produced by parse_one_page(); must contain a `Code` column.
# quiet: when FALSE (the default) the first row of `adf` is printed.
#
# Returns a tibble in which every column whose name does not match "Code" is
# stacked into the pair `subdivision_name_type` (the former column name) and
# `subdivision_name` (the cell value), and `Code` is renamed to `code`.
reshape_tab <- function(adf, quiet = FALSE) {
  if (!quiet) {
    print(adf[1, ])
  }

  # Coerce everything to character: some columns are parsed as integers.
  as_text <- mutate_all(adf, as.character)

  # NOTE(review): matches() ignores case by default, so columns such as
  # `country_code` or "Former code" also stay unpivoted -- confirm intended.
  long <- pivot_longer(
    as_text,
    cols = !matches("Code"),
    names_to = "subdivision_name_type",
    values_to = "subdivision_name"
  )

  rename(long, code = Code)
}
# Scrape every linked page, drop pages that produced no rows, then stack the
# per-country tables in long format.
tabs <- map(links, parse_one_page)
tabs <- discard(tabs, ~ nrow(.x) == 0)
iso_df <- map_df(tabs, reshape_tab)

# Columns that can supply a code when `code` itself is NA, in priority order.
# They originate from scraped table headers, so any of them may be absent.
fallback_code_cols <- c(
  "Former code",
  "Alternative code",
  "Netherlands ISO 3166-2 code"
)

# Fill NA values of `code` from each fallback column that exists in `adf`;
# names in `fallback_cols` that `adf` does not contain are skipped instead of
# raising an "object not found" error.
fill_missing_codes <- function(adf, fallback_cols) {
  for (fallback in intersect(fallback_cols, names(adf))) {
    adf$code <- coalesce(adf$code, adf[[fallback]])
  }
  adf
}

iso_df_filtered <- iso_df %>%
  filter(!is.na(subdivision_name)) %>%
  # fix missing codes
  fill_missing_codes(fallback_code_cols) %>%
  # get rid of rows that still have no code
  filter(!is.na(code)) %>%
  # Keep only the name-like columns. This must happen BEFORE de-duplicating:
  # otherwise a non-name row sharing the same code/name/country could be the
  # one kept by slice() and then be dropped here, losing the subdivision.
  filter(
    grepl("[nN]ame", subdivision_name_type) |
      subdivision_name_type == "Local variant"
  ) %>%
  # get rid of duplicate rows
  group_by(code, subdivision_name, country_code) %>%
  slice(1L) %>%
  ungroup() %>%
  # sift down
  select(code, country_code, subdivision_name_type, subdivision_name)

iso_df_filtered
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment