Scrape the Wikipedia terror incidents pages/lists
#
# copy and run this code in R to set up and source `wikipedia-terrorism.R` from this gist
#
packs <- c("dplyr", "rvest", "httr", "XML", "stringr", "purrr", "xml2", "tibble", "devtools")
need <- packs[!packs %in% rownames(installed.packages())]
install.packages(need)
devtools::source_gist("https://gist.github.com/andybega/0f16b6a0de0185c157e2a280b5e05074",
                      filename = "wikipedia-terrorism.R")
library("dplyr")
wikipedia_terrorism_scrape(from = "2019-08")
wikipedia-terrorism.R
#
# Scrape the Wikipedia terror incidents pages/lists
# 2019-10-02
#
# this relies on a hard-coded URL for the terrorism page list template page,
# which includes URLs for all relevant pages with data tables
# https://en.wikipedia.org/wiki/Template:Lists_of_Terrorist_Incidents
#
# dependencies that should be installed for this to work:
# c("dplyr", "rvest", "httr", "XML", "stringr", "purrr", "xml2", "tibble")
#
# only the pipe operator needs to be actually attached; all other functions are
# called with package::func(), so it should not be necessary to actually load
# the packages
#
# try:
#
# packs <- c("dplyr", "rvest", "httr", "XML", "stringr", "purrr", "xml2", "tibble")
# need <- packs[!packs %in% rownames(installed.packages())]
# install.packages(need)
# library("dplyr")
#
# usage:
#
# source this script, then:
#
# wikipedia_terrorism_scrape(from = "2019-08")
#
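# the result is a single tibble with one row per incident: the original table
# columns are kept with lower-cased, underscore-separated names, and the
# scraper adds parsed `date`, `end_date` (for multi-day entries), and
# `dead_min` columns
#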
#' @param from year or "YYYY-MM" string
#' @param to year or "YYYY-MM" string, if NULL will default to latest
#'
#' @export
wikipedia_terrorism_scrape <- function(from = 2019, to = NULL) {
  # parse from and to dates
  if (nchar(from)==4) from <- as.Date(paste0(from, "-01-01"))
  if (nchar(from)==7) from <- as.Date(paste0(from, "-01"))
  if (!inherits(from, "Date")) {
    stop("Could not parse 'from', form should be 'YYYY' or 'YYYY-MM'")
  }
  if (is.null(to)) to <- 2100
  if (nchar(to)==4) to <- as.Date(paste0(to, "-01-01"))
  if (nchar(to)==7) to <- as.Date(paste0(to, "-01"))
  if (!inherits(to, "Date")) {
    stop("Could not parse 'to', form should be 'YYYY' or 'YYYY-MM'")
  }
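  # e.g. from = "2019-08" becomes as.Date("2019-08-01") and from = 2019 becomes
  # as.Date("2019-01-01"); to = NULL becomes as.Date("2100-01-01"), i.e.
  # effectively no upper cutoff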
  urls <- wikipedia_terrorism_pages()
  # extract normalized date from URL
  url_date <- wikipedia_terrorism_parse_url_date(urls)[["start"]]
  want <- url_date >= from & url_date <= to
  urls <- urls[want]
  # the actual meat and potatoes: loop through and get the pages/tables/data
  datatbl <- list()
  for (i in seq_along(urls)) {
    tbl_i <- wikipedia_terrorism_scrape_table(urls[i])
    datatbl[[i]] <- tbl_i
  }
  datatbl <- dplyr::bind_rows(datatbl)
  datatbl
}
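
# example: monthly incident and minimum fatality totals; a sketch that assumes
# the columns produced above ('date', 'dead_min') and that dplyr is attached
#
# incidents <- wikipedia_terrorism_scrape(from = "2019-01", to = "2019-03")
# incidents %>%
#   dplyr::mutate(month = format(date, "%Y-%m")) %>%
#   dplyr::group_by(month) %>%
#   dplyr::summarize(incidents = dplyr::n(),
#                    dead_min = sum(dead_min, na.rm = TRUE))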
#' @export
wikipedia_terrorism_pages <- function() {
  url <- "https://en.wikipedia.org/wiki/Template:Lists_of_Terrorist_Incidents"
  page <- xml2::read_html(url)
  page_urls <- rvest::html_nodes(page, "a") %>%
    rvest::html_attr("href") %>%
    purrr::keep(stringr::str_detect(., "terrorist_incidents_in")) %>%
    paste0("https://en.wikipedia.org/", .)
  # from 2015 on, there are month pages but also summary year pages
  # exclude the year pages to avoid duplication
  exclude <- page_urls %>%
    stringr::str_extract(., "in_[0-9]{4}$") %>%
    stringr::str_extract(., "[0-9]+") %>%
    as.integer() %>%
    sapply(., function(x) isTRUE(x >= 2015))
  page_urls <- page_urls[!exclude]
  page_urls
}
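
# the returned URLs look like the following (note the doubled slash produced by
# the paste0() above; Wikipedia tolerates it):
#
# https://en.wikipedia.org//wiki/List_of_terrorist_incidents_in_2006
# https://en.wikipedia.org//wiki/List_of_terrorist_incidents_in_January%E2%80%93June_2011
# https://en.wikipedia.org//wiki/List_of_terrorist_incidents_in_September_2019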
#' @export
wikipedia_terrorism_scrape_table <- function(url) {
  page <- httr::GET(url)
  # a page can include multiple tables, some of which may be irrelevant
  all_tables <- XML::readHTMLTable(doc = httr::content(page, "text"),
                                   stringsAsFactors = FALSE,
                                   header = TRUE)
  # filter out irrelevant tables
  all_tables <- id_relevant_tables(all_tables)
  table_ids <- names(all_tables)
  # iterate through data tables and clean
  datatbl <- list()
  for (i in seq_along(all_tables)) {
    tbl_i <- tibble::as_tibble(all_tables[[i]])
    table_id_i <- table_ids[i]
    date_stem <- wikipedia_terrorism_parse_table_date(table_id_i, url)
    colnames(tbl_i) <- colnames(tbl_i) %>%
      tolower() %>%
      stringr::str_replace_all(" ", "_") %>%
      stringr::str_replace_all("\n", "")
    # replace em-dash with regular dash for dates like 1-3 January
    tbl_i$date <- stringr::str_replace(tbl_i$date, "\u2013", "-")
    # some tables have dates like "1" or "1-3"; others include the month,
    # like "January 1"
    # if there is no month, use the date stem from the URL to identify it
    month <- ifelse(stringr::str_detect(tbl_i$date, "^[0-9\\-]+$"),
                    date_stem$month,
                    tbl_i$date %>%
                      stringr::str_extract("^[A-Za-z]+") %>%
                      match(month.name)
    )
    # take out the month in the date if present, leaving only the day;
    # keep the original for a debugging dump in case of parsing failures
    date_orig <- tbl_i$date
    tbl_i$date <- tbl_i$date %>%
      stringr::str_extract("[0-9\\-]+$")
    # for multi-day dates, keep the first day and record the end date in the
    # end_date variable
    tbl_i$end_date <- tbl_i$date %>%
      stringr::str_extract(., "-[0-9]{1,2}$") %>%
      stringr::str_remove("-") %>%
      ifelse(!is.na(.), paste(date_stem$year, month, ., sep = "-"), .) %>%
      as.Date(format = "%Y-%m-%d")
    tbl_i$date <- tbl_i$date %>%
      stringr::str_extract(., "^[0-9]+") %>%
      paste(date_stem$year, month, ., sep = "-") %>%
      as.Date(format = "%Y-%m-%d")
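    # e.g. a "1-3" entry on a January 2019 page yields
    # date = as.Date("2019-01-01") and end_date = as.Date("2019-01-03")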
    # parse minimum fatality count
    tbl_i$dead_min <- tbl_i$dead %>%
      stringr::str_extract(., "^[0-9]+") %>%
      as.numeric()
    # warn if there were parsing failures
    if (any(is.na(tbl_i$date))) {
      idx <- is.na(tbl_i$date)
      payload <- tibble::as_tibble(cbind(orig = date_orig[idx],
                                         tbl_i[idx, "date"],
                                         stringsAsFactors = FALSE))
      payload <- paste(capture.output(print(payload)), collapse = "\n")
      warning("Some dates were not parsed\n",
              "Table: ", table_id_i, "\n",
              payload)
    }
    if (any(is.na(tbl_i$dead_min))) {
      payload <- tbl_i[is.na(tbl_i$dead_min), c("dead", "dead_min")]
      payload <- paste(capture.output(print(payload)), collapse = "\n")
      warning("Some fatality figures were not parsed\n",
              "Table: ", table_id_i, "\n",
              payload)
    }
    # finished with this chunk
    datatbl[[i]] <- tbl_i
  }
  datatbl <- dplyr::bind_rows(datatbl)
  datatbl
}
wikipedia_terrorism_parse_table_date <- function(table_id, url) {
  stopifnot(length(table_id)==1 | is.null(table_id),
            length(url)==1)
  if (length(table_id)==1) {
    if (table_id=="NULL") table_id <- NULL
  }
  url_dates <- wikipedia_terrorism_parse_url_date(url)
  if (is.null(table_id)) {
    stem <- list(year = as.integer(substr(url_dates$start, 1, 4)),
                 month = NA)
  } else {
    # table_id like terrorIncidents2011Feb
    table_id_end <- stringr::str_extract(table_id, "[0-9]{4}[A-Za-z]{3}")
    year <- as.integer(substr(table_id_end, 1, 4))
    month <- match(substr(table_id_end, 5, 7), month.abb)
    # sometimes a table can have an incorrect ID on Wikipedia, e.g. the
    # page URL is for January 2019 events but the table ID ends in something
    # else; check for this and warn
    if (url_dates$start==url_dates$end) {
      url_month <- as.integer(substr(url_dates$start, 6, 7))
      if (url_month!=month) {
        warning("Wikipedia table ID date does not match implied URL date, using URL date:\n",
                "Table ID: ", table_id, "\n",
                "URL: ", url)
        month <- url_month
      }
    }
    stem <- list(year = year, month = month)
  }
  stem
}
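
# e.g. a table ID like "terrorIncidents2011Feb" yields
# list(year = 2011, month = 2); with a missing ("NULL") table ID only the year
# can be recovered, from the URL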
wikipedia_terrorism_parse_url_date <- function(x) {
  # there are three kinds of URLs
  # year only: https://en.wikipedia.org//wiki/List_of_terrorist_incidents_in_2006
  # year and month range: https://en.wikipedia.org//wiki/List_of_terrorist_incidents_in_January%E2%80%93June_2011
  # year and month: List_of_terrorist_incidents_in_September_2019
  xbase <- basename(x)
  dates <- lapply(xbase, function(x) {
    year <- x %>%
      stringr::str_extract("[0-9]{4}$") %>%
      as.integer()
    if (stringr::str_detect(x, "incidents_in_[0-9]{4}$")) {
      start_date <- as.Date(paste0(year, "-01-01"))
      end_date <- as.Date(paste0(year, "-12-01"))
    }
    if (stringr::str_detect(x, "%E2%80%93")) {
      month1 <- x %>%
        stringr::str_extract("[A-Za-z]+%E2%80%93") %>%
        stringr::str_remove("%E2%80%93")
      month2 <- x %>%
        stringr::str_extract("%E2%80%93[A-Za-z]+") %>%
        stringr::str_remove("%E2%80%93")
      start_date <- as.Date(paste0(year, "-", month1, "-01"), format = "%Y-%B-%d")
      end_date <- as.Date(paste0(year, "-", month2, "-01"), format = "%Y-%B-%d")
    }
    if (stringr::str_detect(x, "_in_[A-Za-z]+_[0-9]+$")) {
      month <- x %>%
        stringr::str_extract(., "_[A-Za-z]+_[0-9]{4}$") %>%
        stringr::str_extract("[A-Za-z]+")
      start_date <- end_date <- as.Date(paste0(year, "-", month, "-01"),
                                        format = "%Y-%B-%d")
    }
    tibble::tibble(start = start_date, end = end_date)
  })
  # put it together
  dplyr::bind_rows(dates)
}
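
# e.g. the three URL forms above map to
#   start = 2006-01-01, end = 2006-12-01 (year-only page)
#   start = 2011-01-01, end = 2011-06-01 (January-June range page)
#   start = 2019-09-01, end = 2019-09-01 (single-month page)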
id_relevant_tables <- function(x) {
  # for year pages, e.g. 2011, the tables don't have names;
  # for year-month pages the tables usually have names to ID them
  # either way, it is easier to check whether the first column is named "Date\n"
  yes <- sapply(x, function(xx) {
    isTRUE(names(xx)[1]=="Date\n")
  })
  if (!any(yes)) {
    stop("Something is wrong, could not find any relevant tables")
  }
  x[yes]
}
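
# quick smoke test (illustrative; not run when sourcing, requires internet
# access):
#
# sep2019 <- wikipedia_terrorism_scrape(from = "2019-09", to = "2019-09")
# str(sep2019[, c("date", "end_date", "dead_min")])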