# list of lists
url <- "https://en.wikipedia.org/wiki/Lists_of_World_Heritage_Sites"
# Table of sites per country can be found at each of these pages
# probably the simplest entry point
# Africa
africa <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Africa"
# Americas
n_america <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_North_America"
c_america <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Central_America"
caribbean <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_the_Caribbean"
s_america <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_South_America"
# Asia
n_and_c_asia <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Northern_and_Central_Asia"
w_asia <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Western_Asia"
e_asia <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_sites_in_Eastern_Asia"
s_asia <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_sites_in_Southern_Asia"
# SE Asia list currently removed due to copyvio, so we scrape an old revision :(
se_asia <- "https://en.wikipedia.org/w/index.php?title=List_of_World_Heritage_Sites_in_Southeast_Asia&direction=prev&oldid=878049179"
# Europe
n_europe <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Northern_Europe"
w_europe <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_sites_in_Western_Europe"
e_europe <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_sites_in_Eastern_Europe"
s_europe <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Southern_Europe"
# Oceania
oceania <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Oceania"
#-----------------------------------------------------
# Get the table of WH locations from each page listed above
library('rvest')
library('tidyverse')
library("parzer") # devtools::install_github("ropenscilabs/parzer")
library("tmap")
data("World")
library(vegan)
#-----------------------------------------------------
# get table of sites per region
get_table_of_sites_per_region <- function(region){
  # here are all tables on the page
  region_table_xml_all_tables <-
    read_html(region) %>%
    html_nodes("table")
  # we need to get the xml of the table of sites, since we will use the XML later
  if(region != se_asia){
    which_table_has_the_sites <-
      map(region_table_xml_all_tables,
          ~html_table(.x, fill = TRUE)) %>%
      map_lgl(~names(.x)[1] == "Site")
    region_table_xml <-
      keep(region_table_xml_all_tables,
           which_table_has_the_sites)
    # here is the table of sites for that region
    # first column is called "Site"
    region_table_xml_table <-
      map(region_table_xml,
          html_table,
          fill = TRUE) %>%
      keep(~names(.x)[1] == "Site") %>%
      .[[1]] %>%
      as_tibble(., .name_repair = "unique")
  } else {
    # the SE Asia table is troublesome
    region_table_xml <- region_table_xml_all_tables[3]
    region_table_xml_table <- html_table(region_table_xml)[[1]]
  }
  # filter so we only keep the cultural sites
  region_tbl <-
    region_table_xml_table %>%
    filter(str_detect(Criteria, 'Cultural')) %>%
    mutate(Year_num = ifelse(is.integer(Year), Year, parse_number(Year)))
  # scrape out the coords for mapping
  coords_chars <- "\\d|°|′|″|N|E|S|W|\\."
  region_coords <-
    map_chr(str_extract_all(region_tbl$Location,
                            coords_chars),
            ~paste0(.x, collapse = "") %>%
              str_replace_all(., "^[A-Z]*", "") %>%
              str_extract(., ".+?(?=E|W)"))
  region_coords_clean <-
    region_coords %>%
    str_split("N|S") %>%
    Reduce(rbind, .) %>%
    as_tibble(., .name_repair = "universal") %>%
    mutate(lat = parse_lat(`...1`),
           lon = parse_lon(`...2`))
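  # A minimal illustration of the extraction above on a made-up Location
  # string (the example value is an assumption about the shape of the
  # Wikipedia cells; "^[A-Z]*" strips capitals picked up from place names,
  # and ".+?(?=E|W)" trims at the longitude hemisphere letter):
  #   "Abu Mena, Egypt30°50′30″N29°39′50″E"
  #   -> collapse matched chars:  "E30°50′30″N29°39′50″E"
  #   -> strip leading capitals:  "30°50′30″N29°39′50″E"
  #   -> trim at E/W:             "30°50′30″N29°39′50″"
  #   -> split on N|S:            "30°50′30″"  "29°39′50″"  (parse_lat / parse_lon)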
  # attach clean coords to main table
  region_tbl_coords <-
    region_tbl %>%
    bind_cols(region_coords_clean)
  # get country of site from location text
  # Laos, Czech Republic, Micronesia, Zimbabwe, South Sudan, Chad,
  # Central African Republic, Congo, Gabon, Cameroon, Nigeria, Bosnia and Herzegovina,
  # Cote d'Ivoire, Sierra Leone, Guyana, Belize
  country_names <- paste(World$name, collapse="|")
  region_tbl_coords <-
    region_tbl_coords %>%
    mutate(country = str_extract(Location,
                                 regex(country_names,
                                       ignore.case=TRUE)))
  # get links to each site's wiki page
  get_link <- function(html_table, Site){
    html_table %>%
      html_nodes(xpath=paste0("//a[text()='", Site, "']")) %>%
      html_attr("href")
  }
  region_tbl_coords_links <-
    region_tbl_coords %>%
    mutate(site_page_name = map_chr(Site,
                                    ~get_link(region_table_xml, .x)[1])) %>%
    mutate(site_page_link = as.character(str_glue('https://en.wikipedia.org{site_page_name}')))
  return(region_tbl_coords_links)
}
#-----------------------------------------------------
# get the wikilink to each page for each WH site in each region
tbl_africa <- map_df(africa, get_table_of_sites_per_region)
tbl_n_america <- map_df(n_america, get_table_of_sites_per_region)
tbl_c_america <- map_df(c_america, get_table_of_sites_per_region)
tbl_caribbean <- map_df(caribbean, get_table_of_sites_per_region)
tbl_s_america <- map_df(s_america, get_table_of_sites_per_region)
tbl_n_and_c_asia <- map_df(n_and_c_asia, get_table_of_sites_per_region)
tbl_w_asia <- map_df(w_asia, get_table_of_sites_per_region)
tbl_e_asia <- map_df(e_asia, get_table_of_sites_per_region)
tbl_s_asia <- map_df(s_asia, get_table_of_sites_per_region)
# special handling needed
tbl_se_asia <- map_df(se_asia, get_table_of_sites_per_region)
tbl_n_europe <- map_df(n_europe, get_table_of_sites_per_region)
tbl_w_europe <- map_df(w_europe, get_table_of_sites_per_region)
tbl_e_europe <- map_df(e_europe, get_table_of_sites_per_region)
tbl_s_europe <- map_df(s_europe, get_table_of_sites_per_region)
# Russia's list page has its own, differently-structured table;
# tbl_russia is built by the separate script below and must exist before the next step
tbl_oceania <- map_df(oceania, get_table_of_sites_per_region)
# Put them all into one big data frame
# ("Areaha (acre)" is the column name exactly as scraped from the Wikipedia header)
cols_we_want <-
  c(
    "Site",
    "Location",
    "Criteria",
    "Areaha (acre)",
    "Year_num",
    "lat",
    "lon",
    "country",
    "site_page_name",
    "site_page_link"
  )
all_regions <- list(
  tbl_africa,
  tbl_n_america,
  tbl_c_america,
  tbl_caribbean,
  tbl_s_america,
  tbl_n_and_c_asia,
  tbl_w_asia,
  tbl_e_asia,
  tbl_s_asia,
  tbl_se_asia,
  tbl_n_europe,
  tbl_w_europe,
  tbl_e_europe,
  tbl_s_europe,
  tbl_russia,
  tbl_oceania
)
wh_wiki_table <- map_df(all_regions, ~select(., cols_we_want))
write_csv(wh_wiki_table, 'data/wh_wiki_table.csv')
# nothing from Russia or the UK
# sites that we have wikipages for
all_regions_cols_we_want_with_pages <-
  wh_wiki_table %>%
  filter(!is.na(site_page_name))
# sites that we do not have wikipages for
all_regions_cols_we_want_without_pages <-
  wh_wiki_table %>%
  filter(is.na(site_page_name))
# ratio of sites-with-pages to all-sites-in-the-country
country_site_page_ratio <-
  wh_wiki_table %>%
  group_by(country) %>%
  count(pg = !is.na(site_page_name)) %>%
  spread(pg, n) %>%
  mutate(no_page = ifelse(is.na(`FALSE`), 0, `FALSE`),
         has_page = ifelse(is.na(`TRUE`), 0, `TRUE`),
         total = no_page + has_page) %>%
  mutate(ratio_sites_with_pages = has_page / total) %>%
  filter(!is.na(total))
# spatial distribution of sites with no page
library(tmap)
data("World")
# tmap_mode("view") # for interactive
tmap_mode("plot")
tm_map_data <-
  World %>%
  left_join(country_site_page_ratio,
            by = c('name' = 'country'))
tm_shape(tm_map_data) +
  tm_polygons("ratio_sites_with_pages",
              palette = 'viridis',
              popup.vars = 'name') +
  tm_shape(tm_map_data) +
  tm_symbols(col = "red",
             alpha = 0.1,
             size = "total",
             scale = 2)
#-----------------------------------------------------
#
# Now, for each site page, let's get some stuff...
#
#- page size
#- number of links on page
#- number of references on page
#- number of pages-linking-to
#- talk page size
#- revision history
#- number of edits
#- number of editors
#- diversity of editors
#- distribution of diff sizes
#- do editors co-occur on multiple pages?
#- geometric mean of edits per day and of words changed per day
get_data_about_page <- function(the_page) {
  page <-
    the_page %>%
    read_html() %>%
    html_nodes("#content")
  page_name <-
    the_page %>%
    str_remove("https://en.wikipedia.org/wiki/")
  page_wordcount <-
    page %>%
    html_text() %>%
    stringi::stri_count_words()
  page_wikilinks_out <-
    page %>%
    html_nodes("p > a") %>%
    html_attr('href') %>%
    length()
  page_wikilinks_in <-
    the_page %>%
    read_html() %>%
    html_nodes("#t-whatlinkshere a") %>%
    html_attr('href') %>%
    str_remove("/wiki/") %>%
    str_glue("https://en.wikipedia.org/w/index.php?title=", .,
             "&limit=1000") %>%
    read_html() %>%
    html_nodes("#mw-whatlinkshere-list li > a") %>%
    html_attr('href') %>%
    length()
  page_cited_items_on <-
    page %>%
    html_nodes(".reference a") %>%
    html_text() %>%
    unique() %>%
    length()
  revision_history_page <-
    the_page %>%
    read_html() %>%
    html_nodes("#ca-history a") %>%
    html_attr('href') %>%
    str_glue("https://en.wikipedia.org", .) %>%
    str_replace("&action=history",
                "&offset=&limit=2000&action=history")
  rh_date <- revision_history_page %>%
    read_html %>%
    html_nodes('li > :nth-child(4)') %>%
    html_text() %>%
    lubridate::parse_date_time("H:M, d b Y")
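  # (history-page timestamps look like "20:36, 22 February 2019",
  #  which the "H:M, d b Y" order string parses)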
  rh_user <- revision_history_page %>%
    read_html %>%
    html_nodes('bdi') %>%
    html_text()
  rh_size <- revision_history_page %>%
    read_html %>%
    html_nodes('.history-size') %>%
    html_text()
  rh_diff_size <- revision_history_page %>%
    read_html %>%
    html_nodes('.mw-diff-bytes') %>%
    html_text()
  rh_comment <- revision_history_page %>%
    read_html %>%
    html_nodes('#pagehistory') %>%
    html_text() %>%
    str_split(regex("\n")) %>%
    .[[1]] %>%
    str_replace("^.+ . .", "") %>%
    enframe %>%
    filter(value != "") %>%
    pull(value)
  revision_history_page_details <-
    tibble(rh_date = rh_date,
           rh_user = rh_user,
           rh_size = rh_size,
           rh_diff_size = rh_diff_size,
           rh_comment = rh_comment
    )
  revision_history_page_details <-
    revision_history_page_details %>%
    mutate(rh_size = parse_number(rh_size),
           rh_diff_size = parse_number(rh_diff_size))
  # bots and reverts
  revision_history_page_details <-
    revision_history_page_details %>%
    mutate(bot = ifelse(str_detect(rh_user,
                                   regex('bot',
                                         ignore_case = TRUE)),
                        1, 0),
           revert = ifelse(str_detect(rh_comment,
                                      regex('revert',
                                            ignore_case = TRUE)),
                           1, 0))
  # revision summary stats
  rh_user_simpson_idx <-
    revision_history_page_details %>%
    count(rh_user) %>%
    summarise(simpson = diversity(n, 'simpson')) %>%
    pull(simpson)
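  # (vegan::diversity with the "simpson" index returns 1 - sum(p_i^2), i.e.
  #  the probability that two edits drawn at random come from different editors)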
  rh_diff_size_cv <-
    revision_history_page_details %>%
    summarise(cv = sd(rh_diff_size) / mean(rh_diff_size)) %>%
    pull(cv)
  rh_user_bot_prop <-
    revision_history_page_details %>%
    count(bot) %>%
    summarise(bot_prop = ifelse(sum(bot) == 0,
                                0,
                                n[bot == 1] / sum(n))) %>%
    pull(bot_prop)
  rh_revert_prop <-
    revision_history_page_details %>%
    count(revert) %>%
    summarise(revert_prop = ifelse(sum(revert) == 0,
                                   0,
                                   n[revert == 1] / sum(n))) %>%
    pull(revert_prop)
  talk_page_wordcount <- function(the_page){
    the_page %>%
      read_html() %>%
      html_nodes("#ca-talk a") %>%
      html_attr('href') %>%
      str_glue("https://en.wikipedia.org", .) %>%
      read_html() %>%
      html_nodes("#content") %>%
      html_text() %>%
      stringi::stri_count_words()
  }
  talk_page_wordcount_result <- talk_page_wordcount(the_page)
  n_days <- 100
  page_views_end <- str_remove_all(Sys.Date() - 1, "-")
  page_views_start <- str_remove_all(Sys.Date() - n_days, "-")
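  # (these are YYYYMMDD strings, e.g. "20190221", the date format the
  #  Wikimedia pageviews REST API expects)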
  page_views_last_n_days_tbl <-
    the_page %>%
    read_html() %>%
    html_nodes("#ca-talk a") %>%
    html_attr('href') %>%
    str_remove("/wiki/Talk:") %>%
    str_glue("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/",
             .,
             "/daily/",
             page_views_start,
             "/",
             page_views_end) %>%
    read_html() %>%
    html_text() %>%
    jsonlite::fromJSON() %>%
    .[['items']] %>%
    as_tibble()
  page_views_last_n_days_total <- sum(page_views_last_n_days_tbl$views)
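  # (each item in the API response carries, among other fields, a daily
  #  timestamp and a views count; only views is used here)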
  return(list(page = page,
              page_wordcount = page_wordcount,
              page_wikilinks_out = page_wikilinks_out,
              page_wikilinks_in = page_wikilinks_in,
              page_cited_items_on = page_cited_items_on,
              revision_history_page_details = revision_history_page_details,
              rh_user_simpson_idx = rh_user_simpson_idx,
              rh_diff_size_cv = rh_diff_size_cv,
              rh_user_bot_prop = rh_user_bot_prop,
              rh_revert_prop = rh_revert_prop,
              talk_page_wordcount = talk_page_wordcount_result,
              page_views_last_n_days_total = page_views_last_n_days_total))
}
# don't stop if there is an error, let's see how many we can get
get_data_about_page_safe <-
  safely(get_data_about_page,
         otherwise = "some_problem")
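# A quick sketch of how the safely() wrapper behaves (the URL is just an
# illustrative example of one of the site pages scraped above):
# one_page <- get_data_about_page_safe("https://en.wikipedia.org/wiki/Abu_Mena")
# one_page$result  # list of page metrics, or "some_problem" on failure
# one_page$error   # NULL on success, otherwise the condition object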
#-----------------------------------------------------
# function to collect the page data for every site in a region table
get_various_page_data_for_all_pages <- function(tbl_region){
  # get these data for all pages in a set
  # this is a time-consuming web-scraping step
  region_tbl_coords_links_info <-
    tbl_region %>%
    filter(site_page_link != "https://en.wikipedia.orgNA") %>% # drop sites with no wiki page
    mutate(page_info = map(site_page_link, get_data_about_page_safe))
  # flatten out some of the results into the table with one row per WH site
  region_tbl_coords_links_info_flat <-
    region_tbl_coords_links_info %>%
    mutate(page_info_t = transpose(page_info)[["result"]]) %>%
    filter(page_info_t != "some_problem") %>%
    mutate(page_wordcount = map_int(page_info_t, ~.x$page_wordcount),
           page_wikilinks_out = map_int(page_info_t, ~.x$page_wikilinks_out),
           page_wikilinks_in = map_int(page_info_t, ~.x$page_wikilinks_in),
           page_cited_items_on = map_int(page_info_t, ~.x$page_cited_items_on),
           rh_user_simpson_idx = map_dbl(page_info_t, ~.x$rh_user_simpson_idx),
           rh_user_bot_prop = map_dbl(page_info_t, ~.x$rh_user_bot_prop),
           rh_revert_prop = map_dbl(page_info_t, ~.x$rh_revert_prop),
           talk_page_wordcount = map_int(page_info_t, ~.x$talk_page_wordcount),
           page_views_last_n_days_total = map_int(page_info_t, ~.x$page_views_last_n_days_total)
    )
  return(region_tbl_coords_links_info_flat)
}
# don't stop if there is an error, let's see how many we can get
get_various_page_data_for_all_pages_safe <-
  safely(get_various_page_data_for_all_pages,
         otherwise = "some_problem")
# this takes several hours
page_data_for_all_pages <-
  get_various_page_data_for_all_pages_safe(wh_wiki_table)
# take a quick look
page_data_for_all_pages$result %>%
  select_if(is.numeric)
# save it
saveRDS(page_data_for_all_pages$result,
        'data/page_data_for_all_pages.rds')
#-----------------------------------------------------
# Second file in this gist: build tbl_russia, since the Russia list
# page has a different table structure to the regional list pages
russia <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Russia"
russia__xml_tables <-
  read_html(russia) %>%
  html_nodes("table")
russia_tables_tbl <-
  map(russia__xml_tables,
      ~html_table(.x, fill = TRUE) %>% as_tibble(., .name_repair = "unique"))
russia_tables_tbl_culture <-
  russia_tables_tbl[[2]] %>%
  mutate(type = ifelse(Image == "", NA, Image),
         type = zoo::na.locf(type)) %>%
  filter(type == "Cultural") %>%
  slice(-1) %>%
  select(Name, Location, Yearlisted, Description) %>%
  set_names(c("Site", "Location", "Year", "Description")) %>%
  mutate(Year_num = ifelse(is.integer(Year), Year, parse_number(Year)))
# scrape out the coords for mapping, as in the main script
coords_chars <- "\\d|°|′|″|N|E|S|W|\\."
russia_tables_tbl_culture_coords <-
  map_chr(str_extract_all(russia_tables_tbl_culture$Location,
                          coords_chars),
          ~paste0(.x, collapse = "") %>%
            str_replace_all(., "^[A-Z]*", "") %>%
            str_extract(., ".+?(?=E|W)"))
russia_coords_clean <-
  russia_tables_tbl_culture_coords %>%
  str_split("N|S") %>%
  Reduce(rbind, .) %>%
  as_tibble(., .name_repair = "universal") %>%
  mutate(lat = parse_lat(`...1`),
         lon = parse_lon(`...2`))
# attach clean coords to main table
russia_tbl_coords <-
  russia_tables_tbl_culture %>%
  bind_cols(russia_coords_clean)
# country, criteria, and area are constant or absent for this page
russia_tbl_coords <-
  russia_tbl_coords %>%
  mutate(country = "Russia",
         Criteria = 'Cultural',
         `Areaha (acre)` = NA)
# get links to each site's wiki page
get_link <- function(html_table, Site){
  html_table %>%
    html_nodes(xpath=paste0("//a[text()='", Site, "']")) %>%
    html_attr("href")
}
cols_we_want <-
  c(
    "Site",
    "Location",
    "Criteria",
    "Areaha (acre)",
    "Year_num",
    "lat",
    "lon",
    "country",
    "site_page_name",
    "site_page_link"
  )
russia_tbl_coords_links <-
  russia_tbl_coords %>%
  mutate(site_page_name = map_chr(Site,
                                  ~get_link(russia__xml_tables[2], .x)[1])) %>%
  mutate(site_page_link = as.character(str_glue('https://en.wikipedia.org{site_page_name}'))) %>%
  select(cols_we_want)
tbl_russia <- russia_tbl_coords_links
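# tbl_russia is consumed by the main script above when assembling all_regions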
#-----------------------------------------------------
# Third file in this gist: explore the page variables
names(page_data_for_all_pages$result)
#--------------------------------------------------------------------------
# take a look at the distribution of page variables
some_page_variables <-
  page_data_for_all_pages$result %>%
  select(page_wordcount,
         page_wikilinks_out,
         page_wikilinks_in,
         page_cited_items_on) %>%
  mutate(page_wikilinks_out_norm = page_wikilinks_out / page_wordcount,
         page_cited_items_on_norm = page_cited_items_on / page_wordcount)
some_page_variables %>%
  gather(variable, value) %>%
  ggplot(aes(value)) +
  geom_histogram() +
  facet_wrap( ~ variable,
              scales = "free") +
  scale_x_log10() +
  theme_minimal()
library(GGally)
ggpairs( some_page_variables %>%
           mutate_all(log)) +
  theme_minimal()
# umap
library(uwot)
pages_umap_input <-
  some_page_variables %>%
  mutate(page_wordcount_scaled = scale(page_wordcount),
         page_wikilinks_in_scaled = scale(page_wikilinks_in)) %>%
  select(-page_wordcount,
         -page_cited_items_on,
         -page_wikilinks_out,
         -page_wikilinks_in) %>%
  bind_cols(., page_data_for_all_pages$result[ , 'country'] ) %>%
  filter_all(all_vars(!is.na(.))) %>%
  left_join(World %>%
              select(name, continent),
            by = c('country' = 'name'))
# compute umap
pages_umap_input_selected <-
  pages_umap_input %>%
  select(-country,
         -continent,
         -geometry
  )
pages_umap_output <-
  pages_umap_input_selected %>%
  umap(.,
       n_neighbors = 50,
       min_dist = 0.9,
       nn_method = "annoy",
       init = "spca") %>%
  as_tibble()
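# (roughly: larger n_neighbors favours global over local structure, and
#  a larger min_dist spreads the embedded points further apart)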
# compute hdbscan clusters
library(dbscan)
hdbscan_out <- hdbscan(pages_umap_output,
                       minPts = 5)
table(hdbscan_out$cluster)
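# note: cluster 0 is HDBSCAN's noise label, not a real cluster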
main_plot <-
  ggplot(pages_umap_output,
         aes(V1, V2)) +
  geom_point(size = 3,
             aes(colour = factor(hdbscan_out$cluster))) +
  scale_color_viridis_d(guide = FALSE) +
  theme_minimal() +
  xlab("") +
  ylab("")
main_plot
# train a feature-selecting classifier, here a random forest, on
# the cluster labels
rand_forest_input <-
  pages_umap_input_selected %>%
  mutate(clus = hdbscan_out$cluster) %>%
  filter(clus != 0) %>%         # drop the noise points
  mutate(clus = factor(clus))   # factor outcome so caret fits a classifier, not a regression
library(caret)
fit <- train(
  clus ~ .,
  data = rand_forest_input,
  method = "ranger",
  trControl = trainControl(method = "cv",
                           number = 10,
                           allowParallel = TRUE,
                           verboseIter = TRUE), # trainControl has no 'verbose' argument
  importance = 'permutation')
fit
var_imp_tbl <- tibble(var = row.names(varImp(fit)$importance),
                      imp = varImp(fit)$importance$Overall)
theme_nogrid <- function (base_size = 12, base_family = "") {
  theme_bw(base_size = base_size,
           base_family = base_family) %+replace%
    theme(panel.grid = element_blank() )
}
sub_plot <-
  ggplot(var_imp_tbl,
         aes(reorder( var, -imp ),
             imp)) +
  geom_col() +
  coord_flip() +
  xlab("") +
  ylab("") +
  theme_nogrid(base_size = 6)
# plot plus subplot
main_plot +
  annotation_custom(ggplotGrob(sub_plot),
                    xmin = -10.0,
                    xmax = -2,
                    ymin = -7.5,
                    ymax = -3.5)
pca_out <- prcomp(pages_umap_input_selected)
pca_out_df <- tibble(pc1 = pca_out$x[ , 1],
                     pc2 = pca_out$x[ , 2],
                     pc3 = pca_out$x[ , 3],
                     clus = hdbscan_out$cluster)
ggplot(pca_out_df,
       aes(pc1,
           pc3,
           colour = factor(clus))) +
  geom_point() +
  scale_color_viridis_d()
#-------------------------------------------------------------------
# edit variables
revision_history_page_details <-
  tibble(revision_history_page_details = map(page_data_for_all_pages$result$page_info_t,
                                             ~.x$revision_history_page_details)) %>%
  mutate(Site = page_data_for_all_pages$result$Site,
         rh_n_editors = map_int(revision_history_page_details, ~n_distinct(.x$rh_user)),
         rh_n_edits = map_int(revision_history_page_details, ~nrow(.x)))
ggplot(revision_history_page_details,
       aes(rh_n_edits)) +
  geom_histogram()
some_edit_variables <-
  page_data_for_all_pages$result %>%
  mutate(rh_n_editors = map_int(page_info_t,
                                ~n_distinct(.x$revision_history_page_details$rh_user))) %>%
  select(talk_page_wordcount,
         rh_n_editors,
         rh_user_simpson_idx,
         rh_user_bot_prop,
         rh_revert_prop) %>%
  gather(variable, value)
ggplot(some_edit_variables,
       aes(value)) +
  geom_histogram() +
  facet_wrap( ~ variable,
              scales = "free") +
  scale_x_log10() +
  theme_minimal()
library(GGally)
ggpairs( page_data_for_all_pages$result %>%
           select(rh_user_simpson_idx,
                  rh_user_bot_prop,
                  rh_revert_prop))
#-----------------------------------------------------
# Fourth file in this gist: baseline data from random Wikipedia pages
library(tidyverse)
library('rvest')
library(vegan)
# URL that redirects to a random WP page on each request
rando <- "https://en.wikipedia.org/wiki/Special:Random"
# gets data about n random WP pages (raise n, e.g. to 1000, for a real baseline)
n <- 10
random_page_data <- rerun(n, get_data_about_page_safe(rando))
# drops the pages that returned an error
random_page_data_lst <- transpose(random_page_data)[["result"]]
random_page_data_lst <- discard(random_page_data_lst, ~.x[1] == "some_problem")
# put page data into a table
random_page_data_tbl <-
  tibble(page_wordcount = map_int(random_page_data_lst, ~.x$page_wordcount),
         page_wikilinks_out = map_int(random_page_data_lst, ~.x$page_wikilinks_out),
         page_wikilinks_in = map_int(random_page_data_lst, ~.x$page_wikilinks_in),
         page_cited_items_on = map_int(random_page_data_lst, ~.x$page_cited_items_on),
         rh_user_simpson_idx = map_dbl(random_page_data_lst, ~.x$rh_user_simpson_idx),
         rh_user_bot_prop = map_dbl(random_page_data_lst, ~.x$rh_user_bot_prop),
         rh_revert_prop = map_dbl(random_page_data_lst, ~.x$rh_revert_prop),
         talk_page_wordcount = map_dbl(random_page_data_lst, ~.x$talk_page_wordcount),
         page_views_last_n_days_total = map_int(random_page_data_lst, ~.x$page_views_last_n_days_total)
  )
random_page_data_tbl %>%
  gather(variable, value) %>%
  ggplot(aes(value)) +
  geom_histogram() +
  facet_wrap( ~ variable, scales = "free")