# Gist by @benmarwick, last active February 22, 2019
# index page for the lists of World Heritage Sites
url <- "https://en.wikipedia.org/wiki/Lists_of_World_Heritage_Sites"
# a table of sites per country can be found at each of the pages below;
# these are probably the simplest entry points
# Africa
africa <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Africa"
# Americas
n_america <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_North_America"
c_america <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Central_America"
caribbean <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_the_Caribbean"
s_america <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_South_America"
# Asia
n_and_c_asia <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Northern_and_Central_Asia"
w_asia <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Western_Asia"
e_asia <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_sites_in_Eastern_Asia"
s_asia <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_sites_in_Southern_Asia"
# SE Asia list currently removed due to copyvio :(
se_asia <- "https://en.wikipedia.org/w/index.php?title=List_of_World_Heritage_Sites_in_Southeast_Asia&direction=prev&oldid=878049179"
# Europe
n_europe <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Northern_Europe"
w_europe <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_sites_in_Western_Europe"
e_europe <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_sites_in_Eastern_Europe"
s_europe <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Southern_Europe"
# Oceania
oceania <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Oceania"
#-----------------------------------------------------
# Get the table of WH locations from each website listed above
library(rvest)
library(tidyverse)
library(parzer) # devtools::install_github("ropenscilabs/parzer")
library(tmap)
data("World")
library(vegan) # for diversity() below
#-----------------------------------------------------
# get table of sites per region
get_table_of_sites_per_region <- function(region){
# here are all tables on the page
region_table_xml_all_tables <-
read_html(region) %>%
html_nodes("table")
# we need to keep the XML of the table of sites, since we use it later to extract links
if(region != se_asia){
which_table_has_the_sites <-
map(region_table_xml_all_tables,
~html_table(.x, fill = TRUE)) %>%
map_lgl(~names(.x)[1] == "Site")
region_table_xml <-
keep(region_table_xml_all_tables,
which_table_has_the_sites)
# here is the table of sites for that region
# first column is called "Site"
region_table_xml_table <-
map(region_table_xml,
html_table,
fill = TRUE) %>%
keep(~names(.x)[1] == "Site") %>%
.[[1]] %>%
as_tibble(., .name_repair = "unique")
} else {
# the SE Asia table is troublesome
region_table_xml <- region_table_xml_all_tables[3]
region_table_xml_table <- html_table(region_table_xml)[[1]]
}
# filter so we only keep the cultural sites
region_tbl <-
region_table_xml_table %>%
filter(str_detect(Criteria, 'Cultural')) %>%
mutate(Year_num = ifelse(is.integer(Year), Year, parse_number(Year)))
# scrape out the coords for mapping
coords_chars <- "\\d|°|′|″|N|E|S|W|\\."
region_coords <-
map_chr(str_extract_all(region_tbl$Location,
coords_chars),
~paste0(.x, collapse = "") %>%
str_replace_all(., "^[A-Z]*", "") %>%
str_extract(., ".+?(?=E|W)"))
region_coords_clean <-
region_coords %>%
str_split("N|S") %>%
Reduce(rbind, .) %>%
as_tibble(., .name_repair = "universal") %>%
mutate(lat = parse_lat(`...1`),
lon = parse_lon(`...2`))
# attach clean coords to main table
region_tbl_coords <-
region_tbl %>%
bind_cols(region_coords_clean)
# get the country of each site from the location text by matching against
# country names in tmap's World data; some of these may need checking because
# the name in the Location text does not always match World$name exactly:
# Laos, Czech Republic, Micronesia, Zimbabwe, South Sudan, Chad,
# Central African Republic, Congo, Gabon, Cameroon, Nigeria,
# Bosnia and Herzegovina, Cote d'Ivoire, Sierra Leone, Guyana, Belize
country_names <- paste(World$name, collapse="|")
region_tbl_coords <-
region_tbl_coords %>%
mutate(country = str_extract(Location,
regex(country_names,
ignore.case=TRUE)))
# get links to each site's wiki page
get_link <- function(html_table, Site){
html_table %>%
html_nodes(xpath=paste0("//a[text()='", Site, "']")) %>%
html_attr("href")
}
region_tbl_coords_links <-
region_tbl_coords %>%
mutate(site_page_name = map_chr(Site,
~get_link(region_table_xml, .x)[1])) %>%
mutate(site_page_link = as.character(str_glue('https://en.wikipedia.org{site_page_name}')))
return(region_tbl_coords_links)
}
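# a minimal sketch of the coordinate-extraction logic used above, run on a
# made-up Location string (hypothetical example, not from the scraped data)
demo_loc <- "Giza, Egypt29°58′34″N 31°07′58″E"
demo_coords <-
str_extract_all(demo_loc, "\\d|°|′|″|N|E|S|W|\\.")[[1]] %>%
paste0(collapse = "") %>% # "E29°58′34″N31°07′58″E"; the stray E comes from "Egypt"
str_replace_all("^[A-Z]*", "") %>% # drop capitals picked up from the placename
str_extract(".+?(?=E|W)") # keep everything up to the longitude hemisphere letter
demo_parts <- str_split(demo_coords, "N|S")[[1]]
parse_lat(demo_parts[1]) # ~29.98
parse_lon(demo_parts[2]) # ~31.13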
#-----------------------------------------------------
# get the wikilink to each page for each WH site in each region
tbl_africa <- map_df(africa , get_table_of_sites_per_region)
tbl_n_america <- map_df(n_america , get_table_of_sites_per_region)
tbl_c_america <- map_df(c_america, get_table_of_sites_per_region)
tbl_caribbean <- map_df(caribbean , get_table_of_sites_per_region)
tbl_s_america <- map_df(s_america , get_table_of_sites_per_region)
tbl_n_and_c_asia <- map_df(n_and_c_asia, get_table_of_sites_per_region)
tbl_w_asia <- map_df(w_asia , get_table_of_sites_per_region)
tbl_e_asia <- map_df(e_asia , get_table_of_sites_per_region)
tbl_s_asia <- map_df(s_asia , get_table_of_sites_per_region)
# special handling needed
tbl_se_asia <- map_df(se_asia , get_table_of_sites_per_region)
tbl_n_europe <- map_df(n_europe, get_table_of_sites_per_region)
tbl_w_europe <- map_df(w_europe, get_table_of_sites_per_region)
tbl_e_europe <- map_df(e_europe, get_table_of_sites_per_region)
tbl_s_europe <- map_df(s_europe, get_table_of_sites_per_region)
# tbl_russia is built separately below (Russia's list has its own page
# layout); run that section before assembling all_regions here
tbl_oceania <- map_df(oceania, get_table_of_sites_per_region)
# Put them all into one big data frame
cols_we_want <-
c(
"Site" ,
"Location" ,
"Criteria" ,
"Areaha (acre)" ,
"Year_num" ,
"lat" ,
"lon" ,
"country" ,
"site_page_name" ,
"site_page_link"
)
all_regions <- list(
tbl_africa ,
tbl_n_america ,
tbl_c_america ,
tbl_caribbean ,
tbl_s_america ,
tbl_n_and_c_asia ,
tbl_w_asia ,
tbl_e_asia ,
tbl_s_asia ,
tbl_se_asia ,
tbl_n_europe ,
tbl_w_europe ,
tbl_e_europe ,
tbl_s_europe ,
tbl_russia,
tbl_oceania
)
wh_wiki_table <- map_df(all_regions, ~select(., cols_we_want))
write_csv(wh_wiki_table, 'data/wh_wiki_table.csv')
# nothing from Russia or the UK
# sites that we have wikipages for
all_regions_cols_we_want_with_pages <-
wh_wiki_table %>%
filter(!is.na(site_page_name))
# sites that we do not have wikipages for
all_regions_cols_we_want_without_pages <-
wh_wiki_table %>%
filter(is.na(site_page_name))
# ratio of sites-with-pages to all-sites-in-the-country
country_site_page_ratio <-
wh_wiki_table %>%
group_by(country) %>%
count(pg = !is.na(site_page_name)) %>%
spread(pg, n) %>%
mutate(no_page = ifelse(is.na(`FALSE`), 0, `FALSE`),
has_page = ifelse(is.na(`TRUE`), 0, `TRUE`),
total = no_page + has_page) %>%
mutate(ratio_sites_with_pages = has_page / total) %>%
filter(!is.na(total))
# spatial distribution of sites with no page
library(tmap)
data("World")
# tmap_mode("view") # for interactive
tmap_mode("plot")
tm_map_data <-
World %>%
left_join(country_site_page_ratio,
by = c( 'name' = 'country'))
tm_shape(tm_map_data) +
tm_polygons("ratio_sites_with_pages",
palette='viridis',
popup.vars = 'name') +
tm_shape(tm_map_data) +
tm_symbols(col = "red",
alpha = 0.1,
size = "total",
scale = 2)
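# to keep a copy of the map, assign the tm_shape() stack above to an
# object (e.g. wh_map) and write it out with tmap_save(wh_map, "wh_sites_map.png")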
#-----------------------------------------------------
#
# Now, for each site page, let's get some stuff...
#
#- page size
#- number of links on page
#- number of references on page
#- number of pages-linking-to
#- talk page size
#- revision history
#- number of edits
#- number of editors
#- diversity of editors
#- distribution of diff sizes
#- do editors co-occur on multiple pages?
#- geometric mean of edits per day and of words changed per day
get_data_about_page <- function(the_page) {
page <-
the_page %>%
read_html() %>%
html_nodes("#content")
page_name <-
the_page %>%
str_remove("https://en.wikipedia.org/wiki/")
page_wordcount <-
page %>%
html_text() %>%
stringi::stri_count_words()
page_wikilinks_out <-
page %>%
html_nodes("p > a") %>%
html_attr('href') %>%
length()
page_wikilinks_in <-
the_page %>%
read_html() %>%
html_nodes("#t-whatlinkshere a") %>%
html_attr('href') %>%
str_remove("/wiki/") %>%
str_glue("https://en.wikipedia.org/w/index.php?title=",.,
"&limit=1000") %>%
read_html() %>%
html_nodes("#mw-whatlinkshere-list li > a") %>%
html_attr('href') %>%
length()
page_cited_items_on <-
page %>%
html_nodes(".reference a") %>%
html_text() %>%
unique() %>%
length()
revision_history_page <-
the_page %>%
read_html() %>%
html_nodes("#ca-history a") %>%
html_attr('href') %>%
str_glue("https://en.wikipedia.org",.) %>%
str_replace("&action=history",
"&offset=&limit=2000&action=history")
# parse the revision history page once, then pull each field out of it
revision_history_html <- read_html(revision_history_page)
rh_date <-
revision_history_html %>%
html_nodes('li > :nth-child(4)') %>%
html_text() %>%
lubridate::parse_date_time("H:M, d b Y")
rh_user <-
revision_history_html %>%
html_nodes('bdi') %>%
html_text()
rh_size <-
revision_history_html %>%
html_nodes('.history-size') %>%
html_text()
rh_diff_size <-
revision_history_html %>%
html_nodes('.mw-diff-bytes') %>%
html_text()
rh_comment <-
revision_history_html %>%
html_nodes('#pagehistory') %>%
html_text() %>%
str_split(regex("\n")) %>%
.[[1]] %>%
str_replace("^.+ . .", "") %>%
enframe %>%
filter(value != "") %>%
pull(value)
revision_history_page_details <-
tibble(rh_date = rh_date,
rh_user = rh_user,
rh_size = rh_size,
rh_diff_size = rh_diff_size,
rh_comment = rh_comment
)
revision_history_page_details <-
revision_history_page_details %>%
mutate(rh_size = parse_number(rh_size),
rh_diff_size = parse_number(rh_diff_size))
# bots and reverts
revision_history_page_details <-
revision_history_page_details %>%
mutate(bot = ifelse(str_detect(rh_user,
regex('bot',
ignore_case = TRUE)),
1, 0),
revert = ifelse(str_detect(rh_comment,
regex('revert',
ignore_case = TRUE)),
1, 0))
# revision summary stats
rh_user_simpson_idx <-
revision_history_page_details %>%
count(rh_user) %>%
summarise(simpson = diversity(n, 'simpson')) %>%
pull(simpson)
rh_diff_size_cv <-
revision_history_page_details %>%
summarise(cv = sd(rh_diff_size) / mean(rh_diff_size)) %>%
pull(cv)
rh_user_bot_prop <-
revision_history_page_details %>%
count(bot) %>%
summarise(bot_prop = ifelse(sum(bot) == 0,
0,
n[bot==1] / sum(n))) %>%
pull(bot_prop)
rh_revert_prop <-
revision_history_page_details %>%
count(revert) %>%
summarise(revert_prop = ifelse(sum(revert) == 0,
0,
n[revert==1] / sum(n))) %>%
pull(revert_prop)
talk_page_wordcount <- function(the_page){
the_page %>%
read_html() %>%
html_nodes("#ca-talk a") %>%
html_attr('href') %>%
str_glue("https://en.wikipedia.org",.) %>%
read_html() %>%
html_nodes("#content") %>%
html_text() %>%
stringi::stri_count_words()
}
talk_page_wordcount_result <- talk_page_wordcount(the_page)
n_days <- 100
page_views_end <- str_remove_all(Sys.Date() - 1, "-")
page_views_start <- str_remove_all(Sys.Date() - n_days, "-")
page_views_last_n_days_tbl <-
the_page %>%
read_html() %>%
html_nodes("#ca-talk a") %>%
html_attr('href') %>%
str_remove("/wiki/Talk:") %>%
str_glue("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/",
.,
"/daily/",
page_views_start,
"/",
page_views_end) %>%
read_html() %>%
html_text() %>%
jsonlite::fromJSON() %>%
.[['items']] %>%
as_tibble()
page_views_last_n_days_total <- sum(page_views_last_n_days_tbl$views)
return(list(page = page,
page_wordcount = page_wordcount,
page_wikilinks_out = page_wikilinks_out,
page_wikilinks_in = page_wikilinks_in,
page_cited_items_on = page_cited_items_on,
revision_history_page_details = revision_history_page_details,
rh_user_simpson_idx = rh_user_simpson_idx,
rh_diff_size_cv = rh_diff_size_cv,
rh_user_bot_prop = rh_user_bot_prop,
rh_revert_prop = rh_revert_prop,
talk_page_wordcount = talk_page_wordcount_result,
page_views_last_n_days_total = page_views_last_n_days_total))
}
# don't stop if there is an error, let's see how many we can get
get_data_about_page_safe <-
safely(get_data_about_page,
otherwise = "some_problem")
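# a minimal sketch of using the safe wrapper on a single page
# (hypothetical test URL; any WH site article would do):
# one_page <- get_data_about_page_safe("https://en.wikipedia.org/wiki/Memphis,_Egypt")
# if (is.null(one_page$error)) one_page$result$page_wordcount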
#-----------------------------------------------------
# a function to scrape the page data above for every site in a region table
get_various_page_data_for_all_pages <- function(tbl_region){
# get these data for all pages in a set
# this is a time-consuming web-scraping step
region_tbl_coords_links_info <-
tbl_region %>%
# drop sites with no wiki page (str_glue pasted the literal "NA")
filter(site_page_link != "https://en.wikipedia.orgNA") %>%
mutate(page_info = map(site_page_link, get_data_about_page_safe))
# flatten out some of the results into the table with one row per WH site
region_tbl_coords_links_info_flat <-
region_tbl_coords_links_info %>%
mutate(page_info_t = transpose(page_info)[["result"]]) %>%
filter(page_info_t != "some_problem") %>%
mutate(page_wordcount = map_int(page_info_t, ~.x$page_wordcount),
page_wikilinks_out = map_int(page_info_t, ~.x$page_wikilinks_out),
page_wikilinks_in = map_int(page_info_t, ~.x$page_wikilinks_in),
page_cited_items_on = map_int(page_info_t, ~.x$page_cited_items_on),
rh_user_simpson_idx = map_dbl(page_info_t, ~.x$rh_user_simpson_idx),
rh_user_bot_prop = map_dbl(page_info_t, ~.x$rh_user_bot_prop),
rh_revert_prop = map_dbl(page_info_t, ~.x$rh_revert_prop),
talk_page_wordcount = map_int(page_info_t, ~.x$talk_page_wordcount),
page_views_last_n_days_total = map_int(page_info_t, ~.x$page_views_last_n_days_total)
)
return(region_tbl_coords_links_info_flat)
}
# don't stop if there is an error, let's see how many we can get
get_various_page_data_for_all_pages_safe <-
safely(get_various_page_data_for_all_pages,
otherwise = "some_problem")
# this takes several hours
page_data_for_all_pages <-
get_various_page_data_for_all_pages_safe(wh_wiki_table)
# take a quick look
page_data_for_all_pages$result %>%
select_if(is.numeric)
# save it
saveRDS(page_data_for_all_pages$result,
'data/page_data_for_all_pages.rds')
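# reload later without re-scraping:
# page_data_for_all_pages_result <- readRDS('data/page_data_for_all_pages.rds')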
#-----------------------------------------------------
# Russia's list has its own page layout, so build tbl_russia separately
# (run this section before assembling all_regions above)
russia <- "https://en.wikipedia.org/wiki/List_of_World_Heritage_Sites_in_Russia"
russia__xml_tables <-
read_html(russia) %>%
html_nodes("table")
russia_tables_tbl <-
map(russia__xml_tables,
~html_table(.x, fill = TRUE) %>% as_tibble(., .name_repair = "unique"))
russia_tables_tbl_culture <-
russia_tables_tbl[[2]] %>%
mutate(type = ifelse(Image == "", NA, Image),
type = zoo::na.locf(type)) %>%
filter(type == "Cultural") %>%
slice(-1) %>%
select(Name, Location, Yearlisted, Description) %>%
set_names(c("Site", "Location", "Year", "Description")) %>%
mutate(Year_num = ifelse(is.integer(Year), Year, parse_number(Year)))
# scrape out the coords for mapping
coords_chars <- "\\d|°|′|″|N|E|S|W|\\."
russia_tables_tbl_culture_coords <-
map_chr(str_extract_all(russia_tables_tbl_culture$Location,
coords_chars),
~paste0(.x, collapse = "") %>%
str_replace_all(., "^[A-Z]*", "") %>%
str_extract(., ".+?(?=E|W)"))
russia_coords_clean <-
russia_tables_tbl_culture_coords %>%
str_split("N|S") %>%
Reduce(rbind, .) %>%
as_tibble(., .name_repair = "universal") %>%
mutate(lat = parse_lat(`...1`),
lon = parse_lon(`...2`))
# attach clean coords to main table
russia_tbl_coords <-
russia_tables_tbl_culture %>%
bind_cols(russia_coords_clean)
# get country of site from location text
russia_tbl_coords <-
russia_tbl_coords %>%
mutate(country = "Russia",
Criteria = 'Cultural',
`Areaha (acre)` = NA)
# get links to each site's wiki page
get_link <- function(html_table, Site){
html_table %>%
html_nodes(xpath=paste0("//a[text()='", Site, "']")) %>%
html_attr("href")
}
cols_we_want <-
c(
"Site" ,
"Location" ,
"Criteria" ,
"Areaha (acre)" ,
"Year_num" ,
"lat" ,
"lon" ,
"country" ,
"site_page_name" ,
"site_page_link"
)
russia_tbl_coords_links <-
russia_tbl_coords %>%
mutate(site_page_name = map_chr(Site,
~get_link(russia__xml_tables[2], .x)[1])) %>%
mutate(site_page_link = as.character(str_glue('https://en.wikipedia.org{site_page_name}'))) %>%
select(cols_we_want)
tbl_russia <- russia_tbl_coords_links
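# tbl_russia now exists, so the all_regions assembly above can be re-run to include it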
names(page_data_for_all_pages$result)
#--------------------------------------------------------------------------
# take a look at the distribution of page variables
some_page_variables <-
page_data_for_all_pages$result %>%
select(page_wordcount,
page_wikilinks_out,
page_wikilinks_in,
page_cited_items_on) %>%
mutate(page_wikilinks_out_norm = page_wikilinks_out / page_wordcount,
page_cited_items_on_norm = page_cited_items_on / page_wordcount)
some_page_variables %>%
gather(variable, value) %>%
ggplot(aes(value)) +
geom_histogram() +
facet_wrap( ~ variable,
scales = "free") +
scale_x_log10() +
theme_minimal()
library(GGally)
ggpairs( some_page_variables %>%
mutate_all(log)) +
theme_minimal()
# umap
library(uwot)
pages_umap_input <-
some_page_variables %>%
mutate(page_wordcount_scaled = scale(page_wordcount),
page_wikilinks_in_scaled = scale(page_wikilinks_in)) %>%
select(-page_wordcount,
-page_cited_items_on,
-page_wikilinks_out,
-page_wikilinks_in) %>%
bind_cols(., page_data_for_all_pages$result[ , 'country'] ) %>%
filter_all(all_vars(!is.na(.))) %>%
left_join(World %>%
select(name, continent),
by = c('country' = 'name'))
# compute umap
pages_umap_input_selected <-
pages_umap_input %>%
select(-country,
-continent,
-geometry
)
pages_umap_output <-
pages_umap_input_selected %>%
umap(.,
n_neighbors = 50,
min_dist = 0.9,
nn_method = "annoy",
init = "spca") %>%
as_tibble()
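# umap() returns an n-row, 2-column embedding; as_tibble() names the columns V1 and V2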
# compute hdbscan clusters
library(dbscan)
hdbscan_out <- hdbscan(pages_umap_output,
minPts = 5)
table(hdbscan_out$cluster)
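# note: in HDBSCAN, cluster 0 is the noise label; those points are
# excluded before training the classifier below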
main_plot <-
ggplot(pages_umap_output,
aes(V1, V2)) +
geom_point(size = 3,
aes(colour = factor(hdbscan_out$cluster))) +
scale_color_viridis_d(guide = FALSE) +
theme_minimal() +
xlab("") +
ylab("")
main_plot
# train a feature-selecting classifier, such as random forests, on
# the cluster labels
rand_forest_input <-
pages_umap_input_selected %>%
mutate(clus = hdbscan_out$cluster) %>%
filter(clus != 0) %>%
mutate(clus = factor(clus)) # caret needs a factor outcome for classification
library(caret)
fit <- train(
clus ~ .,
data = rand_forest_input,
method = "ranger",
trControl = trainControl(method = "cv",
number = 10,
allowParallel = TRUE,
verboseIter = TRUE),
importance = 'permutation')
fit
var_imp_tbl <- tibble(var = row.names(varImp(fit)$importance),
imp = varImp(fit)$importance$Overall)
theme_nogrid <- function (base_size = 12, base_family = "") {
theme_bw(base_size = base_size,
base_family = base_family) %+replace%
theme(panel.grid = element_blank() )
}
sub_plot <-
ggplot(var_imp_tbl,
aes(reorder( var, -imp ),
imp)) +
geom_col() +
coord_flip() +
xlab("") +
ylab("") +
theme_nogrid(base_size = 6)
# plot plus subplot
main_plot +
annotation_custom(ggplotGrob(sub_plot),
xmin = -10.0,
xmax = -2,
ymin=-7.5,
ymax=-3.5)
pca_out <- prcomp(pages_umap_input_selected)
pca_out_df <- tibble(pc1 = pca_out$x[ , 1],
pc2 = pca_out$x[ , 2],
pc3 = pca_out$x[ , 3],
clus = hdbscan_out$cluster)
ggplot(pca_out_df,
aes(pc1,
pc3,
colour = factor(clus))) +
geom_point() +
scale_color_viridis_d()
#-------------------------------------------------------------------
# edit variables
revision_history_page_details <-
tibble(revision_history_page_details = map(page_data_for_all_pages$result$page_info_t,
~.x$revision_history_page_details)) %>%
mutate(Site = page_data_for_all_pages$result$Site,
rh_n_editors = map_int(revision_history_page_details, ~n_distinct(.x$rh_user)),
rh_n_edits = map_int(revision_history_page_details, ~nrow(.x)))
ggplot(revision_history_page_details,
aes(rh_n_edits)) +
geom_histogram()
some_edit_variables <-
page_data_for_all_pages$result %>%
mutate(rh_n_editors = map_int(page_info_t,
~n_distinct(.x$revision_history_page_details$rh_user))) %>%
select(talk_page_wordcount,
rh_n_editors,
rh_user_simpson_idx,
rh_user_bot_prop,
rh_revert_prop) %>%
gather(variable, value)
ggplot(some_edit_variables,
aes(value)) +
geom_histogram() +
facet_wrap( ~ variable,
scales = "free") +
scale_x_log10() +
theme_minimal()
library(GGally)
ggpairs( page_data_for_all_pages$result %>%
select(rh_user_simpson_idx,
rh_user_bot_prop,
rh_revert_prop))
library(tidyverse)
library('rvest')
library(vegan)
# Special:Random serves a random WP page on each request
rando <- "https://en.wikipedia.org/wiki/Special:Random"
# get data about n random WP pages
n <- 10
random_page_data <- rerun(n, get_data_about_page_safe(rando) )
# drops the pages that returned an error
random_page_data_lst <- transpose(random_page_data)[["result"]]
random_page_data_lst <- discard(random_page_data_lst, ~.x[1] == "some_problem")
# put page data into a table
random_page_data_tbl <-
tibble(page_wordcount = map_int(random_page_data_lst, ~.x$page_wordcount),
page_wikilinks_out = map_int(random_page_data_lst, ~.x$page_wikilinks_out),
page_wikilinks_in = map_int(random_page_data_lst, ~.x$page_wikilinks_in),
page_cited_items_on = map_int(random_page_data_lst, ~.x$page_cited_items_on),
rh_user_simpson_idx = map_dbl(random_page_data_lst, ~.x$rh_user_simpson_idx),
rh_user_bot_prop = map_dbl(random_page_data_lst, ~.x$rh_user_bot_prop),
rh_revert_prop = map_dbl(random_page_data_lst, ~.x$rh_revert_prop),
talk_page_wordcount = map_dbl(random_page_data_lst, ~.x$talk_page_wordcount),
page_views_last_n_days_total = map_int(random_page_data_lst, ~.x$page_views_last_n_days_total)
)
random_page_data_tbl %>%
gather(variable, value) %>%
ggplot(aes(value)) +
geom_histogram() +
facet_wrap( ~ variable, scales = "free")