Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save LiYingWang/c6993dce3c7bc86bc034f7946bd8aa30 to your computer and use it in GitHub Desktop.
Save LiYingWang/c6993dce3c7bc86bc034f7946bd8aa30 to your computer and use it in GitHub Desktop.
Inspect the edit history of a set of Wikipedia pages to measure the proportion of each page contributed by each editor
library(tidyverse)
# Wikipedia articles whose edit histories are examined. Every address is the
# shared article prefix followed by the page title.
the_pages <- paste0(
  "https://en.wikipedia.org/wiki/",
  c(
    "Angela_McGowan",
    "Caroline_Bird_(archaeologist)",
    "Jo_McDonald",
    "Laurajane_Smith",
    "Louise_Zarmati",
    "Marcia-Anne_Dobres",
    "Sarah_Colley",
    "Sharon_Sullivan"
  )
)
# Build, for every article URL in `the_pages`, the URL of that article's
# revision-history view (index.php?title=<page>&action=history).
the_page_history <-
  str_replace(
    the_pages,
    # fixed(): match the prefix literally, so its dots are not treated as
    # regex wildcards
    fixed("https://en.wikipedia.org/wiki/"),
    "https://en.wikipedia.org/w/index.php?title="
  ) %>%
  # limit=500: without an explicit limit the history view lists only the
  # newest 50 revisions, which leaves older edits out of the per-editor
  # totals. Pages with more than 500 revisions are still truncated and would
  # need paging to be covered completely.
  str_c("&action=history&limit=500")
library(rvest)
# Get a summary of the edit history for each page: a list with one data frame
# per URL in `the_page_history`, holding one row per listed revision with
#   editor    - text of the <bdi> element (the editor's name), character
#   edit_size - signed change in page size in bytes, numeric
edit_byte_change_per_editor <-
  map(the_page_history, function(history_url) {
    # The selector returns its matches in document order; everything below
    # relies on them alternating editor name / byte change.
    scraped_text <- read_html(history_url) %>%
      html_nodes('.mw-diff-bytes , bdi') %>%
      html_text()

    # matrix() would silently recycle values if the matches do not come in
    # complete pairs, pairing editors with the wrong sizes. Fail loudly.
    if (length(scraped_text) %% 2 != 0) {
      stop(
        "Expected editor/byte-change pairs from ", history_url,
        " but found ", length(scraped_text), " text nodes",
        call. = FALSE
      )
    }

    scraped_text %>%
      matrix(ncol = 2, byrow = TRUE) %>%
      data.frame(stringsAsFactors = FALSE) %>%
      set_names(nm = c("editor", "edit_size")) %>%
      mutate(
        editor = as.character(editor),
        # Negative changes appear to be rendered with the typographic minus
        # sign (U+2212), which parse_number() discards as a non-numeric
        # prefix, turning removals into additions. Normalise it to an ASCII
        # hyphen-minus first so the sign survives.
        edit_size = parse_number(
          str_replace_all(as.character(edit_size), fixed("\u2212"), "-")
        )
      )
  })
# Label every element of the list with its page title, i.e. the article URL
# with the leading wiki address stripped off.
names(edit_byte_change_per_editor) <-
  the_pages %>%
  str_remove("https://en.wikipedia.org/wiki/")
# Chart, one panel per page, how the summed byte changes are split among the
# page's editors.
bind_rows(edit_byte_change_per_editor, .id = "page") %>%
  # add up every editor's byte changes within a page
  group_by(page, editor) %>%
  summarise(total_edit_size_bytes = sum(edit_size)) %>%
  # express each editor's total as a fraction of the page-wide total
  # (despite its name, the column holds a proportion, not a percentage)
  group_by(page) %>%
  mutate(
    percent_total_article =
      total_edit_size_bytes / sum(total_edit_size_bytes)
  ) %>%
  ggplot(
    aes(
      x = reorder(editor, percent_total_article),
      y = percent_total_article
    )
  ) +
  geom_col() +
  # editors on the vertical axis so their names stay readable
  coord_flip() +
  facet_wrap(~ page, scales = "free") +
  labs(
    x = "Editor",
    y = "Proportion of bytes added to the page"
  ) +
  theme_minimal(base_size = 10)
# Chart the summed byte change of the listed edits for each page.
bind_rows(edit_byte_change_per_editor, .id = "page") %>%
  group_by(page) %>%
  summarise(total_edit_size_bytes = sum(edit_size)) %>%
  ggplot(
    aes(
      # reversed ordering of the page names, so that after the flip below
      # they read alphabetically from top to bottom
      x = reorder(page, desc(page)),
      y = total_edit_size_bytes
    )
  ) +
  geom_col() +
  coord_flip() +
  labs(
    x = "Archaeologist Page",
    y = "Total Edit Size Bytes"
  ) +
  theme_minimal(base_size = 10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment