Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Last active April 3, 2023 16:35
  • Star 4 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save benmarwick/ac394fd61f481393b0ae to your computer and use it in GitHub Desktop.
# Using R with Wikipedia for various things
# get recent changes from wikipedia
library(rvest)
# number of entries to request from Special:RecentChanges
n_changes <- 5000
# Build the query URL. format(..., scientific = FALSE) keeps a large limit
# such as 100000 from being pasted into the URL as "1e+05".
recent_changes_url <- paste0(
  "https://en.wikipedia.org/w/index.php?title=Special:RecentChanges&limit=",
  format(n_changes, scientific = FALSE, trim = TRUE),
  "&days=1"
)
# connect to website and parse the returned page
html <- read_html(recent_changes_url)
# text content of every node matching the ".mw-changeslist-human" selector
recent_changes_txt <- html_text(html_nodes(html, ".mw-changeslist-human"))
# how many entries were found
length(recent_changes_txt)
# URLs of diff pages: the href of the first-child link of each change entry
recent_changes_urls <- html_attr(
  html_nodes(html, ".mw-changeslist-human > a:nth-child(1)"),
  "href"
)
length(recent_changes_urls)
# full URL: strip any leading slash from the href before joining it to the
# site root, so a root-relative href ("/w/index.php?...") does not produce
# "https://en.wikipedia.org//w/index.php?..."
recent_changes_urls <- paste0(
  "https://en.wikipedia.org/",
  sub("^/+", "", recent_changes_urls)
)
# get title of article
# Strip the change-log decoration surrounding the title. gsub() already
# returns non-matching elements unchanged, so no ifelse(grepl(...)) guard is
# needed. Without the guards the result also stays a character vector when
# there are no entries: ifelse() on zero-length input returns logical(0),
# which `Encoding<-` rejects.
recent_changes_title <-
  recent_changes_txt %>%
  # this pattern is applied twice (kept from the earlier form of the pipeline)
  gsub("\\(diff.* . . |;.*", "", .) %>%
  gsub("\\(diff.* . . |;.*", "", .) %>%
  # drop the " . . m" marker
  gsub(" \\. \\. m", "", .) %>%
  # drop a remaining "(diff ... hist) " prefix
  gsub("\\(diff.*hist\\) ", "", .)
# declare the text as UTF-8, then remove left-to-right marks (U+200E)
Encoding(recent_changes_title) <- "UTF-8"
recent_changes_title <- gsub("\u200E", "", recent_changes_title)
length(recent_changes_title)
# combine diff data into a data frame
library(dplyr)
# tibble() replaces the deprecated data_frame(). diff_url_short is a
# character placeholder (it is filled with shortened URLs further down)
# instead of numeric 0, so the column has the right type from the start.
recent_changes_df <- tibble(
  diff_title = recent_changes_title,
  diff_txt = recent_changes_txt,
  diff_url = recent_changes_urls,
  diff_url_short = NA_character_
)
# keep only the rows whose title matches the archaeology pattern
recent_changes_df_archae <- filter(
  recent_changes_df,
  grepl("archaeol|Archaeol", diff_title)
)
# show the matching rows
recent_changes_df_archae
# if there are many articles, make one tweet per article
library(urlshorteneR)
library(twitteR)
# Twitter credentials. Each value is read from an environment variable so
# real secrets need not be written into this file; when the variable is not
# set, the "xxx" placeholder is used.
twitter_api_key <- Sys.getenv("TWITTER_API_KEY", unset = "xxx")
twitter_api_secret <- Sys.getenv("TWITTER_API_SECRET", unset = "xxx")
twitter_access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN", unset = "xxx")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET", unset = "xxx")
# store prepared tweets; character() is used because vector(length = n)
# defaults to mode "logical" and would only become character by coercion
tweet_text <- character(nrow(recent_changes_df_archae))
# loop over each row of recent_changes_df_archae to make a tweet for each
# edited article (kept as a loop: paste0() on zero-length columns would
# yield one bogus tweet when there are no rows)
for (i in seq_len(nrow(recent_changes_df_archae))) {
  # make a short url
  recent_changes_df_archae$diff_url_short[i] <-
    isgd_LinksShorten(longUrl = recent_changes_df_archae$diff_url[i])
  # compose a tweet
  tweet_text[i] <- paste0(
    "The Wikipedia article '",
    recent_changes_df_archae$diff_title[i],
    "' was just edited. See the change here: ",
    recent_changes_df_archae$diff_url_short[i]
  )
}
# post to twitter
# https://twitter.com/archaeoledits
# authenticate once with the four credentials defined earlier
setup_twitter_oauth(
  twitter_api_key,
  twitter_api_secret,
  twitter_access_token,
  access_token_secret
)
# send the prepared tweets one at a time, in order
for (i in seq_len(length(tweet_text))) {
  tweet(tweet_text[i])
}
# setup a task in Windows to run this script automatically:
# http://www.r-datacollection.com/blog/Programming-a-Twitter-bot/
library(RCurl)
library(XML)
library(ggplot2)
# Install from GitHub only when reshape2 is missing, instead of reinstalling
# on every run of the script.
# NOTE(review): assumes the hadley/reshape repository provides reshape2 (the
# package loaded on the next line) -- confirm.
if (!requireNamespace("reshape2", quietly = TRUE)) {
  devtools::install_github("hadley/reshape")
}
library(reshape2)
# user names whose contributions are collected below
# (one of them already carries URL-encoding: "%20")
users <- c(
  "Lizzy8127", "Arctickinkajou", "Rsuwsearch", "Hurlej", "Johnhart151",
  "Rocket027", "Z.%20PUPU", "Cookingtheworld", "piesquared93", "Rjmath",
  "Liann2009"
)
# one empty (NULL) slot per user, named by user
user_edits <- setNames(vector("list", length(users)), users)
# Download each user's contributions page and store the dates, byte changes
# and article titles of their edits in user_edits[[i]].
# seq_along() keeps the loop from running over c(1, 0) if users is empty.
for (i in seq_along(users)) {
  print(i)
  # https is requested directly: getURL() does not follow redirects unless
  # asked to, so an http:// address that redirects would return the redirect
  # response rather than the contributions page
  theurl <- paste0("https://en.wikipedia.org/w/index.php?title=Special:Contributions&offset=&limit=5000&contribs=user&target=", users[i])
  webpage <- getURL(theurl)
  # Process escape characters
  tc <- textConnection(webpage)
  webpage <- readLines(tc)
  close(tc)
  # Parse the html tree, ignoring errors on the page
  pagetree <- htmlTreeParse(webpage, error = function(...) {},
                            useInternalNodes = TRUE)
  # Extract table header and contents
  dates <- xpathSApply(pagetree, "//*[@class='mw-changeslist-date']", xmlValue)
  # query the dir='ltr' nodes once, then drop the first and last match
  ltr_values <- xpathSApply(pagetree, "//*[@dir='ltr']", xmlValue)
  bytes_changed <- ltr_values[c(-1, -length(ltr_values))]
  articles <- xpathSApply(pagetree, "//*[@class='mw-contributions-title']",
                          xmlValue)
  # put into data frame
  user_edits[[i]] <- data.frame(dates = dates, bytes_changed = bytes_changed,
                                articles = articles)
}
### prepare data
# No names() assignment is needed here: user_edits is already named by user,
# and lapply() carries those names over to its result. (Assigning the user
# names to the leftover per-page bytes_changed vector from the loop would
# fail or pad with NA whenever its length differs from length(users).)
to_match <- c("\\+", "\\(", "\\)", ",")
strip_pattern <- paste(to_match, collapse = "|")
bytes_changed <- lapply(user_edits, function(edits) {
  raw <- as.character(edits$bytes_changed)
  # NOTE(review): negative sizes appear to be rendered with the Unicode minus
  # sign (U+2212), which as.numeric() cannot parse; map it to "-" first.
  # This is a no-op when a plain hyphen is used.
  raw <- gsub("\u2212", "-", raw, fixed = TRUE)
  as.numeric(gsub(strip_pattern, "", raw))
})
# one long data frame of all edits (melt() on the named list)
long <- melt(user_edits)
### What topics did they edit?
library(wordcloud)
# six-colour "Dark2" palette shared by both clouds
cloud_palette <- brewer.pal(6, "Dark2")
# cloud of every edited page title
wordcloud(long$articles, min.freq = 1, random.order = FALSE,
          colors = cloud_palette)
# exclude sandboxes
no_sandboxes <- long$articles[!grepl("sandbox", long$articles)]
wordcloud(no_sandboxes, min.freq = 1, random.order = FALSE,
          colors = cloud_palette)
### who added the most text?
# total bytes per user, in increasing order. vapply() guarantees a numeric
# vector (sapply() would return a list for empty input, which sort() rejects)
sort(vapply(bytes_changed, sum, numeric(1)))
# plot bytes per edit per user
# stack() turns the named list into a values/ind data frame
bytes_changed_stack <- stack(bytes_changed)
# users are ordered along the x axis by reorder(ind, -values, FUN = sum)
bytes_per_edit_plot <-
  ggplot(
    bytes_changed_stack,
    aes(x = reorder(ind, -values, FUN = sum), y = values)
  ) +
  geom_jitter(
    position = position_jitter(width = 0.05),
    colour = "grey20",
    size = 2
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
  labs(x = "", y = "bytes changed")
bytes_per_edit_plot
# exclude outliers: same plot with the y axis limited to +/- 2500
bytes_per_edit_plot + ylim(-2500, 2500)
### how did editing change over time?
# NOTE(review): negative sizes appear to be rendered with the Unicode minus
# sign (U+2212), which as.numeric() cannot parse; map it to "-" before
# stripping "+", "(", ")" and ",". This is a no-op when a plain hyphen is used.
long$bytes_changed <- as.numeric(gsub(
  paste(to_match, collapse = "|"),
  "",
  gsub("\u2212", "-", as.character(long$bytes_changed), fixed = TRUE)
))
# convert dates to proper date format that R can recognise.
# Stored as POSIXct, because a POSIXlt column (what strptime() returns) is a
# list and behaves badly inside data frames. An explicit tz keeps the parsed
# wall-clock time independent of the machine's local time zone and DST gaps.
long$dates <- as.POSIXct(
  strptime(as.character(long$dates), format = "%H:%M, %d %b %Y", tz = "UTC")
)
# get week of the year for each entry
library(lubridate)
long$week <- week(long$dates)
# plot bytes changed against week of the year, coloured by the L1 column
# (presumably the list-element name added by melt(), i.e. the user)
ggplot(long, aes(x = week, y = bytes_changed, colour = L1)) +
  geom_jitter(size = 2, position = position_jitter(width = 0.05)) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
  labs(x = "week of the year", y = "bytes changed")
# plot with smoother: jittered points first, geom_smooth() drawn on top
ggplot(long, aes(x = week, y = bytes_changed)) +
  geom_jitter(size = 2, position = position_jitter(width = 0.05)) +
  geom_smooth() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
  labs(x = "week of the year", y = "bytes added")
# how did the topics change over time?
# Article titles are drawn as text, sized by the absolute bytes changed and
# jittered by one unit in each direction.
# The trailing comma after `position` was removed: it passed an empty
# argument to geom_text(), which fails where `...` is collected with list().
ggplot(long, aes(week, bytes_changed)) +
  geom_text(
    aes(label = articles, size = abs(bytes_changed)),
    position = position_jitter(width = 1, height = 1)
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
  xlab("week of the year") +
  ylab("bytes changed")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment