Last active
April 3, 2023 16:35
-
-
Save benmarwick/ac394fd61f481393b0ae to your computer and use it in GitHub Desktop.
Using R with Wikipedia for various things
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape Wikipedia's "Recent changes" page ----
# Downloads the recent-changes listing of English Wikipedia and derives three
# vectors from it: the text of each change entry, the URL of each entry's first
# link, and a cleaned-up article title.
library(rvest)

n_changes <- 5000
recent_changes_url <- paste0(
  "https://en.wikipedia.org/w/index.php?title=Special:RecentChanges&limit=",
  n_changes,
  "&days=1"
)

# download and parse the page
html <- read_html(recent_changes_url)

# full text of every element carrying the class "mw-changeslist-human"
recent_changes_txt <-
  html %>%
  html_nodes(".mw-changeslist-human") %>%
  html_text()
length(recent_changes_txt)

# href of the first child link of each of those elements (the diff page)
recent_changes_urls <-
  html %>%
  html_nodes(".mw-changeslist-human > a:nth-child(1)") %>%
  html_attr("href")
length(recent_changes_urls)

# Turn the hrefs into full URLs. The hrefs are presumably site-relative and
# start with "/" (TODO confirm); that leading slash is dropped so the result
# does not contain "https://en.wikipedia.org//...".
recent_changes_urls <- paste0(
  "https://en.wikipedia.org/",
  sub("^/", "", recent_changes_urls)
)

# Get the title of the article by stripping the "(diff | hist)" prefix, the
# minor-edit marker and everything from the first ";" onwards.
# gsub() already leaves non-matching strings untouched, so no ifelse(grepl())
# guard is needed; dropping it also keeps the result a character vector when
# there are no changes at all (ifelse() would return logical(0), which makes
# the Encoding() assignment below fail).
recent_changes_title <-
  recent_changes_txt %>%
  gsub("\\(diff.* . . |;.*", "", .) %>%
  # second pass kept on purpose: removing text can expose a new match
  gsub("\\(diff.* . . |;.*", "", .) %>%
  gsub(" \\. \\. m", "", .) %>%
  gsub("\\(diff.*hist\\) ", "", .)

# remove the invisible left-to-right mark (U+200E) from the titles
Encoding(recent_changes_title) <- "UTF-8"
recent_changes_title <- gsub("\u200E", "", recent_changes_title)
length(recent_changes_title)
# Combine diff data into a data frame ----
# One row per recent change: cleaned title, full entry text and diff URL.
library(dplyr)

# tibble() replaces the deprecated data_frame(). diff_url_short is a
# placeholder that is filled in later with shortened URLs, so it starts out as
# a missing *character* value rather than the number 0.
recent_changes_df <- tibble(
  diff_title = recent_changes_title,
  diff_txt = recent_changes_txt,
  diff_url = recent_changes_urls,
  diff_url_short = NA_character_
)

# filter so we only see rows whose title mentions archaeology
recent_changes_df_archae <-
  recent_changes_df %>%
  filter(grepl("archaeol|Archaeol", diff_title))

recent_changes_df_archae
# Tweet the archaeology edits ----
# If there are many articles, make one tweet per article.
library(urlshorteneR)
library(twitteR)

# Twitter credentials. Each value is read from an environment variable when it
# is set, so secrets do not have to be written into this script; when the
# variable is unset the original "xxx" placeholder is used.
twitter_api_key <- Sys.getenv("TWITTER_API_KEY", unset = "xxx")
twitter_api_secret <- Sys.getenv("TWITTER_API_SECRET", unset = "xxx")
twitter_access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN", unset = "xxx")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET", unset = "xxx")

# store prepared tweets (preallocated as character, one slot per article)
tweet_text <- character(nrow(recent_changes_df_archae))

# loop over each row of recent_changes_df_archae to make a tweet for each
# edited article
for (i in seq_len(nrow(recent_changes_df_archae))) {
  # make a short url
  recent_changes_df_archae$diff_url_short[i] <-
    isgd_LinksShorten(longUrl = recent_changes_df_archae$diff_url[i])

  # compose a tweet
  tweet_text[i] <- paste0(
    "The Wikipedia article '",
    recent_changes_df_archae$diff_title[i],
    "' was just edited. See the change here: ",
    recent_changes_df_archae$diff_url_short[i]
  )
}

# post to twitter
# https://twitter.com/archaeoledits
setup_twitter_oauth(
  twitter_api_key,
  twitter_api_secret,
  twitter_access_token,
  access_token_secret
)

for (i in seq_along(tweet_text)) {
  tweet(tweet_text[i])
}

# setup a task in Windows to run this script automatically:
# http://www.r-datacollection.com/blog/Programming-a-Twitter-bot/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(RCurl) | |
library(XML) | |
library(ggplot2) | |
devtools::install_github("hadley/reshape") | |
library(reshape2) | |
# Download the contribution history of each user ----
# For every user name, fetch the Special:Contributions page and extract the
# edit dates, the byte-change figures and the edited article titles into one
# data frame per user. The results are collected in the named list user_edits.
users <- c(
  "Lizzy8127", "Arctickinkajou", "Rsuwsearch", "Hurlej", "Johnhart151",
  "Rocket027", "Z.%20PUPU", "Cookingtheworld", "piesquared93", "Rjmath",
  "Liann2009"
)

user_edits <- vector("list", length = length(users))
names(user_edits) <- users

for (i in seq_along(users)) {
  # progress indicator
  print(i)

  # https is requested directly: getURL() does not follow redirects unless
  # asked to, so an http:// address that redirects to https would return the
  # redirect notice instead of the contributions page.
  theurl <- paste0(
    "https://en.wikipedia.org/w/index.php?title=Special:Contributions",
    "&offset=&limit=5000&contribs=user&target=",
    users[i]
  )
  webpage <- getURL(theurl)

  # split the downloaded page into a vector of lines
  webpage <- readLines(tc <- textConnection(webpage))
  close(tc)

  # Parse the html tree, ignoring errors on the page
  pagetree <- htmlTreeParse(
    webpage,
    error = function(...) {},
    useInternalNodes = TRUE
  )

  # Extract the date, byte-change and title of every listed edit
  dates <- xpathSApply(pagetree, "//*[@class='mw-changeslist-date']", xmlValue)

  # all elements with dir='ltr', minus the first and the last one
  # (presumably page furniture rather than edit rows -- TODO confirm)
  ltr_values <- xpathSApply(pagetree, "//*[@dir='ltr']", xmlValue)
  bytes_changed <- ltr_values[c(-1, -length(ltr_values))]

  articles <- xpathSApply(
    pagetree,
    "//*[@class='mw-contributions-title']",
    xmlValue
  )

  # put into data frame
  user_edits[[i]] <- data.frame(
    dates = dates,
    bytes_changed = bytes_changed,
    articles = articles
  )
}
### prepare data
# Characters that decorate the byte-change figures and must be removed before
# they can be converted to numbers: "+", "(", ")" and the thousands separator.
to_match <- c("\\+", "\\(", "\\)", ",")

# Numeric byte changes, one vector per user. lapply() preserves the names of
# user_edits, so no separate names() assignment is needed. (The original
# assigned names to the leftover loop variable bytes_changed before
# overwriting it, which fails whenever that vector is shorter than users.)
bytes_changed <- lapply(user_edits, function(edits) {
  cleaned <- gsub(paste(to_match, collapse = "|"), "", edits$bytes_changed)
  # Negative changes may be rendered with the Unicode minus sign (U+2212),
  # which as.numeric() cannot parse; map it to an ASCII hyphen-minus first.
  as.numeric(gsub("\u2212", "-", cleaned, fixed = TRUE))
})

# Stack the per-user data frames into one long data frame; melt() on a list
# records the list element name (the user) in the column L1.
long <- melt(user_edits)
### What topics did they edit?
library(wordcloud)

# word cloud built from every edited page title
wordcloud(
  long$articles,
  min.freq = 1,
  random.order = FALSE,
  colors = brewer.pal(6, "Dark2")
)

# same cloud again, leaving out titles that contain "sandbox"
no_sandboxes <- long$articles[!grepl("sandbox", long$articles)]
wordcloud(
  no_sandboxes,
  min.freq = 1,
  random.order = FALSE,
  colors = brewer.pal(6, "Dark2")
)
### who added the most text?
# total bytes per user, smallest first
# (vapply() guarantees a numeric vector even for an empty list)
sort(vapply(bytes_changed, sum, numeric(1)))

# plot bytes per edit per user: one jittered point per edit, users ordered by
# their total bytes changed (largest total first)
bytes_changed_stack <- stack(bytes_changed)
bytes_per_edit_plot <-
  ggplot(bytes_changed_stack, aes(reorder(ind, -values, FUN = sum), values)) +
  geom_jitter(
    position = position_jitter(width = .05),
    colour = "grey20",
    size = 2
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
  xlab("") +
  ylab("bytes changed")
bytes_per_edit_plot

# exclude outliers: same plot with the y axis limited to +/- 2500 bytes
bytes_per_edit_plot +
  ylim(-2500, 2500)
### how did editing change over time?
# Convert the byte-change text to numbers: strip "+", "(", ")" and "," (the
# patterns in to_match), and map the Unicode minus sign (U+2212), which
# as.numeric() cannot parse, to an ASCII hyphen-minus.
bytes_txt <- gsub(paste(to_match, collapse = "|"), "", long$bytes_changed)
long$bytes_changed <- as.numeric(gsub("\u2212", "-", bytes_txt, fixed = TRUE))

# Convert dates to a proper date-time format that R can recognise.
# Stored as POSIXct (not the list-based POSIXlt that strptime() returns), which
# is the date-time class suited to data frame columns. Parsing in UTC keeps the
# clock fields exactly as written and avoids daylight-saving gaps.
# NOTE: "%b" matches month abbreviations of the current locale.
long$dates <- as.POSIXct(
  strptime(as.character(long$dates), format = "%H:%M, %d %b %Y", tz = "UTC")
)

# get week of the year for each entry
library(lubridate)
long$week <- week(long$dates)

# plot: bytes changed per week, coloured by user (column L1)
ggplot(long, aes(week, bytes_changed, colour = L1)) +
  geom_jitter(position = position_jitter(width = .05), size = 2) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
  xlab("week of the year") +
  ylab("bytes changed")

# plot with smoother
ggplot(long, aes(week, bytes_changed)) +
  geom_jitter(position = position_jitter(width = .05), size = 2) +
  geom_smooth() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
  xlab("week of the year") +
  ylab("bytes added")

# how did the topics change over time?
# article titles drawn as text, sized by the magnitude of the change
# (the stray trailing comma in the geom_text() call has been removed)
ggplot(long, aes(week, bytes_changed)) +
  geom_text(
    aes(label = articles, size = abs(bytes_changed)),
    position = position_jitter(width = 1, height = 1)
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
  xlab("week of the year") +
  ylab("bytes changed")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment