# benmarwick/tweet-edits-to-archaeology-articles.R

## tweet-edits-to-archaeology-articles.R


# Get recent changes from Wikipedia: download the "Recent changes" page
# (last day, up to `n_changes` entries) and collect the text and the first
# link of every element with class "mw-changeslist-human".
library(rvest)
n_changes <- 5000
recent_changes_url <- paste0(
  "https://en.wikipedia.org/w/index.php?title=Special:RecentChanges&limit=",
  n_changes,
  "&days=1"
)

# connect to website; the parsed page is reused by both extractions below
html <- read_html(recent_changes_url)

# full text of change log
recent_changes_txt <-
  html %>%
  html_nodes(".mw-changeslist-human") %>%
  html_text()

length(recent_changes_txt)

# URLs of diff pages (href of the first link in each entry)
recent_changes_urls <-
  html %>%
  html_nodes(".mw-changeslist-human > a:nth-child(1)") %>%
  html_attr("href")

length(recent_changes_urls)

# full URL: a leading slash on the href is dropped first, so joining it to
# the host cannot produce "https://en.wikipedia.org//..."
recent_changes_urls <- paste0(
  "https://en.wikipedia.org/",
  sub("^/", "", recent_changes_urls)
)

# Get the title of the article from the text of each change-log entry by
# stripping the change-log decoration around it.
# gsub() already leaves non-matching elements untouched, so the former
# ifelse(grepl(...), gsub(...), .) wrappers were redundant; they also turned
# an empty input into logical(0), which made `Encoding<-` below fail.
diff_or_tail_pattern <- "\\(diff.* . .  |;.*"
recent_changes_title <- gsub(diff_or_tail_pattern, "", recent_changes_txt)
# the original pipeline applied this pattern twice; the second pass is kept
recent_changes_title <- gsub(diff_or_tail_pattern, "", recent_changes_title)
# drop the " . . m" marker
recent_changes_title <- gsub(" \\. \\. m", "", recent_changes_title)
# drop a remaining "(diff ... hist) " prefix
recent_changes_title <- gsub("\\(diff.*hist\\) ", "", recent_changes_title)

# remove the left-to-right mark (U+200E)
Encoding(recent_changes_title) <- "UTF-8"
recent_changes_title <- gsub("\u200E", "", recent_changes_title)

length(recent_changes_title)


# combine diff data into a data frame
library(dplyr)
# tibble() replaces the deprecated data_frame(); the short-URL column starts
# as a character NA because it is later filled with strings
recent_changes_df <- tibble(
  diff_title = recent_changes_title,
  diff_txt = recent_changes_txt,
  diff_url = recent_changes_urls,
  diff_url_short = NA_character_
)

# filter so we only see rows whose title mentions archaeology

recent_changes_df_archae <-
  recent_changes_df %>%
  filter(grepl("archaeol|Archaeol", diff_title))

recent_changes_df_archae

# if there are many articles, make one tweet per article

library(urlshorteneR)
library(twitteR)

# Twitter credentials are read from environment variables so that real
# secrets never have to be written into this script; each one falls back to
# the "xxx" placeholder when its variable is not set.
twitter_api_key <- Sys.getenv("TWITTER_API_KEY", unset = "xxx")
twitter_api_secret <- Sys.getenv("TWITTER_API_SECRET", unset = "xxx")
twitter_access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN", unset = "xxx")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET", unset = "xxx")


# store prepared tweets; character() keeps the vector's type stable even when
# no rows matched (vector(length = n) would create a logical vector)
n_archae <- nrow(recent_changes_df_archae)
tweet_text <- character(n_archae)

# loop over each row of recent_changes_df_archae to make a tweet for each
# edited article
for (i in seq_len(n_archae)) {

  # make a short url for the diff page
  recent_changes_df_archae$diff_url_short[i] <-
    isgd_LinksShorten(longUrl = recent_changes_df_archae$diff_url[i])

  # compose a tweet from the article title and the short url
  tweet_text[i] <- paste0(
    "The Wikipedia article '",
    recent_changes_df_archae$diff_title[i],
    "' was just edited. See the change here: ",
    recent_changes_df_archae$diff_url_short[i]
  )
}

# post to twitter
# https://twitter.com/archaeoledits

# authenticate once, then send every prepared tweet in order
setup_twitter_oauth(
  twitter_api_key,
  twitter_api_secret,
  twitter_access_token,
  access_token_secret
)

for (status in tweet_text) {
  tweet(status)
}


# setup a task in Windows to run this script automatically:
# http://www.r-datacollection.com/blog/Programming-a-Twitter-bot/


## wiki-edit-scrape.R
library(RCurl)
library(XML)
library(ggplot2)
devtools::install_github("hadley/reshape")
library(reshape2)


# Wikipedia user names whose contribution pages are scraped; they are pasted
# into the URL as-is (note the percent-encoded space in "Z.%20PUPU")
users <- c("Lizzy8127", "Arctickinkajou", "Rsuwsearch", "Hurlej", "Johnhart151",
           "Rocket027", "Z.%20PUPU", "Cookingtheworld", "piesquared93", "Rjmath",
           "Liann2009")
# one data frame of edits per user, filled in by the loop below
user_edits <- vector("list", length = length(users))
names(user_edits) <- users

# seq_along() instead of 1:length() so an empty `users` runs zero iterations
for (i in seq_along(users)) {
  print(i)
  # https is requested directly: Wikipedia redirects plain-http requests and
  # getURL() does not follow redirects unless told to
  theurl <- paste0("https://en.wikipedia.org/w/index.php?title=Special:Contributions&offset=&limit=5000&contribs=user&target=", users[i])
  webpage <- getURL(theurl)
  # split the response into lines
  tc <- textConnection(webpage)
  webpage <- readLines(tc)
  close(tc)
  # Parse the html tree, ignoring errors on the page
  pagetree <- htmlTreeParse(webpage, error = function(...) {}, useInternalNodes = TRUE)
  # Extract the columns of interest
  dates <- xpathSApply(pagetree, "//*[@class='mw-changeslist-date']", xmlValue)
  # query the dir='ltr' nodes once, then drop the first and the last match
  ltr_values <- xpathSApply(pagetree, "//*[@dir='ltr']", xmlValue)
  bytes_changed <- ltr_values[c(-1, -length(ltr_values))]
  articles <- xpathSApply(pagetree, "//*[@class='mw-contributions-title']",
                          xmlValue)
  # put into data frame
  user_edits[[i]] <- data.frame(dates = dates, bytes_changed = bytes_changed,
                                articles = articles)
}

### prepare data
# characters stripped from the scraped byte counts: "+", "(", ")" and ","
to_match <- c("\\+", "\\(", "\\)", ",")
# numeric byte counts per user. lapply() keeps the names of user_edits, so no
# separate names() assignment is needed; the former one targeted the vector
# left over from the scraping loop and could fail on a length mismatch.
# NOTE(review): if the page renders negative counts with a Unicode minus sign
# (U+2212), as.numeric() returns NA for them -- confirm against live output.
bytes_changed <- lapply(
  user_edits,
  function(edits) {
    as.numeric(gsub(paste(to_match, collapse = "|"), "", edits$bytes_changed))
  }
)
# one long data frame covering all users
long <- melt(user_edits)

### What topics did they edit?
library(wordcloud)
# one palette shared by both clouds
cloud_colours <- brewer.pal(6, "Dark2")
wordcloud(long$articles, random.order = FALSE, min.freq = 1, colors = cloud_colours)

# same cloud with the sandbox pages left out
is_sandbox <- grepl("sandbox", long$articles)
no_sandboxes <- long$articles[!is_sandbox]
wordcloud(no_sandboxes, random.order = FALSE, min.freq = 1, colors = cloud_colours)

### who added the most text?
# total bytes per user; vapply() always yields a numeric vector, whereas
# sapply() changes its return type with the shape of the input
sort(vapply(bytes_changed, sum, numeric(1)))

# plot bytes per edit per user, users ordered by decreasing total
bytes_changed_stack <- stack(bytes_changed)
bytes_per_edit_plot <-
  ggplot(bytes_changed_stack, aes(reorder(ind, -values, FUN = sum), values)) +
  geom_jitter(position = position_jitter(width = .05), colour = "grey20", size = 2) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
  xlab("") +
  ylab("bytes changed")
bytes_per_edit_plot

# exclude outliers: the same plot with the y axis limited to +/- 2500
bytes_per_edit_plot +
  ylim(-2500, 2500)

### how did editing change over time?
# strip "+", "(", ")" and "," from the byte counts and make them numeric
long$bytes_changed <- as.numeric(gsub(paste(to_match, collapse = "|"), "", long$bytes_changed))
# convert dates to a proper date-time class: stored as POSIXct rather than
# the list-based POSIXlt returned by strptime(), and parsed as UTC so clock
# times that fall in a local daylight-saving gap are not turned into NA
long$dates <- as.POSIXct(
  strptime(as.character(long$dates), format = "%H:%M, %d %b %Y", tz = "UTC")
)
# get week of the year for each entry
library(lubridate)
long$week <- week(long$dates)

# theme and x-axis label shared by both weekly plots
weekly_plot_style <- list(
  theme_minimal(),
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)),
  xlab("week of the year")
)

# plot: bytes changed per week, coloured by the L1 column
ggplot(long, aes(week, bytes_changed, colour = L1)) +
  geom_jitter(position = position_jitter(width = .05), size = 2) +
  weekly_plot_style +
  ylab("bytes changed")

# plot with smoother: same points without colour, plus a smoothed trend
ggplot(long, aes(week, bytes_changed)) +
  geom_jitter(position = position_jitter(width = .05), size = 2) +
  geom_smooth() +
  weekly_plot_style +
  ylab("bytes added")


# how did the topics change over time?
# article titles placed by week and bytes changed, sized by the magnitude of
# the change and jittered; the dangling trailing comma that passed an empty
# argument to geom_text() has been removed
ggplot(long, aes(week, bytes_changed)) +
  geom_text(
    aes(label = articles, size = abs(bytes_changed)),
    position = position_jitter(width = 1, height = 1)
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
  xlab("week of the year") +
  ylab("bytes changed")


	# Get recent changes from Wikipedia: download the "Recent changes" page
	# (last day, up to `n_changes` entries) and collect the text and the first
	# link of every element with class "mw-changeslist-human".
	library(rvest)
	n_changes <- 5000
	recent_changes_url <- paste0(
	  "https://en.wikipedia.org/w/index.php?title=Special:RecentChanges&limit=",
	  n_changes,
	  "&days=1"
	)

	# connect to website; the parsed page is reused by both extractions below
	html <- read_html(recent_changes_url)

	# full text of change log
	recent_changes_txt <-
	  html %>%
	  html_nodes(".mw-changeslist-human") %>%
	  html_text()

	length(recent_changes_txt)

	# URLs of diff pages (href of the first link in each entry)
	recent_changes_urls <-
	  html %>%
	  html_nodes(".mw-changeslist-human > a:nth-child(1)") %>%
	  html_attr("href")

	length(recent_changes_urls)

	# full URL: a leading slash on the href is dropped first, so joining it to
	# the host cannot produce "https://en.wikipedia.org//..."
	recent_changes_urls <- paste0(
	  "https://en.wikipedia.org/",
	  sub("^/", "", recent_changes_urls)
	)

	# Get the title of the article from the text of each change-log entry by
	# stripping the change-log decoration around it.
	# The alternation is written as a plain "|": the former "\|" is not a valid
	# escape in an R string and stopped the file from parsing. The pattern also
	# has its two trailing spaces back, matching the first copy of this script.
	# gsub() already leaves non-matching elements untouched, so the former
	# ifelse(grepl(...), gsub(...), .) wrappers were redundant; they also turned
	# an empty input into logical(0), which made `Encoding<-` below fail.
	diff_or_tail_pattern <- "\\(diff.* . .  |;.*"
	recent_changes_title <- gsub(diff_or_tail_pattern, "", recent_changes_txt)
	# the original pipeline applied this pattern twice; the second pass is kept
	recent_changes_title <- gsub(diff_or_tail_pattern, "", recent_changes_title)
	# drop the " . . m" marker
	recent_changes_title <- gsub(" \\. \\. m", "", recent_changes_title)
	# drop a remaining "(diff ... hist) " prefix
	recent_changes_title <- gsub("\\(diff.*hist\\) ", "", recent_changes_title)

	# remove the left-to-right mark (U+200E)
	Encoding(recent_changes_title) <- "UTF-8"
	recent_changes_title <- gsub("\u200E", "", recent_changes_title)

	length(recent_changes_title)


	# combine diff data into a data frame
	library(dplyr)
	# tibble() replaces the deprecated data_frame(); the short-URL column starts
	# as a character NA because it is later filled with strings
	recent_changes_df <- tibble(
	  diff_title = recent_changes_title,
	  diff_txt = recent_changes_txt,
	  diff_url = recent_changes_urls,
	  diff_url_short = NA_character_
	)

	# filter so we only see rows whose title mentions archaeology; the
	# alternation is a plain "|" because "\|" is not a valid escape in an R
	# string and stopped the file from parsing

	recent_changes_df_archae <-
	  recent_changes_df %>%
	  filter(grepl("archaeol|Archaeol", diff_title))

	recent_changes_df_archae

	# if there are many articles, make one tweet per article

	library(urlshorteneR)
	library(twitteR)

	# Twitter credentials are read from environment variables so that real
	# secrets never have to be written into this script; each one falls back to
	# the "xxx" placeholder when its variable is not set.
	twitter_api_key <- Sys.getenv("TWITTER_API_KEY", unset = "xxx")
	twitter_api_secret <- Sys.getenv("TWITTER_API_SECRET", unset = "xxx")
	twitter_access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN", unset = "xxx")
	access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET", unset = "xxx")


	# store prepared tweets; character() keeps the vector's type stable even when
	# no rows matched (vector(length = n) would create a logical vector)
	n_archae <- nrow(recent_changes_df_archae)
	tweet_text <- character(n_archae)

	# loop over each row of recent_changes_df_archae to make a tweet for each
	# edited article
	for (i in seq_len(n_archae)) {

	  # make a short url for the diff page
	  recent_changes_df_archae$diff_url_short[i] <-
	    isgd_LinksShorten(longUrl = recent_changes_df_archae$diff_url[i])

	  # compose a tweet from the article title and the short url
	  tweet_text[i] <- paste0(
	    "The Wikipedia article '",
	    recent_changes_df_archae$diff_title[i],
	    "' was just edited. See the change here: ",
	    recent_changes_df_archae$diff_url_short[i]
	  )
	}

	# post to twitter
	# https://twitter.com/archaeoledits

	# authenticate once, then send every prepared tweet in order
	setup_twitter_oauth(
	  twitter_api_key,
	  twitter_api_secret,
	  twitter_access_token,
	  access_token_secret
	)

	for (status in tweet_text) {
	  tweet(status)
	}


	# setup a task in Windows to run this script automatically:
	# http://www.r-datacollection.com/blog/Programming-a-Twitter-bot/
	library(RCurl)
	library(XML)
	library(ggplot2)
	devtools::install_github("hadley/reshape")
	library(reshape2)


	# Wikipedia user names whose contribution pages are scraped; they are pasted
	# into the URL as-is (note the percent-encoded space in "Z.%20PUPU")
	users <- c("Lizzy8127", "Arctickinkajou", "Rsuwsearch", "Hurlej", "Johnhart151",
	           "Rocket027", "Z.%20PUPU", "Cookingtheworld", "piesquared93", "Rjmath",
	           "Liann2009")
	# one data frame of edits per user, filled in by the loop below
	user_edits <- vector("list", length = length(users))
	names(user_edits) <- users

	# seq_along() instead of 1:length() so an empty `users` runs zero iterations
	for (i in seq_along(users)) {
	  print(i)
	  # https is requested directly: Wikipedia redirects plain-http requests and
	  # getURL() does not follow redirects unless told to
	  theurl <- paste0("https://en.wikipedia.org/w/index.php?title=Special:Contributions&offset=&limit=5000&contribs=user&target=", users[i])
	  webpage <- getURL(theurl)
	  # split the response into lines
	  tc <- textConnection(webpage)
	  webpage <- readLines(tc)
	  close(tc)
	  # Parse the html tree, ignoring errors on the page
	  pagetree <- htmlTreeParse(webpage, error = function(...) {}, useInternalNodes = TRUE)
	  # Extract the columns of interest
	  dates <- xpathSApply(pagetree, "//*[@class='mw-changeslist-date']", xmlValue)
	  # query the dir='ltr' nodes once, then drop the first and the last match
	  ltr_values <- xpathSApply(pagetree, "//*[@dir='ltr']", xmlValue)
	  bytes_changed <- ltr_values[c(-1, -length(ltr_values))]
	  articles <- xpathSApply(pagetree, "//*[@class='mw-contributions-title']",
	                          xmlValue)
	  # put into data frame
	  user_edits[[i]] <- data.frame(dates = dates, bytes_changed = bytes_changed,
	                                articles = articles)
	}

	### prepare data
	# characters stripped from the scraped byte counts: "+", "(", ")" and ","
	to_match <- c("\\+", "\\(", "\\)", ",")
	# numeric byte counts per user. The patterns are joined with a plain "|":
	# the former "\|" is not a valid escape in an R string and stopped the file
	# from parsing. lapply() keeps the names of user_edits, so no separate
	# names() assignment is needed; the former one targeted the vector left over
	# from the scraping loop and could fail on a length mismatch.
	# NOTE(review): if the page renders negative counts with a Unicode minus sign
	# (U+2212), as.numeric() returns NA for them -- confirm against live output.
	bytes_changed <- lapply(
	  user_edits,
	  function(edits) {
	    as.numeric(gsub(paste(to_match, collapse = "|"), "", edits$bytes_changed))
	  }
	)
	# one long data frame covering all users
	long <- melt(user_edits)

	### What topics did they edit?
	library(wordcloud)
	# one palette shared by both clouds
	cloud_colours <- brewer.pal(6, "Dark2")
	wordcloud(long$articles, random.order = FALSE, min.freq = 1, colors = cloud_colours)

	# same cloud with the sandbox pages left out
	is_sandbox <- grepl("sandbox", long$articles)
	no_sandboxes <- long$articles[!is_sandbox]
	wordcloud(no_sandboxes, random.order = FALSE, min.freq = 1, colors = cloud_colours)

	### who added the most text?
	# total bytes per user; vapply() always yields a numeric vector, whereas
	# sapply() changes its return type with the shape of the input
	sort(vapply(bytes_changed, sum, numeric(1)))

	# plot bytes per edit per user, users ordered by decreasing total
	bytes_changed_stack <- stack(bytes_changed)
	bytes_per_edit_plot <-
	  ggplot(bytes_changed_stack, aes(reorder(ind, -values, FUN = sum), values)) +
	  geom_jitter(position = position_jitter(width = .05), colour = "grey20", size = 2) +
	  theme_minimal() +
	  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
	  xlab("") +
	  ylab("bytes changed")
	bytes_per_edit_plot

	# exclude outliers: the same plot with the y axis limited to +/- 2500
	bytes_per_edit_plot +
	  ylim(-2500, 2500)

	### how did editing change over time?
	# strip "+", "(", ")" and "," from the byte counts and make them numeric; the
	# patterns are joined with a plain "|" because "\|" is not a valid escape in
	# an R string and stopped the file from parsing
	long$bytes_changed <- as.numeric(gsub(paste(to_match, collapse = "|"), "", long$bytes_changed))
	# convert dates to a proper date-time class: stored as POSIXct rather than
	# the list-based POSIXlt returned by strptime(), and parsed as UTC so clock
	# times that fall in a local daylight-saving gap are not turned into NA
	long$dates <- as.POSIXct(
	  strptime(as.character(long$dates), format = "%H:%M, %d %b %Y", tz = "UTC")
	)
	# get week of the year for each entry
	library(lubridate)
	long$week <- week(long$dates)

	# theme and x-axis label shared by both weekly plots
	weekly_plot_style <- list(
	  theme_minimal(),
	  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)),
	  xlab("week of the year")
	)

	# plot: bytes changed per week, coloured by the L1 column
	ggplot(long, aes(week, bytes_changed, colour = L1)) +
	  geom_jitter(position = position_jitter(width = .05), size = 2) +
	  weekly_plot_style +
	  ylab("bytes changed")

	# plot with smoother: same points without colour, plus a smoothed trend
	ggplot(long, aes(week, bytes_changed)) +
	  geom_jitter(position = position_jitter(width = .05), size = 2) +
	  geom_smooth() +
	  weekly_plot_style +
	  ylab("bytes added")


	# how did the topics change over time?
	# article titles placed by week and bytes changed, sized by the magnitude of
	# the change and jittered; the dangling trailing comma that passed an empty
	# argument to geom_text() has been removed
	ggplot(long, aes(week, bytes_changed)) +
	  geom_text(
	    aes(label = articles, size = abs(bytes_changed)),
	    position = position_jitter(width = 1, height = 1)
	  ) +
	  theme_minimal() +
	  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 0)) +
	  xlab("week of the year") +
	  ylab("bytes changed")