Skip to content

Instantly share code, notes, and snippets.

@tts
Last active August 29, 2015 14:14
Show Gist options
  • Save tts/c90652e531ebfc3664ed to your computer and use it in GitHub Desktop.
Save tts/c90652e531ebfc3664ed to your computer and use it in GitHub Desktop.
Twitterbot for new items in Aaltodoc, Aalto University document repository.
####################################################################
#
# Code by Andy Teucher
#
# https://github.com/ateucher/crd_rare_bird_bot/blob/master/fun.R
#
####################################################################
# Shorten a URL through the Bitly v3 "shorten" endpoint.
#
# Args:
#   url:        The long URL to shorten.
#   bitlytoken: Bitly access token, sent as the `access_token` query parameter.
#
# Returns:
#   The `data$url` field of the parsed Bitly response.
#
# Both HTTP requests are passed through stop_for_status(), so a failing
# HTTP status on either of them raises an error.
shorten <- function(url, bitlytoken) {
  # Request the long URL first, so a broken link aborts before Bitly is asked.
  probe <- GET(url)
  stop_for_status(probe)

  bitly_params <- list(access_token = bitlytoken, longUrl = url)
  reply <- GET("https://api-ssl.bitly.com/v3/shorten", query = bitly_params)
  stop_for_status(reply)

  content(reply)$data$url
}
# Tweet the items of an Aaltodoc RSS feed that have not been tweeted before.
#
# Every feed item's landing page is downloaded and its Dublin Core <meta>
# tags (DC.creator, DC.title, DCTERMS.abstract, DC.identifier) are read.
# Items whose title prefix is not found in the per-type tweet log are posted
# as "<prefix> <surname>[ et al.]: <title>[...] <short url>", with a word
# cloud of the abstract attached when the page has an abstract, and are then
# appended to the log.
#
# Args:
#   rss:  URL of the RSS 2.0 feed to read.
#   type: Feed type; "article", "compiledworks", "diss" or "lic" select the
#         tweet prefix, and the value names the log file "<type>tweets.csv".
#
# Returns:
#   NULL, invisibly; the function is called for its side effects (tweets,
#   word cloud PNG files, log file).
tweetme <- function(rss, type) {
  bot_dir <- paste0(Sys.getenv("HOME"), "/projektit/aaltodocbot/")
  log_file <- paste0(bot_dir, type, "tweets.csv")

  # http://stackoverflow.com/a/16737731
  # NOTE(review): ssl.verifypeer = FALSE turns off TLS certificate
  # verification for the feed download; kept as in the original, but worth
  # revisiting.
  feed <- xmlTreeParse(getURL(rss, .opts = list(ssl.verifypeer = FALSE)))$doc

  pre <- switch(type,
    article = "[A]",
    compiledworks = "[W]",
    diss = "[D]",
    lic = "[L]"
  )

  # Code by Noah Lorang https://github.com/noahhl/r-does-rss/blob/master/R/rss.R
  parseRSS <- function(content) {
    channel <- xmlToList(content$children$rss[["channel"]])
    results <- list()
    results$header <- channel[names(channel) != "item"]
    results$items <- channel[names(channel) == "item"]
    names(results$items) <- rep("item", length(results$items))
    results
  }

  # `content` attribute of the first node matching `xpath`, or NA when the
  # page has no such node. Counting the matched nodes (instead of testing a
  # single-node lookup for NULL) does not depend on how the installed rvest
  # version represents a missing node.
  meta_content <- function(page, xpath) {
    values <- html_attr(html_nodes(page, xpath = xpath), "content")
    if (length(values) == 0) NA_character_ else values[[1]]
  }

  newdocs <- parseRSS(feed)
  N <- length(newdocs$items)
  if (N == 0) {
    # Empty feed: nothing to tweet (1:N would have iterated over c(1, 0)).
    return(invisible(NULL))
  }

  resultslist <- vector("list", N)
  for (i in seq_len(N)) {
    url <- newdocs$items[[i]]$link
    page <- read_html(url)

    # Take only the surname of the (1st) author
    au <- meta_content(page, "//meta[@name='DC.creator'][1]")
    au <- strsplit(au, ",", fixed = TRUE)[[1]][1]
    nr.au <- length(html_nodes(page, xpath = "//meta[@name='DC.creator']"))
    title <- meta_content(page, "//meta[@name='DC.title']")
    # NA when the page has no abstract; checked again before posting
    abstract <- meta_content(page, "//meta[@name='DCTERMS.abstract']")
    link <- meta_content(
      page,
      "//meta[@name='DC.identifier' and @scheme='DCTERMS.URI']"
    )

    # A tweet needs an author, a title and a link; skip incomplete records
    # instead of failing on them or tweeting "NA".
    if (anyNA(c(au, title, link))) {
      warning("Skipping ", url,
        ": DC.creator, DC.title or DC.identifier is missing",
        call. = FALSE
      )
      next
    }

    resultslist[[i]] <- data.frame(
      au = au, nr.au = nr.au, title = title, abstract = abstract, link = link,
      stringsAsFactors = FALSE
    )
  }

  resultslist <- Filter(Negate(is.null), resultslist)
  if (length(resultslist) == 0) {
    return(invisible(NULL))
  }
  results <- bind_rows(resultslist)

  # Read in the tweet log of this type (absent or empty on the very first run)
  tweeted <- character(0)
  if (file.exists(log_file) && file.info(log_file)$size > 0) {
    tweeted <- as.character(
      read.table(log_file, stringsAsFactors = FALSE)[[1]]
    )
  }

  bitly_token <- Sys.getenv("BITLY_TOKEN")

  for (i in seq_len(nrow(results))) {
    au <- results$au[i]
    title <- results$title[i]
    abstract <- results$abstract[i]
    link <- results$link[i]

    # Already POSTed: the first 15 characters of the title occur in the log.
    # fixed = TRUE because a title is plain text, not a regular expression.
    if (any(grepl(substr(title, 1, 15), tweeted, fixed = TRUE))) {
      next
    }

    # Shorten the link only for items that are actually going to be tweeted
    short_url <- shorten(link, bitlytoken = bitly_token)
    if (!is.character(short_url) || length(short_url) != 1 ||
      is.na(short_url)) {
      stop("Bitly returned no short URL for ", link, call. = FALSE)
    }

    # Name the picture file individually, based on the last path segment of
    # the link
    link_parts <- unlist(strsplit(link, "/"))
    file <- paste0(
      bot_dir, "wordcloud", link_parts[length(link_parts)], ".png"
    )

    # No word cloud when there is no abstract (or it just repeats the title)
    has_abstract <- !is.na(abstract) && abstract != title

    # How many chars are left in the tweet for the title? Note: punctuation
    # chars in the final tweet, plus Twitter pic URL (26)
    chars_left <- as.integer(
      140 - (nchar(au) + nchar(short_url)) - 4 - 7 - 2 - 4 - 26
    )
    # This much we can say about the title
    title_left <- substr(title, 1, chars_left)

    et_al <- if (results$nr.au[i] > 1) " et al." else ""
    # Append an ellipsis only when the title had to be cut
    ellipsis <- if (nchar(title_left) >= nchar(title)) "" else "..."
    tweet_text <- paste0(
      pre, " ", au, et_al, ": ", title_left, ellipsis, " ", short_url
    )

    # POST tweet, with wordcloud if there was an abstract
    if (has_abstract) {
      wcloud(abstract, file)
      updateStatus(tweet_text, mediaPath = file)
    } else {
      tweet(tweet_text)
    }

    # Log tweet
    write.table(tweet_text,
      file = log_file, append = TRUE,
      row.names = FALSE, col.names = FALSE
    )
  }

  invisible(NULL)
}
# Draw a word cloud of a text into a PNG file.
#
# The text is stripped of punctuation, lower-cased, cleaned of English stop
# words and of a fixed list of thesis boilerplate words, and stemmed. The
# resulting term frequencies are plotted with wordcloud() (terms occurring
# at least twice) on a 1280x800 PNG device.
#
# Args:
#   x:    Text to visualise; passed to VectorSource().
#   file: Path of the PNG file to write.
#
# Returns:
#   The value of dev.off(), as before.
wcloud <- function(x, file) {
  # See Hands-On Data Science with R,Text Mining
  # http://onepager.togaware.com/TextMiningO.pdf
  res.corpus <- Corpus(VectorSource(x))
  res.corpus <- tm_map(res.corpus, removePunctuation)
  res.corpus <- tm_map(res.corpus, content_transformer(tolower))
  not.these <- c(
    "dissertation", "technique", "results", "method",
    "demonstrated", "measurements", "measurement", "studied", "new", "thesis"
  )
  res.corpus <- tm_map(
    res.corpus,
    function(x) removeWords(x, c(not.these, stopwords("english")))
  )
  res.corpus <- tm_map(res.corpus, stemDocument)

  tdm <- TermDocumentMatrix(res.corpus)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)

  pal2 <- brewer.pal(8, "Dark2")

  png(file, width = 1280, height = 800)
  # Close the PNG device even when wordcloud() fails, so an error does not
  # leave a graphics device open for the rest of the session.
  device_open <- TRUE
  on.exit(if (device_open) dev.off(), add = TRUE)

  wordcloud(d$word, d$freq,
    scale = c(8, .2), min.freq = 2, max.words = Inf,
    rot.per = 0.2, colors = pal2
  )

  device_open <- FALSE
  dev.off()
}
#!/usr/bin/Rscript
###############################################
#
# 30.1.2015 Tuija Sonkkila
#
# Tweets new items in Aalto University
# document repository, Aaltodoc. Scheduled to
# run once a week.
#
# https://twitter.com/Aaltodoc/
#
###############################################
library(XML)
library(RCurl)
library(rvest)
library(twitteR)
library(wordcloud)
library(tm)
library(SnowballC)
library(dplyr)
library(httr)
# Load the bot's helper scripts from the project directory, in this order.
invisible(lapply(
  c("f_wcloud.R", "f_shortenurl.R", "f_tweet.R"),
  function(helper) {
    source(paste0(Sys.getenv("HOME"), "/projektit/aaltodocbot/", helper))
  }
))

# Aaltodoc credentials, read from the environment
consumer_key <- Sys.getenv("AALTODOC_CONSUMER_KEY")
consumer_secret <- Sys.getenv("AALTODOC_CONSUMER_SECRET")
access_token <- Sys.getenv("AALTODOC_ACCESS_TOKEN")
access_secret <- Sys.getenv("AALTODOC_ACCESS_SECRET")
setup_twitter_oauth(
  consumer_key = consumer_key,
  consumer_secret = consumer_secret,
  access_token = access_token,
  access_secret = access_secret
)

# One RSS feed per document type, processed in the order listed
local({
  feeds <- c(
    # Articles RSS
    article = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/79",
    # Dissertations RSS
    diss = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/5",
    # Licentiate theses RSS
    lic = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/4",
    # Compiled works RSS
    compiledworks = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/15"
  )
  for (feed_type in names(feeds)) {
    tweetme(feeds[[feed_type]], feed_type)
  }
})
# Tweet test
#
# tweet(paste0("Test", Sys.time()))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment