Last active
August 29, 2015 14:14
-
-
Save tts/c90652e531ebfc3664ed to your computer and use it in GitHub Desktop.
Twitterbot for new items in Aaltodoc, Aalto University document repository.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#################################################################### | |
# | |
# Code by Andy Teucher | |
# | |
# https://github.com/ateucher/crd_rare_bird_bot/blob/master/fun.R | |
# | |
#################################################################### | |
# Shorten a URL with the Bitly v3 API.
#
# Args:
#   url:        The long URL to shorten. It is fetched once first, so an
#               unreachable URL fails here instead of being shortened.
#   bitlytoken: Bitly access token, sent as the `access_token` query parameter.
#
# Returns:
#   The short URL taken from the `data$url` field of the Bitly response.
#
# Raises:
#   An error if the long URL or the Bitly request returns an HTTP error
#   status, or if the Bitly response carries no short URL.
shorten <- function(url, bitlytoken) {
  # Verify the target is reachable before asking Bitly to shorten it
  stop_for_status(GET(url))

  res <- GET(
    "https://api-ssl.bitly.com/v3/shorten",
    query = list(access_token = bitlytoken, longUrl = url)
  )
  stop_for_status(res)

  con <- content(res)
  data <- if (is.list(con)) con[["data"]] else NULL
  short_url <- if (is.list(data)) data[["url"]] else NULL

  # A failure reported only inside the response body leaves `data$url` empty;
  # fail loudly here rather than hand NULL back to the caller.
  if (is.null(short_url)) {
    status <- if (is.list(con)) con[["status_txt"]] else NULL
    if (is.null(status)) {
      status <- "unknown error"
    }
    stop("Bitly returned no short URL for ", url, ": ", status, call. = FALSE)
  }

  short_url
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tweet the items of an Aaltodoc RSS feed that have not been tweeted yet.
#
# For every feed item the document page is scraped for its Dublin Core
# metadata (first author, number of authors, title, abstract, permanent link).
# Items whose title prefix is not found in the per-type tweet log are tweeted
# as "<prefix> <surname>[ et al.]: <title> <short url>", with a wordcloud of
# the abstract attached when an abstract exists, and then appended to the log.
#
# Args:
#   rss:  URL of the RSS feed to read.
#   type: Collection type: "article", "compiledworks", "diss" or "lic".
#         Selects the tweet prefix and the log file "<type>tweets.csv".
#
# Returns:
#   NULL, invisibly. Called for its side effects (tweets, PNG files, log file).
tweetme <- function(rss, type) {
  bot_dir <- paste0(Sys.getenv("HOME"), "/projektit/aaltodocbot/")
  log_file <- paste0(bot_dir, type, "tweets.csv")
  bitly_token <- Sys.getenv("BITLY_TOKEN")

  # http://stackoverflow.com/a/16737731
  # NOTE(review): ssl.verifypeer = FALSE switches off certificate verification
  # for the feed download; kept from the original workaround.
  feed <- xmlTreeParse(getURL(rss, .opts = list(ssl.verifypeer = FALSE)))$doc

  pre <- switch(type,
    article = "[A]",
    compiledworks = "[W]",
    diss = "[D]",
    lic = "[L]"
  )

  # Code by Noah Lorang https://github.com/noahhl/r-does-rss/blob/master/R/rss.R
  parseRSS <- function(content) {
    channel <- xmlToList(content$children$rss[["channel"]])
    results <- list()
    results$header <- channel[names(channel) != "item"]
    results$items <- channel[names(channel) == "item"]
    names(results$items) <- rep("item", length(results$items))
    return(results)
  }

  # `content` attribute of the first node matching `xpath`, or NA if none
  meta_content <- function(page, xpath) {
    nodes <- html_nodes(page, xpath = xpath)
    if (length(nodes) == 0) {
      return(NA_character_)
    }
    html_attr(nodes, "content")[1]
  }

  # Scrape one feed item's document page into a one-row data frame
  scrape_item <- function(item) {
    # NOTE(review): later rvest releases replace html() with read_html();
    # confirm against the installed rvest version.
    page <- html(item$link)
    creators <- html_nodes(page, xpath = "//meta[@name='DC.creator']")
    first_author <- if (length(creators) > 0) {
      html_attr(creators, "content")[1]
    } else {
      NA_character_
    }
    data.frame(
      # Take only the surname of the (1st) author
      au = unlist(strsplit(first_author, ","))[1],
      nr.au = length(creators),
      title = meta_content(page, "//meta[@name='DC.title']"),
      # NA when the document has no abstract
      abstract = meta_content(page, "//meta[@name='DCTERMS.abstract']"),
      link = meta_content(
        page, "//meta[@name='DC.identifier' and @scheme='DCTERMS.URI']"
      ),
      stringsAsFactors = FALSE
    )
  }

  newdocs <- parseRSS(feed)
  if (length(newdocs$items) == 0) {
    return(invisible(NULL))
  }
  results <- bind_rows(lapply(unname(newdocs$items), scrape_item))

  # An item without author, title or link cannot be formatted or de-duplicated
  complete <- !is.na(results$au) & !is.na(results$title) & !is.na(results$link)
  if (any(!complete)) {
    warning(sum(!complete), " feed item(s) skipped: incomplete metadata",
            call. = FALSE)
  }
  results <- results[complete, ]

  # Read in the tweet log of this type (absent or empty on the 1st run)
  tweeted <- character(0)
  if (file.exists(log_file) && isTRUE(file.info(log_file)$size > 0)) {
    tweeted <- read.table(log_file, stringsAsFactors = FALSE)[[1]]
  }

  for (i in seq_len(nrow(results))) {
    doc <- results[i, ]

    # Already tweeted? Match the title prefix literally, not as a regex.
    if (any(grepl(substr(doc$title, 1, 15), tweeted, fixed = TRUE))) {
      next
    }

    short_url <- shorten(doc$link, bitlytoken = bitly_token)

    # Name the picture file individually, based on the last part of the link
    file <- paste0(bot_dir, "wordcloud",
                   rev(unlist(strsplit(doc$link, "/")))[1], ".png")

    # Draw the wordcloud only when there is an abstract; if drawing fails,
    # fall back to a text-only tweet instead of aborting the whole run.
    has_cloud <- FALSE
    if (!is.na(doc$abstract)) {
      has_cloud <- tryCatch(
        {
          wcloud(doc$abstract, file)
          TRUE
        },
        error = function(e) {
          warning("No wordcloud for ", doc$link, ": ", conditionMessage(e),
                  call. = FALSE)
          FALSE
        }
      )
    }

    # How many chars left in the tweet for the title? Note: punctuation chars
    # in the final tweet, plus Twitter pic URL (26)
    chars.left <- as.integer(
      140 - (nchar(doc$au) + nchar(short_url)) - 4 - 7 - 2 - 4 - 26
    )
    title.left <- substr(doc$title, 1, chars.left)

    # Append an ellipsis only when the title had to be cut
    separator <- if (nchar(title.left) >= nchar(doc$title)) " " else "... "
    et.al <- if (doc$nr.au > 1) " et al." else ""
    tweet.text <- paste0(pre, " ", doc$au, et.al, ": ",
                         title.left, separator, short_url)

    # POST tweet, with wordcloud if one was drawn
    if (has_cloud) {
      updateStatus(tweet.text, mediaPath = file)
    } else {
      tweet(tweet.text)
    }

    # Log tweet
    write.table(tweet.text, file = log_file, append = TRUE,
                row.names = FALSE, col.names = FALSE)
  }

  invisible(NULL)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Draw a wordcloud of a text into a PNG file.
#
# The text is stripped of punctuation, lower-cased, cleaned of English stop
# words and a few generic thesis words, and stemmed; the remaining terms are
# plotted sized by their frequency (only terms occurring at least twice).
#
# See Hands-On Data Science with R, Text Mining
# http://onepager.togaware.com/TextMiningO.pdf
#
# Args:
#   x:    Character vector holding the text (here: a document abstract).
#   file: Path of the PNG file to write (1280 x 800).
#
# Returns:
#   The value of dev.off() after the picture has been written.
wcloud <- function(x, file) {
  not.these <- c("dissertation", "technique", "results", "method",
                 "demonstrated", "measurements", "measurement", "studied",
                 "new", "thesis")

  res.corpus <- Corpus(VectorSource(x))
  res.corpus <- tm_map(res.corpus, removePunctuation)
  res.corpus <- tm_map(res.corpus, content_transformer(tolower))
  res.corpus <- tm_map(res.corpus, removeWords,
                       c(not.these, stopwords("english")))
  res.corpus <- tm_map(res.corpus, stemDocument)

  # Term frequencies, most frequent first
  tdm <- TermDocumentMatrix(res.corpus)
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)

  pal2 <- brewer.pal(8, "Dark2")

  png(file, width = 1280, height = 800)
  # Close the device even if wordcloud() fails, so it is never left open
  device.open <- TRUE
  on.exit(if (device.open) dev.off(), add = TRUE)

  wordcloud(names(freq), freq, scale = c(8, .2), min.freq = 2,
            max.words = Inf, rot.per = 0.2, colors = pal2)

  device.open <- FALSE
  dev.off()
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/Rscript | |
############################################### | |
# | |
# 30.1.2015 Tuija Sonkkila | |
# | |
# Tweets new items in Aalto University | |
# document repository, Aaltodoc. Scheduled to | |
# run once a week. | |
# | |
# https://twitter.com/Aaltodoc/ | |
# | |
############################################### | |
library(XML) | |
library(RCurl) | |
library(rvest) | |
library(twitteR) | |
library(wordcloud) | |
library(tm) | |
library(SnowballC) | |
library(dplyr) | |
library(httr) | |
# Load the bot's helper functions, in the original order
invisible(lapply(
  paste0(
    Sys.getenv("HOME"), "/projektit/aaltodocbot/",
    c("f_wcloud.R", "f_shortenurl.R", "f_tweet.R")
  ),
  source
))

# Aaltodoc credentials, read from the environment
consumer_key <- Sys.getenv("AALTODOC_CONSUMER_KEY")
consumer_secret <- Sys.getenv("AALTODOC_CONSUMER_SECRET")
access_token <- Sys.getenv("AALTODOC_ACCESS_TOKEN")
access_secret <- Sys.getenv("AALTODOC_ACCESS_SECRET")

setup_twitter_oauth(
  consumer_key,
  consumer_secret,
  access_token,
  access_secret
)

# RSS feed per collection type: articles, dissertations, licentiate theses,
# compiled works. Processed in this order.
feeds <- c(
  article = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/79",
  diss = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/5",
  lic = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/4",
  compiledworks = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/15"
)

for (feed_type in names(feeds)) {
  tweetme(feeds[[feed_type]], feed_type)
}
# Tweet test | |
# | |
# tweet(paste0("Test", Sys.time())) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment