# tts/f_shortenurl.R

## f_shortenurl.R
####################################################################
#
# Code by Andy Teucher
#
# https://github.com/ateucher/crd_rare_bird_bot/blob/master/fun.R
#
####################################################################

# Shorten a URL with the Bitly v3 "shorten" endpoint.
#
# Arguments:
#   url        - the long URL to shorten. It is fetched once with GET() first
#                and the function stops if that request reports an HTTP error.
#   bitlytoken - Bitly access token, sent as the access_token query parameter.
#
# Returns the short URL found in data$url of the parsed response (a single
# character string). Stops with an error when either HTTP request fails or
# when the response holds no usable data$url.
shorten <- function(url, bitlytoken) {
  # Fail early if the target page itself cannot be retrieved
  stop_for_status(GET(url))

  res <- GET("https://api-ssl.bitly.com/v3/shorten",
             query = list(access_token = bitlytoken, longUrl = url))
  stop_for_status(res)

  con <- content(res)
  short_url <- if (is.list(con) && is.list(con$data)) con$data$url else NULL

  # A response without data$url used to make this function return NULL
  # silently, which only surfaced later as an obscure error in the caller.
  if (!is.character(short_url) || length(short_url) != 1L ||
      is.na(short_url) || !nzchar(short_url)) {
    # NOTE(review): status_txt is assumed to carry Bitly's own error text
    # when present; confirm against the API documentation.
    reason <- if (is.list(con)) con$status_txt else NULL
    detail <- if (is.null(reason)) "" else paste0(" (", reason, ")")
    stop("Bitly returned no short URL for ", url, detail, call. = FALSE)
  }

  short_url
}

## f_tweet.R
# Tweet the not-yet-tweeted items of one Aaltodoc RSS feed.
#
# Arguments:
#   rss  - URL of the RSS feed.
#   type - feed type; "article", "compiledworks", "diss" or "lic" select the
#          tweet prefix "[A]", "[W]", "[D]" or "[L]". The type also names the
#          tweet log file (type followed by "tweets.csv").
#
# Each feed item's page is fetched and its DC.creator, DC.title,
# DCTERMS.abstract and DC.identifier meta tags are read. Items whose title
# start is not found in the tweet log are tweeted (with a word cloud picture
# made by wcloud() when there is an abstract) and appended to the log.
#
# Called for its side effects; returns NULL invisibly.
tweetme <- function(rss, type) {

  bot.dir <- paste0(Sys.getenv("HOME"), "/projektit/aaltodocbot/")
  log.file <- paste0(bot.dir, type, "tweets.csv")

  # http://stackoverflow.com/a/16737731
  feed <- xmlTreeParse(getURL(rss, .opts = (list(ssl.verifypeer = FALSE))))$doc

  pre <- switch(type,
         article = "[A]",
         compiledworks = "[W]",
         diss = "[D]",
         lic = "[L]")

  # Code by Noah Lorang https://github.com/noahhl/r-does-rss/blob/master/R/rss.R
  # Splits the channel of a parsed feed into its header fields and its items.
  parseRSS <- function(content) {
    channel <- xmlToList(content$children$rss[['channel']])
    results <- list()
    results$header <- channel[names(channel) != "item"]
    results$items <- channel[names(channel) == "item"]
    names(results$items) <- rep("item", length(results$items))
    return(results)
  }

  newdocs <- parseRSS(feed)
  N <- length(newdocs$items)

  # Nothing to do for an empty feed (1:N would iterate over c(1, 0) here)
  if (N == 0) {
    return(invisible(NULL))
  }

  resultslist <- vector("list", N)

  for (i in seq_len(N)) {

    url <- newdocs$items[i]$item$link
    page <- html(url)

    # First author only
    au <- page %>%
      html_nodes(xpath = "//meta[@name='DC.creator'][1]") %>%
      html_attr('content')

    # All authors; only their count is kept
    nr.au <- page %>%
      html_nodes(xpath = "//meta[@name='DC.creator']")

    title <- page %>%
      html_node(xpath = "//meta[@name='DC.title']") %>%
      html_attr('content')

    # Hack: if no abstract, save the title instead, and check the value later on.
    # NOTE(review): whether html_node() gives NULL or a missing node (whose
    # attribute is NA) for "no match" seems to depend on the installed rvest
    # version; both outcomes are treated as "no abstract" here - confirm.
    abstract <- NULL
    abs.node <- html_node(page, xpath = "//meta[@name='DCTERMS.abstract']")
    if (!is.null(abs.node)) {
      abstract <- html_attr(abs.node, 'content')
    }
    if (length(abstract) != 1L || is.na(abstract)) {
      abstract <- title
    }

    link <- page %>%
      html_node(xpath = "//meta[@name='DC.identifier' and @scheme='DCTERMS.URI']") %>%
      html_attr('content')

    resultslist[[i]] <- data.frame(au = au, nr.au = length(nr.au), title = title,
                                   abstract = abstract, link = link,
                                   stringsAsFactors = FALSE)
  }

  results <- rbind_all(resultslist)

  # Take only the surname of the (1st) author, i.e. the text before the first comma
  results$au <- vapply(results$au,
                       function(x) strsplit(x, ",")[[1]][1],
                       character(1), USE.NAMES = FALSE)

  # Update dataframe by fetching shortened URLs for links
  r.df <- results %>%
    rowwise() %>%
    mutate(shorturl = shorten(link, bitlytoken = Sys.getenv("BITLY_TOKEN")))

  # Read in the tweet log of this type. A missing or empty log means that
  # nothing has been tweeted yet.
  tweeted.text <- character(0)
  if (file.exists(log.file) && file.info(log.file)$size > 0) {
    tweeted.text <- read.table(log.file, stringsAsFactors = FALSE)[[1]]
  }

  for (i in seq_len(nrow(r.df))) {

    # Already POSTed? The first 15 characters of the title are searched as
    # literal text (fixed = TRUE), so that characters such as "(" or "+" in a
    # title are not interpreted as a regular expression.
    title.start <- substr(r.df$title[i], 1, 15)
    if (any(grepl(title.start, tweeted.text, fixed = TRUE), na.rm = TRUE)) {
      next
    }

    # Name the picture file individually, based on the last part of the link
    file <- paste0(bot.dir, "wordcloud", rev(unlist(strsplit(r.df$link[i], "/")))[1], ".png")

    # The abstract equals the title when the item had no abstract (see the
    # hack above); then there is no word cloud to draw.
    has.abstract <- !is.na(r.df$abstract[i]) &&
      !isTRUE(r.df$abstract[i] == r.df$title[i])
    if (has.abstract) {
      wcloud(r.df$abstract[i], file)
    }

    # How many chars are left in the tweet for the title? The constants cover
    # the fixed parts of the tweet built below, plus the Twitter pic URL (26).
    title.length <- nchar(r.df$title[i])
    chars.left <- as.integer(140 - (nchar(r.df$au[i]) + nchar(r.df$shorturl[i])) - 4 - 7 - 2 - 4 - 26)
    # This much we can say about the title
    title.left <- substr(r.df$title[i], 1, chars.left)

    # Append an ellipsis only if the title had to be cut
    ellipsis <- if (nchar(title.left) >= title.length) "" else "..."
    et.al <- if (r.df$nr.au[i] > 1) " et al." else ""
    tweet.text <- paste0(pre, " ", r.df$au[i], et.al, ": ", title.left, ellipsis, " ", r.df$shorturl[i])

    # POST tweet, with wordcloud if there was an abstract
    if (has.abstract) {
      updateStatus(tweet.text, mediaPath = file)
    } else {
      tweet(tweet.text)
    }

    # Log tweet
    write.table(tweet.text, file = log.file, append = TRUE, row.names = FALSE, col.names = FALSE)
  }

  invisible(NULL)
}

## f_wcloud.R
# Draw a word cloud of a text into a PNG file.
#
# Arguments:
#   x    - character vector with the text (tweetme() passes one abstract).
#   file - path of the PNG file to write (1280 x 800 pixels).
#
# The text is stripped of punctuation, lower-cased, cleaned of English stop
# words and of the extra words in not.these, and stemmed. Terms with a
# frequency of at least 2 are drawn.
#
# Returns the value of dev.off(), as before. The graphics device is closed
# even if drawing fails.
wcloud <- function(x, file) {

  # See Hands-On Data Science with R,Text Mining
  # http://onepager.togaware.com/TextMiningO.pdf

  res.corpus <- Corpus(VectorSource(x))

  res.corpus <- tm_map(res.corpus, removePunctuation)
  res.corpus <- tm_map(res.corpus, content_transformer(tolower))
  not.these <- c("dissertation", "technique", "results", "method",
               "demonstrated", "measurements", "measurement", "studied", "new", "thesis")
  drop.words <- c(not.these, stopwords("english"))
  res.corpus <- tm_map(res.corpus, function(doc) removeWords(doc, drop.words))
  res.corpus <- tm_map(res.corpus, stemDocument)

  # Term frequencies, most frequent first
  tdm <- TermDocumentMatrix(res.corpus)
  v <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)
  pal2 <- brewer.pal(8, "Dark2")

  png(file, width = 1280, height = 800)
  png.dev <- dev.cur()
  # Make sure the PNG device does not stay open if wordcloud() stops
  device.open <- TRUE
  on.exit(if (device.open) dev.off(png.dev), add = TRUE)

  wordcloud(d$word, d$freq, scale = c(8, .2), min.freq = 2, max.words = Inf, rot.per = 0.2, colors = pal2)

  device.open <- FALSE
  dev.off(png.dev)

}

## runtweets.R
#!/usr/bin/Rscript

###############################################
#
# 30.1.2015 Tuija Sonkkila
#
# Tweets new items in Aalto University
# document repository, Aaltodoc. Scheduled to
# run once a week.
#
# https://twitter.com/Aaltodoc/
#
###############################################

library(XML)
library(RCurl)
library(rvest)
library(twitteR)
library(wordcloud)
library(tm)
library(SnowballC)
library(dplyr)
library(httr)

# Load the helper functions from the bot directory
bot.dir <- paste0(Sys.getenv("HOME"), "/projektit/aaltodocbot/")
for (helper in c("f_wcloud.R", "f_shortenurl.R", "f_tweet.R")) {
  source(paste0(bot.dir, helper))
}

# Aaltodoc credentials, read from the environment
consumer_key <- Sys.getenv("AALTODOC_CONSUMER_KEY")
consumer_secret <- Sys.getenv("AALTODOC_CONSUMER_SECRET")
access_token <- Sys.getenv("AALTODOC_ACCESS_TOKEN")
access_secret <- Sys.getenv("AALTODOC_ACCESS_SECRET")

setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

# RSS feeds to tweet, processed in this order:
# articles, dissertations, licentiate theses, compiled works
feeds <- c(
  article = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/79",
  diss = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/5",
  lic = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/4",
  compiledworks = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/15"
)

for (feed.type in names(feeds)) {
  tweetme(feeds[[feed.type]], feed.type)
}

# Tweet test
#
# tweet(paste0("Test", Sys.time()))
	####################################################################
	#
	# Code by Andy Teucher
	#
	# https://github.com/ateucher/crd_rare_bird_bot/blob/master/fun.R
	#
	####################################################################

	# Shorten a URL with the Bitly v3 "shorten" endpoint.
	#
	# Arguments:
	#   url        - the long URL to shorten. It is fetched once with GET() first
	#                and the function stops if that request reports an HTTP error.
	#   bitlytoken - Bitly access token, sent as the access_token query parameter.
	#
	# Returns the short URL found in data$url of the parsed response (a single
	# character string). Stops with an error when either HTTP request fails or
	# when the response holds no usable data$url.
	shorten <- function(url, bitlytoken) {
	  # Fail early if the target page itself cannot be retrieved
	  stop_for_status(GET(url))

	  res <- GET("https://api-ssl.bitly.com/v3/shorten",
	             query = list(access_token = bitlytoken, longUrl = url))
	  stop_for_status(res)

	  con <- content(res)
	  short_url <- if (is.list(con) && is.list(con$data)) con$data$url else NULL

	  # A response without data$url used to make this function return NULL
	  # silently, which only surfaced later as an obscure error in the caller.
	  if (!is.character(short_url) || length(short_url) != 1L ||
	      is.na(short_url) || !nzchar(short_url)) {
	    # NOTE(review): status_txt is assumed to carry Bitly's own error text
	    # when present; confirm against the API documentation.
	    reason <- if (is.list(con)) con$status_txt else NULL
	    detail <- if (is.null(reason)) "" else paste0(" (", reason, ")")
	    stop("Bitly returned no short URL for ", url, detail, call. = FALSE)
	  }

	  short_url
	}
	# Tweet the not-yet-tweeted items of one Aaltodoc RSS feed.
	#
	# Arguments:
	#   rss  - URL of the RSS feed.
	#   type - feed type; "article", "compiledworks", "diss" or "lic" select the
	#          tweet prefix "[A]", "[W]", "[D]" or "[L]". The type also names the
	#          tweet log file (type followed by "tweets.csv").
	#
	# Each feed item's page is fetched and its DC.creator, DC.title,
	# DCTERMS.abstract and DC.identifier meta tags are read. Items whose title
	# start is not found in the tweet log are tweeted (with a word cloud picture
	# made by wcloud() when there is an abstract) and appended to the log.
	#
	# Called for its side effects; returns NULL invisibly.
	tweetme <- function(rss, type) {

	  bot.dir <- paste0(Sys.getenv("HOME"), "/projektit/aaltodocbot/")
	  log.file <- paste0(bot.dir, type, "tweets.csv")

	  # http://stackoverflow.com/a/16737731
	  feed <- xmlTreeParse(getURL(rss, .opts = (list(ssl.verifypeer = FALSE))))$doc

	  pre <- switch(type,
	         article = "[A]",
	         compiledworks = "[W]",
	         diss = "[D]",
	         lic = "[L]")

	  # Code by Noah Lorang https://github.com/noahhl/r-does-rss/blob/master/R/rss.R
	  # Splits the channel of a parsed feed into its header fields and its items.
	  parseRSS <- function(content) {
	    channel <- xmlToList(content$children$rss[['channel']])
	    results <- list()
	    results$header <- channel[names(channel) != "item"]
	    results$items <- channel[names(channel) == "item"]
	    names(results$items) <- rep("item", length(results$items))
	    return(results)
	  }

	  newdocs <- parseRSS(feed)
	  N <- length(newdocs$items)

	  # Nothing to do for an empty feed (1:N would iterate over c(1, 0) here)
	  if (N == 0) {
	    return(invisible(NULL))
	  }

	  resultslist <- vector("list", N)

	  for (i in seq_len(N)) {

	    url <- newdocs$items[i]$item$link
	    page <- html(url)

	    # First author only
	    au <- page %>%
	      html_nodes(xpath = "//meta[@name='DC.creator'][1]") %>%
	      html_attr('content')

	    # All authors; only their count is kept
	    nr.au <- page %>%
	      html_nodes(xpath = "//meta[@name='DC.creator']")

	    title <- page %>%
	      html_node(xpath = "//meta[@name='DC.title']") %>%
	      html_attr('content')

	    # Hack: if no abstract, save the title instead, and check the value later on.
	    # NOTE(review): whether html_node() gives NULL or a missing node (whose
	    # attribute is NA) for "no match" seems to depend on the installed rvest
	    # version; both outcomes are treated as "no abstract" here - confirm.
	    abstract <- NULL
	    abs.node <- html_node(page, xpath = "//meta[@name='DCTERMS.abstract']")
	    if (!is.null(abs.node)) {
	      abstract <- html_attr(abs.node, 'content')
	    }
	    if (length(abstract) != 1L || is.na(abstract)) {
	      abstract <- title
	    }

	    link <- page %>%
	      html_node(xpath = "//meta[@name='DC.identifier' and @scheme='DCTERMS.URI']") %>%
	      html_attr('content')

	    resultslist[[i]] <- data.frame(au = au, nr.au = length(nr.au), title = title,
	                                   abstract = abstract, link = link,
	                                   stringsAsFactors = FALSE)
	  }

	  results <- rbind_all(resultslist)

	  # Take only the surname of the (1st) author, i.e. the text before the first comma
	  results$au <- vapply(results$au,
	                       function(x) strsplit(x, ",")[[1]][1],
	                       character(1), USE.NAMES = FALSE)

	  # Update dataframe by fetching shortened URLs for links
	  r.df <- results %>%
	    rowwise() %>%
	    mutate(shorturl = shorten(link, bitlytoken = Sys.getenv("BITLY_TOKEN")))

	  # Read in the tweet log of this type. A missing or empty log means that
	  # nothing has been tweeted yet.
	  tweeted.text <- character(0)
	  if (file.exists(log.file) && file.info(log.file)$size > 0) {
	    tweeted.text <- read.table(log.file, stringsAsFactors = FALSE)[[1]]
	  }

	  for (i in seq_len(nrow(r.df))) {

	    # Already POSTed? The first 15 characters of the title are searched as
	    # literal text (fixed = TRUE), so that characters such as "(" or "+" in a
	    # title are not interpreted as a regular expression.
	    title.start <- substr(r.df$title[i], 1, 15)
	    if (any(grepl(title.start, tweeted.text, fixed = TRUE), na.rm = TRUE)) {
	      next
	    }

	    # Name the picture file individually, based on the last part of the link
	    file <- paste0(bot.dir, "wordcloud", rev(unlist(strsplit(r.df$link[i], "/")))[1], ".png")

	    # The abstract equals the title when the item had no abstract (see the
	    # hack above); then there is no word cloud to draw.
	    has.abstract <- !is.na(r.df$abstract[i]) &&
	      !isTRUE(r.df$abstract[i] == r.df$title[i])
	    if (has.abstract) {
	      wcloud(r.df$abstract[i], file)
	    }

	    # How many chars are left in the tweet for the title? The constants cover
	    # the fixed parts of the tweet built below, plus the Twitter pic URL (26).
	    title.length <- nchar(r.df$title[i])
	    chars.left <- as.integer(140 - (nchar(r.df$au[i]) + nchar(r.df$shorturl[i])) - 4 - 7 - 2 - 4 - 26)
	    # This much we can say about the title
	    title.left <- substr(r.df$title[i], 1, chars.left)

	    # Append an ellipsis only if the title had to be cut
	    ellipsis <- if (nchar(title.left) >= title.length) "" else "..."
	    et.al <- if (r.df$nr.au[i] > 1) " et al." else ""
	    tweet.text <- paste0(pre, " ", r.df$au[i], et.al, ": ", title.left, ellipsis, " ", r.df$shorturl[i])

	    # POST tweet, with wordcloud if there was an abstract
	    if (has.abstract) {
	      updateStatus(tweet.text, mediaPath = file)
	    } else {
	      tweet(tweet.text)
	    }

	    # Log tweet
	    write.table(tweet.text, file = log.file, append = TRUE, row.names = FALSE, col.names = FALSE)
	  }

	  invisible(NULL)
	}
	# Draw a word cloud of a text into a PNG file.
	#
	# Arguments:
	#   x    - character vector with the text (tweetme() passes one abstract).
	#   file - path of the PNG file to write (1280 x 800 pixels).
	#
	# The text is stripped of punctuation, lower-cased, cleaned of English stop
	# words and of the extra words in not.these, and stemmed. Terms with a
	# frequency of at least 2 are drawn.
	#
	# Returns the value of dev.off(), as before. The graphics device is closed
	# even if drawing fails.
	wcloud <- function(x, file) {

	  # See Hands-On Data Science with R,Text Mining
	  # http://onepager.togaware.com/TextMiningO.pdf

	  res.corpus <- Corpus(VectorSource(x))

	  res.corpus <- tm_map(res.corpus, removePunctuation)
	  res.corpus <- tm_map(res.corpus, content_transformer(tolower))
	  not.these <- c("dissertation", "technique", "results", "method",
	               "demonstrated", "measurements", "measurement", "studied", "new", "thesis")
	  drop.words <- c(not.these, stopwords("english"))
	  res.corpus <- tm_map(res.corpus, function(doc) removeWords(doc, drop.words))
	  res.corpus <- tm_map(res.corpus, stemDocument)

	  # Term frequencies, most frequent first
	  tdm <- TermDocumentMatrix(res.corpus)
	  v <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
	  d <- data.frame(word = names(v), freq = v)
	  pal2 <- brewer.pal(8, "Dark2")

	  png(file, width = 1280, height = 800)
	  png.dev <- dev.cur()
	  # Make sure the PNG device does not stay open if wordcloud() stops
	  device.open <- TRUE
	  on.exit(if (device.open) dev.off(png.dev), add = TRUE)

	  wordcloud(d$word, d$freq, scale = c(8, .2), min.freq = 2, max.words = Inf, rot.per = 0.2, colors = pal2)

	  device.open <- FALSE
	  dev.off(png.dev)

	}
	#!/usr/bin/Rscript

	###############################################
	#
	# 30.1.2015 Tuija Sonkkila
	#
	# Tweets new items in Aalto University
	# document repository, Aaltodoc. Scheduled to
	# run once a week.
	#
	# https://twitter.com/Aaltodoc/
	#
	###############################################

	library(XML)
	library(RCurl)
	library(rvest)
	library(twitteR)
	library(wordcloud)
	library(tm)
	library(SnowballC)
	library(dplyr)
	library(httr)

	# Load the helper functions from the bot directory
	bot.dir <- paste0(Sys.getenv("HOME"), "/projektit/aaltodocbot/")
	for (helper in c("f_wcloud.R", "f_shortenurl.R", "f_tweet.R")) {
	  source(paste0(bot.dir, helper))
	}

	# Aaltodoc credentials, read from the environment
	consumer_key <- Sys.getenv("AALTODOC_CONSUMER_KEY")
	consumer_secret <- Sys.getenv("AALTODOC_CONSUMER_SECRET")
	access_token <- Sys.getenv("AALTODOC_ACCESS_TOKEN")
	access_secret <- Sys.getenv("AALTODOC_ACCESS_SECRET")

	setup_twitter_oauth(consumer_key, consumer_secret, access_token, access_secret)

	# RSS feeds to tweet, processed in this order:
	# articles, dissertations, licentiate theses, compiled works
	feeds <- c(
	  article = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/79",
	  diss = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/5",
	  lic = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/4",
	  compiledworks = "https://aaltodoc.aalto.fi/feed/rss_2.0/123456789/15"
	)

	for (feed.type in names(feeds)) {
	  tweetme(feeds[[feed.type]], feed.type)
	}

	# Tweet test
	#
	# tweet(paste0("Test", Sys.time()))