# jbryer/GoogleReader.R

## GoogleReader.R
require(XML)
require(RCurl)

#' Retrieve the XML tree of an RSS feed through the Google Reader API.
#'
#' This function utilizes the (unofficial) Google Reader API to retrieve RSS
#' feeds. The advantage of accessing RSS feeds through the Google Reader API is
#' that you are not limited by the number of entries a website may include in
#' its feed. That is, Google generally maintains a complete history of entries
#' from the RSS feed.
#'
#' Note that the contents of the results will be limited by what the website
#' provides in its feed. That is, Google does not contain more information
#' per entry than what the website originally provided. If the initial feed
#' contained only excerpts of the article, the feed from Google will also only
#' contain excerpts. Be aware, though, that sites that do provide the complete
#' contents of posts will result in potentially very large downloads.
#'
#' Posts are requested in batches of at most 1000. The root node of the first
#' batch is returned; the \code{entry} children of every later batch are
#' appended to it. Retrieval stops early as soon as a batch carries no
#' \code{continuation} child.
#'
#' @param feedURL the full URL to the RSS feed.
#' @param email the email address for the Google Reader account
#' @param passwd the password for the Google Reader account
#' @param posts the number of posts to return; a single finite number greater
#'        than or equal to 1 (fractional values are rounded up).
#' @return the root \code{XMLNode} for the RSS feed.
#' @seealso \code{\link[XML]{xmlRoot}} for the format of the returned XML tree
#' @export
#' @examples
#' \dontrun{
#' rbloggers = getRSSFeed(feedURL="http://r-bloggers.com/feed",
#'     email="USERNAME@gmail.com", passwd="PASSWORD")
#' }
#' @author Jason Bryer <\email{jason@@bryer.org}>
getRSSFeed <- function(feedURL, email, passwd, posts=1000) {
	# NOTE(review): a second definition of getRSSFeed appears later in this
	# file and replaces this one when the whole file is sourced.

	# Validate before touching the network. With posts = 0 the former
	# 1:ceiling(posts / 1000) counted down (1, 0) and requested two full batches.
	if (!is.numeric(posts) || length(posts) != 1 || !is.finite(posts) || posts < 1) {
		stop("'posts' must be a single finite number greater than or equal to 1.")
	}
	posts <- ceiling(posts)

	#Authenticate with Google
	# NOTE(review): host and peer certificate verification are switched off on
	# the handle that transmits the account password; consider enabling them.
	curlHandle <- getCurlHandle(cookiefile="rcookies", ssl.verifyhost=FALSE, ssl.verifypeer=FALSE)
	login <- postForm("https://www.google.com/accounts/ClientLogin",
		accountType="GOOGLE",
		service="reader",
		Email=email,
		Passwd=passwd,
		source="makeR",
		curl=curlHandle)

	# Locate the token by its "Auth=" prefix instead of assuming it is always
	# the third line of the response.
	responseLines <- unlist(strsplit(login, "\n"))
	authToken <- sub("^Auth=", "", grep("^Auth=", responseLines, value=TRUE))
	if (length(authToken) == 0 || !nzchar(authToken[1])) {
		stop("Authentication failed.")
	}
	google.auth <- paste0("GoogleLogin auth=", authToken[1])

	#Function to retrieve up to 1000 posts
	# n is the batch size; continuation is the value taken from the previous
	# batch's 'continuation' node (NULL for the first request).
	# NOTE(review): feedURL is inserted verbatim; feed URLs that themselves
	# contain '?' or '&' probably need URL-encoding -- confirm against the API.
	fetchBatch <- function(n, continuation=NULL) {
		requestURL <- paste0("http://www.google.com/reader/atom/feed/", feedURL, "?n=", n)
		if (!is.null(continuation)) {
			requestURL <- paste0(requestURL, "&c=", continuation)
		}
		feed <- getURL(requestURL, .encoding='UTF-8', followlocation=TRUE,
			httpheader=c("Authorization"=google.auth),
			curl=curlHandle)
		xmlRoot(xmlTreeParse(feed, asText=TRUE))
	}

	batches <- ceiling(posts / 1000)
	root <- NULL
	continueValue <- NULL
	for (i in seq_len(batches)) {
		# Every batch is full-sized except the last, which takes the remainder
		# (1000 when posts is an exact multiple of 1000).
		n <- if (i == batches) (posts - 1) %% 1000 + 1 else 1000
		batch <- fetchBatch(n, continueValue)
		if (is.null(root)) {
			root <- batch
		} else {
			# Only the 'entry' children of later batches are merged into the root.
			entries <- which(xmlSApply(batch, xmlName) == 'entry')
			if (length(entries) > 0) {
				root <- addChildren(root, kids=batch[entries])
			}
		}
		marker <- batch[['continuation']]
		if (is.null(marker)) {
			break #No more posts to retrieve
		}
		# Text content of the continuation node, passed as 'c' on the next request.
		continueValue <- unclass(xmlChildren(marker)$text)$value
	}
	root
}
	require(XML)
	require(RCurl)

	#' Retrieve the XML tree of an RSS feed through the Google Reader API.
	#'
	#' This function utilizes the (unofficial) Google Reader API to retrieve RSS
	#' feeds. The advantage of accessing RSS feeds through the Google Reader API is
	#' that you are not limited by the number of entries a website may include in
	#' its feed. That is, Google generally maintains a complete history of entries
	#' from the RSS feed.
	#'
	#' Note that the contents of the results will be limited by what the website
	#' provides in its feed. That is, Google does not contain more information
	#' per entry than what the website originally provided. If the initial feed
	#' contained only excerpts of the article, the feed from Google will also only
	#' contain excerpts. Be aware, though, that sites that do provide the complete
	#' contents of posts will result in potentially very large downloads.
	#'
	#' Posts are requested in batches of at most 1000. The root node of the first
	#' batch is returned; the \code{entry} children of every later batch are
	#' appended to it. Retrieval stops early as soon as a batch carries no
	#' \code{continuation} child.
	#'
	#' @param feedURL the full URL to the RSS feed.
	#' @param email the email address for the Google Reader account
	#' @param passwd the password for the Google Reader account
	#' @param posts the number of posts to return; a single finite number greater
	#'        than or equal to 1 (fractional values are rounded up).
	#' @return the root \code{XMLNode} for the RSS feed.
	#' @seealso \code{\link[XML]{xmlRoot}} for the format of the returned XML tree
	#' @export
	#' @examples
	#' \dontrun{
	#' rbloggers = getRSSFeed(feedURL="http://r-bloggers.com/feed",
	#'     email="USERNAME@gmail.com", passwd="PASSWORD")
	#' }
	#' @author Jason Bryer <\email{jason@@bryer.org}>
	getRSSFeed <- function(feedURL, email, passwd, posts=1000) {
		# NOTE(review): getRSSFeed is also defined earlier in this file; this
		# later definition is the one in effect after the file is sourced.

		# Validate before touching the network. With posts = 0 the former
		# 1:ceiling(posts / 1000) counted down (1, 0) and requested two full batches.
		if (!is.numeric(posts) || length(posts) != 1 || !is.finite(posts) || posts < 1) {
			stop("'posts' must be a single finite number greater than or equal to 1.")
		}
		posts <- ceiling(posts)

		#Authenticate with Google
		# NOTE(review): host and peer certificate verification are switched off on
		# the handle that transmits the account password; consider enabling them.
		curlHandle <- getCurlHandle(cookiefile="rcookies", ssl.verifyhost=FALSE, ssl.verifypeer=FALSE)
		login <- postForm("https://www.google.com/accounts/ClientLogin",
			accountType="GOOGLE",
			service="reader",
			Email=email,
			Passwd=passwd,
			source="makeR",
			curl=curlHandle)

		# Locate the token by its "Auth=" prefix instead of assuming it is always
		# the third line of the response.
		responseLines <- unlist(strsplit(login, "\n"))
		authToken <- sub("^Auth=", "", grep("^Auth=", responseLines, value=TRUE))
		if (length(authToken) == 0 || !nzchar(authToken[1])) {
			stop("Authentication failed.")
		}
		google.auth <- paste0("GoogleLogin auth=", authToken[1])

		#Function to retrieve up to 1000 posts
		# n is the batch size; continuation is the value taken from the previous
		# batch's 'continuation' node (NULL for the first request).
		# NOTE(review): feedURL is inserted verbatim; feed URLs that themselves
		# contain '?' or '&' probably need URL-encoding -- confirm against the API.
		fetchBatch <- function(n, continuation=NULL) {
			requestURL <- paste0("http://www.google.com/reader/atom/feed/", feedURL, "?n=", n)
			if (!is.null(continuation)) {
				requestURL <- paste0(requestURL, "&c=", continuation)
			}
			feed <- getURL(requestURL, .encoding='UTF-8', followlocation=TRUE,
				httpheader=c("Authorization"=google.auth),
				curl=curlHandle)
			xmlRoot(xmlTreeParse(feed, asText=TRUE))
		}

		batches <- ceiling(posts / 1000)
		root <- NULL
		continueValue <- NULL
		for (i in seq_len(batches)) {
			# Every batch is full-sized except the last, which takes the remainder
			# (1000 when posts is an exact multiple of 1000).
			n <- if (i == batches) (posts - 1) %% 1000 + 1 else 1000
			batch <- fetchBatch(n, continueValue)
			if (is.null(root)) {
				root <- batch
			} else {
				# Only the 'entry' children of later batches are merged into the root.
				entries <- which(xmlSApply(batch, xmlName) == 'entry')
				if (length(entries) > 0) {
					root <- addChildren(root, kids=batch[entries])
				}
			}
			marker <- batch[['continuation']]
			if (is.null(marker)) {
				break #No more posts to retrieve
			}
			# Text content of the continuation node, passed as 'c' on the next request.
			continueValue <- unclass(xmlChildren(marker)$text)$value
		}
		root
	}