Created
January 13, 2012 14:46
-
-
Save jbryer/1606595 to your computer and use it in GitHub Desktop.
Function to return an RSS feed using the (unofficial) Google Reader API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the packages this file relies on. library() is used instead of
# require() so that a missing package stops the script immediately, rather
# than returning FALSE and failing later with an unrelated error.
library(XML)
library(RCurl)
#' Retrieve an RSS feed through the (unofficial) Google Reader API.
#'
#' This function returns an XML tree of the RSS feed from the given URL. It
#' authenticates against Google's ClientLogin endpoint and then downloads the
#' feed from the Google Reader API, requesting at most 1000 entries per call
#' and following the continuation token until \code{posts} entries have been
#' requested or no further page is reported.
#'
#' The advantage of accessing RSS feeds through the Google Reader API is that
#' you are not limited by the number of entries a website includes in its
#' feed. That is, Google generally maintains a complete history of entries
#' from the RSS feed.
#'
#' Note that the contents of the results will be limited by what the website
#' provides in its feed. That is, Google does not contain more information
#' per entry than what the website originally provided. If the initial feed
#' contained only excerpts of the article, the feed from Google will also only
#' contain excerpts. Be aware that sites which provide the complete contents
#' of their posts can result in potentially very large downloads.
#'
#' @param feedURL the full URL to the RSS feed. It is percent-encoded before
#'        being placed in the request unless it already looks encoded.
#' @param email the email address for the Google Reader account
#' @param passwd the password for the Google Reader account
#' @param posts the number of posts to return; a single number greater than or
#'        equal to 1. \code{Inf} keeps requesting pages until the API reports
#'        no continuation.
#' @return the root \code{XMLNode} for the RSS feed.
#' @seealso \code{\link[XML]{xmlRoot}} for the format of the returned XML tree
#' @export
#' @examples
#' \dontrun{
#' rbloggers <- getRSSFeed(feedURL = "http://r-bloggers.com/feed",
#'                         email = "USERNAME@gmail.com", passwd = "PASSWORD")
#' }
#' @author Jason Bryer <\email{jason@@bryer.org}>
getRSSFeed <- function(feedURL, email, passwd, posts = 1000) {
  # Reject values for which no sensible number of requests exists. The
  # previous 1:ceiling(posts / 1000) loop silently issued two full requests
  # when posts was 0.
  if (!is.numeric(posts) || length(posts) != 1L || is.na(posts) || posts < 1) {
    stop("'posts' must be a single number greater than or equal to 1.")
  }

  # Largest number of entries requested in a single call.
  pageSize <- 1000

  # Authenticate with Google.
  # NOTE(review): TLS host/peer verification is switched off while the account
  # password is transmitted. Kept as in the original because enabling it may
  # fail on installations without a CA bundle -- confirm and enable if possible.
  curlHandle <- getCurlHandle(cookiefile = "rcookies", ssl.verifyhost = FALSE,
                              ssl.verifypeer = FALSE)
  loginResponse <- postForm("https://www.google.com/accounts/ClientLogin",
                            accountType = "GOOGLE",
                            service = "reader",
                            Email = email,
                            Passwd = passwd,
                            source = "makeR",
                            curl = curlHandle)

  # Look for the "Auth=" line wherever it appears in the response instead of
  # assuming it is always the third line.
  responseLines <- unlist(strsplit(loginResponse, "\n"))
  authLines <- grep("^Auth=", responseLines, value = TRUE)
  authToken <- if (length(authLines) > 0) sub("^Auth=", "", authLines[1]) else ""
  if (!nzchar(authToken)) {
    stop("Authentication failed.")
  }
  google.auth <- paste0("GoogleLogin auth=", authToken)

  # Percent-encode the feed URL so that a '?' or '&' inside it is not read as
  # part of the Reader request's own query string. repeated = FALSE leaves a
  # URL that already contains %XX escapes untouched.
  encodedFeed <- utils::URLencode(feedURL, reserved = TRUE, repeated = FALSE)

  # Retrieve one page of up to `n` posts, optionally resuming from a
  # continuation token returned by the previous page.
  getDoc <- function(n, continuation = NULL) {
    requestURL <- paste0("http://www.google.com/reader/atom/feed/",
                         encodedFeed, "?n=", n,
                         if (is.null(continuation)) "" else paste0("&c=", continuation))
    feed <- getURL(requestURL, .encoding = "UTF-8", followlocation = TRUE,
                   httpheader = c("Authorization" = google.auth),
                   curl = curlHandle)
    xmlRoot(xmlTreeParse(feed, asText = TRUE))
  }

  root <- NULL
  continueValue <- NULL
  remaining <- ceiling(posts)
  while (remaining > 0) {
    n <- min(remaining, pageSize)
    page <- getDoc(n = n, continuation = continueValue)
    remaining <- remaining - n

    if (is.null(root)) {
      # The first page supplies the feed-level elements of the result.
      root <- page
    } else {
      # Later pages only contribute their <entry> children.
      entries <- which(xmlSApply(page, xmlName) == "entry")
      if (length(entries) > 0) {
        root <- addChildren(root, kids = page[entries])
      }
    }

    continuationNode <- page[["continuation"]]
    if (is.null(continuationNode)) {
      break # No more posts to retrieve
    }
    continueValue <- xmlValue(continuationNode)
    if (length(continueValue) != 1L || is.na(continueValue) ||
        !nzchar(continueValue)) {
      # An empty token cannot be resumed from; stop instead of re-requesting
      # the first page.
      break
    }
  }
  root
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment