public
Last active

Retrieving and Analyzing R-Bloggers using the Google Reader API

  • Download Gist
RBloggers.R
R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
# Load the Google Reader helper code from a gist at a fixed revision.
# getRSSFeed() is presumably defined by this sourced file -- confirm there.
source("https://raw.github.com/gist/1606595/269d61dfcc7930f5275a212e11f3c43771ab2591/GoogleReader.R")

# Request up to 5000 posts of the R-Bloggers feed. Replace the two placeholder
# strings with real Google Reader credentials before running.
rbloggers <- getRSSFeed(
  feedURL = "http://r-bloggers.com/feed",
  email = "GOOGLE READER EMAIL",
  passwd = "GOOGLE READER PASSWORD",
  posts = 5000
)

# Keep only the children named "entry", report how many were retrieved, and
# store the complete feed on disk as rbloggers.xml.
entries <- rbloggers[which(names(rbloggers) == "entry")]
length(entries)
saveXML(rbloggers, file = "rbloggers.xml")
 
# Build a data frame with some of the information from the RSS feed: one row
# per entry holding its title, author name, link and publication date.
# posts.categories is a parallel list; element i holds the category terms of
# entry i (an empty list when the entry has none).
#
# Everything is pre-allocated to length(entries) and the loop uses
# seq_along(), so an empty feed produces a zero-row data frame instead of
# iterating over 1:0 and failing on entries[[1]].
posts <- data.frame(
  title = rep(NA_character_, length(entries)),
  author = rep(NA_character_, length(entries)),
  link = rep(NA_character_, length(entries)),
  stringsAsFactors = FALSE
)
posts$published <- rep(as.Date(NA), length(entries))
posts.categories <- vector("list", length(entries))

for (i in seq_along(entries)) {
  entry <- entries[[i]]

  # Assign into the column rather than through posts[i, ]$..., which copies
  # and rewrites a whole row for every single field.
  posts$title[i] <- unclass(xmlChildren(entry[["title"]])$text)$value
  posts$author[i] <-
    unclass(xmlChildren(entry[["author"]][["name"]])$text)$value
  posts$link[i] <- xmlAttrs(entry[["link"]])[["href"]]

  # Only the first 10 characters of the published value are used; they are
  # parsed by as.Date() (assumes the value starts with YYYY-MM-DD).
  published_text <- unclass(xmlChildren(entry[["published"]])$text)$value
  posts$published[i] <- as.Date(substr(published_text, 1, 10))

  # Ignore the first category as it is used for Google Reader. With zero or
  # one category the index vector is empty and the result is an empty list.
  categories <- entry[which(names(entry) == "category")]
  posts.categories[[i]] <- lapply(
    seq_along(categories)[-1],
    function(j) xmlAttrs(categories[[j]])[["term"]]
  )
}
 
 
# Visualize the number of posts per day with Paul Bleicher's calendarHeat
# function, loaded from a remote copy of calendarHeat.R.
source("https://raw.github.com/tavisrudd/r_users_group_1/master/calendarHeat.R")

# Tabulate posts by publication date. as.data.frame() on the table yields the
# columns Var1 (the date labels) and Freq (the counts); Var1 is converted back
# to Date before plotting.
cal <- as.data.frame(table(posts$published))
cal$Var1 <- as.Date(cal$Var1)
calendarHeat(
  cal$Var1,
  cal$Freq,
  color = "r2b",
  varname = "Number of Posts on R-Bloggers.com"
)
 
# Create a word cloud from the category terms of all posts.
# library() stops with an error when wordcloud is not installed, whereas
# require() would only return FALSE and let the script fail later.
library(wordcloud)

# Flatten the per-post category lists, split multi-word categories on single
# spaces, and count how often each word occurs (columns: ctab, Freq).
ctab <- unlist(posts.categories)
ctab <- unlist(strsplit(ctab, " ", fixed = TRUE))
ctab <- as.data.frame(table(ctab))

# Drop the "Uncategorized" placeholder with a logical mask. Negative indexing
# via -which(...) would remove every row when the word is absent, because
# which() then returns integer(0) and x[-integer(0), ] selects nothing.
ctab <- ctab[ctab$ctab != "Uncategorized", ]

# Only words occurring at least 10 times are drawn.
wordcloud(ctab$ctab, ctab$Freq, min.freq = 10)

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.