# jbryer/RBloggers.R

## RBloggers.R
# Pull in the Google Reader helper script from its gist; getRSSFeed(), used
# below, is presumably defined there -- TODO confirm.
source("https://raw.github.com/gist/1606595/269d61dfcc7930f5275a212e11f3c43771ab2591/GoogleReader.R")

# Request the R-Bloggers feed. The email and passwd strings are placeholders
# that have to be replaced with real Google Reader credentials.
rbloggers <- getRSSFeed(
  feedURL = "http://r-bloggers.com/feed",
  email = "GOOGLE READER EMAIL",
  passwd = "GOOGLE READER PASSWORD",
  posts = 5000
)

# Keep only the children of the result that are named "entry".
entries <- rbloggers[which(names(rbloggers) %in% "entry")]
length(entries)

# Write the complete feed to rbloggers.xml (relative to the working directory).
saveXML(rbloggers, file = "rbloggers.xml")

# Build a data frame with some of the information from the RSS feed: one row
# per feed entry holding its title, author, link and publication date. The
# category terms of each entry are kept in the parallel list posts.categories.
#
# Rows are preallocated from length(entries) and the loops use seq_along(),
# so an empty feed yields a zero-row data frame instead of iterating over the
# c(1, 0) index that 1:length(entries) would produce.

# Value of a node's `text` child, or NA when the node or its text is absent
# (a NULL cannot be assigned into a data frame cell).
nodeText <- function(node) {
  if (is.null(node)) {
    return(NA_character_)
  }
  value <- unclass(xmlChildren(node)$text)$value
  if (is.null(value)) NA_character_ else value
}

posts <- data.frame(title = rep(NA_character_, length(entries)),
                    author = rep(NA_character_, length(entries)),
                    link = rep(NA_character_, length(entries)),
                    published = rep(as.Date(NA), length(entries)),
                    stringsAsFactors = FALSE)
posts.categories <- vector("list", length(entries))
for (i in seq_along(entries)) {
  entry <- entries[[i]]
  posts$title[i] <- nodeText(entry[['title']])
  posts$author[i] <- nodeText(entry[['author']][['name']])
  posts$link[i] <- xmlAttrs(entry[['link']])[['href']]
  # Only the first 10 characters of the published text are parsed as the date.
  posts$published[i] <- as.Date(substr(nodeText(entry[['published']]), 1, 10))
  categories <- entry[which(names(entry) == "category")]
  # Ignore the first category as it is used for Google Reader; entries with
  # at most one category end up with an empty list.
  posts.categories[[i]] <- lapply(seq_along(categories)[-1], function(j) {
    xmlAttrs(categories[[j]])[['term']]
  })
}


# Visualize the number of posts per day with Paul Bleicher's calendarHeat
# function, loaded here from a remote script.
source("https://raw.github.com/tavisrudd/r_users_group_1/master/calendarHeat.R")

# Tabulate posts per publication date; table() yields the dates as labels in
# Var1, so they are converted back to Date before plotting.
cal <- as.data.frame(table(posts$published), stringsAsFactors = FALSE)
cal[["Var1"]] <- as.Date(cal[["Var1"]])
calendarHeat(
  cal[["Var1"]],
  cal[["Freq"]],
  color = "r2b",
  varname = "Number of Posts on R-Bloggers.com"
)

# Create a word cloud from the category terms of all posts.
# library() is used instead of require() so that a missing wordcloud package
# stops the script here rather than at the wordcloud() call.
library(wordcloud)

# Flatten the per-post category lists, split multi-word terms on spaces and
# count how often each word occurs.
ctab <- unlist(posts.categories)
ctab <- unlist(strsplit(ctab, ' '))
ctab <- as.data.frame(table(ctab))

# Drop the 'Uncategorized' word with a logical filter. The previous
# ctab[-which(...), ] form removed every row when no such word was present,
# because -integer(0) selects nothing.
ctab <- ctab[ctab$ctab != 'Uncategorized', ]
wordcloud(ctab$ctab, ctab$Freq, min.freq = 10)
	# NOTE(review): from this point the file repeats the same pipeline as its
	# first half, so sourcing the whole file fetches and plots everything
	# twice -- presumably a paste artifact; confirm and remove.
	#
	# Pull in the Google Reader helper script from its gist; getRSSFeed(), used
	# below, is presumably defined there -- TODO confirm.
	source("https://raw.github.com/gist/1606595/269d61dfcc7930f5275a212e11f3c43771ab2591/GoogleReader.R")

	# Request the R-Bloggers feed. The email and passwd strings are placeholders
	# that have to be replaced with real Google Reader credentials.
	rbloggers <- getRSSFeed(
	  feedURL = "http://r-bloggers.com/feed",
	  email = "GOOGLE READER EMAIL",
	  passwd = "GOOGLE READER PASSWORD",
	  posts = 5000
	)

	# Keep only the children of the result that are named "entry".
	entries <- rbloggers[which(names(rbloggers) %in% "entry")]
	length(entries)

	# Write the complete feed to rbloggers.xml (relative to the working directory).
	saveXML(rbloggers, file = "rbloggers.xml")

	# Build a data frame with some of the information from the RSS feed: one row
	# per feed entry holding its title, author, link and publication date. The
	# category terms of each entry are kept in the parallel list posts.categories.
	#
	# Rows are preallocated from length(entries) and the loops use seq_along(),
	# so an empty feed yields a zero-row data frame instead of iterating over the
	# c(1, 0) index that 1:length(entries) would produce.

	# Value of a node's `text` child, or NA when the node or its text is absent
	# (a NULL cannot be assigned into a data frame cell).
	nodeText <- function(node) {
	  if (is.null(node)) {
	    return(NA_character_)
	  }
	  value <- unclass(xmlChildren(node)$text)$value
	  if (is.null(value)) NA_character_ else value
	}

	posts <- data.frame(title = rep(NA_character_, length(entries)),
	                    author = rep(NA_character_, length(entries)),
	                    link = rep(NA_character_, length(entries)),
	                    published = rep(as.Date(NA), length(entries)),
	                    stringsAsFactors = FALSE)
	posts.categories <- vector("list", length(entries))
	for (i in seq_along(entries)) {
	  entry <- entries[[i]]
	  posts$title[i] <- nodeText(entry[['title']])
	  posts$author[i] <- nodeText(entry[['author']][['name']])
	  posts$link[i] <- xmlAttrs(entry[['link']])[['href']]
	  # Only the first 10 characters of the published text are parsed as the date.
	  posts$published[i] <- as.Date(substr(nodeText(entry[['published']]), 1, 10))
	  categories <- entry[which(names(entry) == "category")]
	  # Ignore the first category as it is used for Google Reader; entries with
	  # at most one category end up with an empty list.
	  posts.categories[[i]] <- lapply(seq_along(categories)[-1], function(j) {
	    xmlAttrs(categories[[j]])[['term']]
	  })
	}


	# Visualize the number of posts per day with Paul Bleicher's calendarHeat
	# function, loaded here from a remote script.
	source("https://raw.github.com/tavisrudd/r_users_group_1/master/calendarHeat.R")

	# Tabulate posts per publication date; table() yields the dates as labels in
	# Var1, so they are converted back to Date before plotting.
	cal <- as.data.frame(table(posts$published), stringsAsFactors = FALSE)
	cal[["Var1"]] <- as.Date(cal[["Var1"]])
	calendarHeat(
	  cal[["Var1"]],
	  cal[["Freq"]],
	  color = "r2b",
	  varname = "Number of Posts on R-Bloggers.com"
	)

	# Create a word cloud from the category terms of all posts.
	# library() is used instead of require() so that a missing wordcloud package
	# stops the script here rather than at the wordcloud() call.
	library(wordcloud)

	# Flatten the per-post category lists, split multi-word terms on spaces and
	# count how often each word occurs.
	ctab <- unlist(posts.categories)
	ctab <- unlist(strsplit(ctab, ' '))
	ctab <- as.data.frame(table(ctab))

	# Drop the 'Uncategorized' word with a logical filter. The previous
	# ctab[-which(...), ] form removed every row when no such word was present,
	# because -integer(0) selects nothing.
	ctab <- ctab[ctab$ctab != 'Uncategorized', ]
	wordcloud(ctab$ctab, ctab$Freq, min.freq = 10)