@timelyportfolio, created May 16, 2012 14:21
require(twitteR)
require(ggplot2)
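#NOTE (addition, not in the original gist): newer versions of twitteR require
#OAuth before searchTwitter() will return anything. A minimal sketch, assuming
#an app registered with Twitter; the four values are placeholders, not real keys:
#setup_twitter_oauth(consumer_key = "YOUR_CONSUMER_KEY",
#                    consumer_secret = "YOUR_CONSUMER_SECRET",
#                    access_token = "YOUR_ACCESS_TOKEN",
#                    access_secret = "YOUR_ACCESS_SECRET")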
#get #cfa12 tweets for exploration
#the search API appears to limit n to 1500, so split the pull across date ranges
cfatweets.1 <- searchTwitter("#cfa12", n = 1500, since = "2012-05-04", until = "2012-05-07")
cfatweets.2 <- searchTwitter("#cfa12", n = 1500, since = "2012-05-07", until = "2012-05-08")
cfatweets.3 <- searchTwitter("#cfa12", n = 1500, since = "2012-05-08", until = "2012-05-15")
cfatweets.df <- rbind(
  twListToDF(cfatweets.1),
  twListToDF(cfatweets.2),
  twListToDF(cfatweets.3))
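#OPTIONAL (addition, not in the original gist): search results age out quickly,
#so caching the combined data frame lets the plots below be re-run offline.
#Plain base R; the file name is just an example:
#saveRDS(cfatweets.df, "cfa12_tweets.rds")
#cfatweets.df <- readRDS("cfa12_tweets.rds")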
tweeter <- as.data.frame(table(cfatweets.df$screenName), stringsAsFactors = FALSE)
colnames(tweeter) <- c("tweeter", "count")
#keep the 50 most prolific tweeters, ordered by tweet count
o <- order(tweeter$count, decreasing = TRUE)[1:50]
tweeter.ranked <- tweeter[o, ]
tweeter.ranked$tweeter <- reorder(x = tweeter.ranked$tweeter, X = tweeter.ranked$count)
#credit for these charts goes to
#http://isomorphismes.tumblr.com/post/20362455367/twitter
ggplot(data = tweeter.ranked,
       aes(y = count, x = tweeter, fill = tweeter)) +
  geom_bar(stat = "identity") + coord_flip() +
  geom_text(aes(y = 0, label = paste("@", tweeter, " ", count, sep = "")),
            size = 3, hjust = 0) +
  theme_bw() +
  labs(title = "Prolific #cfa12 Tweeters") +
  theme(legend.position = "none",
        axis.text.y  = element_blank(),
        axis.ticks   = element_blank(),
        axis.title.y = element_blank())
ggplot(data = tweeter.ranked,
       aes(y = count, x = tweeter, fill = tweeter)) +
  coord_polar() +
  geom_bar(stat = "identity") +
  theme_bw() +
  labs(title = "Prolific #cfa12 Tweeters") +
  theme(legend.position = "none",
        axis.title.y = element_blank())
ggplot(data = cfatweets.df,
       aes(x = created, fill = format(created, "%Y-%m-%d"))) +
  geom_density() + theme_bw() +
  labs(title = "#cfa12 Tweets by Time") +
  theme(legend.position = "none",
        axis.title.y = element_blank())
######################## word cloud ########################
#all credit goes to http://blog.ouseful.info/2012/02/15/generating-twitter-wordclouds-in-r-prompted-by-an-open-learning-blogpost/
#thanks for the very fine example
#pull the tweet text out of the tweet data frame
tw.df <- cfatweets.df$text
##Note: there are some handy, basic Twitter related functions here:
##https://github.com/matteoredaelli/twitter-r-utils
#For example:
RemoveAtPeople <- function(tweet) {
  gsub("@\\w+", "", tweet)
}
RemoveHash <- function(tweet) {
  gsub("#\\w+", "", tweet)
}
#then, for example, strip @'d names and hashtags from the tweet text
tweets <- as.vector(RemoveAtPeople(tw.df))
tweets <- as.vector(RemoveHash(tweets))
##Wordcloud - scripts available from various sources; I used:
#http://rdatamining.wordpress.com/2011/11/09/using-text-mining-to-find-out-what-rdatamining-tweets-are-about/
#load the text mining library (install.packages("tm") first if needed)
require(tm)
#call with e.g.: twCorpus <- generateCorpus(tweets)
generateCorpus <- function(df, my.stopwords = c()) {
  #the following is cribbed and seems to do what it says on the can
  tw.corpus <- Corpus(VectorSource(df))
  #remove punctuation
  tw.corpus <- tm_map(tw.corpus, removePunctuation)
  #normalise case (newer tm versions want base functions wrapped in content_transformer())
  tw.corpus <- tm_map(tw.corpus, content_transformer(tolower))
  #remove stopwords
  tw.corpus <- tm_map(tw.corpus, removeWords, stopwords("english"))
  tw.corpus <- tm_map(tw.corpus, removeWords, my.stopwords)
  tw.corpus
}
twCorpus <- generateCorpus(tweets)
tdm <- TermDocumentMatrix(twCorpus)
m <- as.matrix(tdm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
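#quick sanity check (addition, not in the original): glance at the most frequent terms
head(d, 10)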
require(wordcloud)
#wordcloud(d$word, d$freq, min.freq = 5)
##### with colors #####
if (require(RColorBrewer)) {
  pal <- brewer.pal(9, "BuGn")
  pal <- pal[-(1:4)]
  wordcloud(d$word, d$freq, scale = c(2, .5), min.freq = 10,
            random.order = FALSE, rot.per = .1, colors = pal)
}