A basic popularity analysis of tweets related to a list of tags provided via a text file. Using the Twitter API, it grabs 500 tweets per tag and compares the distribution of 'delay' times (the time between consecutive tweets containing a tag) as a proxy for the popularity of each tag.
# Basic Twitter Analysis
# analyze and compare tweets for a defined set of hash and @ tags
# base code from "Data Science with R", a book for an introductory course from Syracuse, as per http://blog.revolutionanalytics.com/2013/02/free-e-book-on-data-science-with-r.html
# load the required packages:
require("bitops")
require("RCurl")
require("RJSONIO")
require("twitteR")
require("ROAuth")
require("stringr") # for string manipulation
require("plyr") # data frame manipulation
require("ggplot2") # plotting
require("tm") # for text mining
require("wordcloud") # for word cloud visual
############################
# Task 1 : handshake and register with Twitter in order to access the API
# - you must have a registered API key in order to use the API, contained within an RData file - see code for notes
############################
#create the credential variable
credential <- NULL
# check to see if there is a credential file present
if(file.exists("credential.RData"))
{
load(file = "credential.RData")
} else {
if(file.exists("mykey.Rdata"))
{
load(file = "mykey.RData")
# get the credentials etc
credential <- OAuthFactory$new(consumerKey=my.key,
consumerSecret="<YOUR secret goes here>",
requestURL="https://api.twitter.com/oauth/request_token",
accessURL="https://api.twitter.com/oauth/access_token",
authURL="https://api.twitter.com/oauth/authorize")
# handshake with Twitter in order to get the necessary confirmation pin
# THIS WILL LIKELY REQUIRE MANUAL URL ENTRY INTO A BROWSER - RStudio, for example, will not allow copy and paste of the returned URL from the console
credential$handshake()
save(credential,file = "credential.RData")
} else {
stop("In order to run this program, you need to have an RData file called mykey.RData containing a single variable called my.key, containing, your twitterAPI key")
}
}
# Now that we have the credential, we register it for this session
if(registerTwitterOAuth(credential))
{
cat("Credential has been registered","\n")
} else {
stop("There is an issue with the credential file, I suggest you delete it and rerun the script in order to re-handshake")
}
############################
# Task 2 : Access the API and download the raw data
############################
# initiate the tags
my.tags <- NULL
# read the tags from a CSV file with two columns, but no column header
if(file.exists("twitter_tags.txt"))
{
my.tags <- read.csv("twitter_tags.txt", header = FALSE, stringsAsFactors = FALSE, col.names = c("group","tag"))
} else {
stop("Cannot find the file twitter_tags.txt in working directory. This file needs to contain two or more lines in csv format of grouping,tag - no column headers - note: the grouping label should be the owner of the twitter account, such as a company, that you are wanting to group by and compare, but without the @")
}
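# an example twitter_tags.txt (hypothetical groups and tags) might contain:
# Sidecar,#sidecar
# Sidecar,@SideCar
# Lyft,#lyft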
# this function returns a data frame for each tweet search term requested
# a rolling 14-day window ending yesterday (i.e. 14 complete days) can be enabled via the commented-out since/until arguments below
TweetFrame <- function(search.term, max.tweets){
twt.list <- searchTwitter(search.term, n = max.tweets)#, since=as.character(Sys.Date() - 14), until=as.character(Sys.Date() - 1)) # get the list
a <- do.call("rbind",lapply(twt.list, as.data.frame)) # convert to a data frame
a <- a[order(as.integer(a$created)),] # order in increasing time stamp
a.delay <- as.integer(diff(a$created)) # the delay in seconds between consecutive tweets
a.delay <- c(NA, a.delay) # add an NA record, as the first tweet has no preceding delay
a$delay <- a.delay # add the delay column to the data frame
return(cbind(search.term, a))
}
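# illustrative single call (hypothetical search term), returning a data frame with a delay column in seconds:
# one.tag <- TweetFrame("#rstats", 100)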
# go and fetch 500 tweets for each of the required tags, returning a list of data frames
the.tweets <- apply(my.tags,1,(function(x) TweetFrame(x[2],500)))
# Convert this list into a single data frame
the.tweets <- do.call("rbind",the.tweets)
# rename the first column
colnames(the.tweets)[1] <- "tag"
# merge with my.tags in order to add the grouping column
the.tweets <- merge(my.tags, the.tweets, by = "tag")
##########################
# Task 3 : Add additional columns: how long? how many words? is it a retweet? how many retweets? was it tweeted by the owner? was it a retweet of the owner's tweet? does it contain any tags? how many and which other tags from the list are included?
#########################
# length
the.tweets$length <- str_length(the.tweets$text)
# number of words, after collapsing repeated spaces - #words = #spaces + 1: "dog ran fast" = 3 words, 2 spaces
the.tweets$wordCount <- str_count(str_replace_all(the.tweets$text," {2,}"," ")," ") + 1
# is it a retweet and if so how many times?
the.tweets$rtOf <- str_match(the.tweets$text,"RT @[a-zA-Z0-9_]*:") # find the retweeted screen name
the.tweets$rtOf <- str_replace(str_replace(the.tweets$rtOf,"RT @",""),":","") # trim the leading "RT @" and trailing ":"
the.tweets$rtOfOwner <- toupper(the.tweets$rtOf) == toupper(the.tweets$group)
# the problem with this step below is that it only counts exact matches of the retweeter being equal to the group, e.g. Sidecar = Sidecar
# it does not capture sub-twitter accounts, such that SidecarCHI != Sidecar (the CHI indicating it is the Chicago office of Sidecar)
# so here we UNDERCOUNT, but we also avoid over-counting false twitter accounts such as 'SidecarSucks' or other negative accounts as if they were the vendor
the.tweets$rtByOwner <- the.tweets$isRetweet & (toupper(the.tweets$screenName) == toupper(the.tweets$group)) # vectorized & rather than scalar &&
# add one column for each tag in the search list, flagging its inclusion in a specific tweet
# the # and @ symbols are replaced with 'hash' and 'at' to form valid column names
tags.in.tweets <- as.data.frame(apply(my.tags,1,(function(x) !(is.na(str_match(the.tweets$text,x[2]))))))
# assign column names
colnames(tags.in.tweets) <- str_replace(str_replace(my.tags[,2],"#","hash"),"@","at")
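# e.g. with hypothetical tags, "#sidecar" yields the column name "hashsidecar" and "@SideCar" yields "atSideCar"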
# combine the two data frames
the.tweets <- cbind(the.tweets,tags.in.tweets)
# we no longer need the separate data frame of tags in tweets, the tags, or the credential variable
rm("tags.in.tweets","my.tags","credential")
##### should I remove the retweets before counting the words?
##########################
# Task 4 : Analyze the data
# which tag has the shortest delay between tweets
# what is the relative popularity of the tags
# who has the most retweets
# who has more public versus host tweets
# what are the popular groupings of tags
#########################
# quickly grab the means of each tag
the.tweets.means <- ddply(the.tweets, ~group+tag, numcolwise(mean,na.rm=TRUE))
# add on the number of tweets for each tag in the data set
the.tweets.means <- merge(the.tweets.means, ddply(the.tweets, ~group+tag, nrow), by=c("group","tag"))
colnames(the.tweets.means) <- c("group","tag","favoriteCount","retweetCount","delay","length", "wordCount","tweetCount")
# But how likely is it that the values we see in this data are reasonable, so that we can be confident one tag is more popular than the rest? A Poisson test (PT) can tell us:
# 1 - PT each tag to look at the confidence interval - the narrower, the better
single.pt <- ddply(the.tweets, ~group+tag, function(x) {
a <- poisson.test(sum(x$delay <= mean(x$delay, na.rm=TRUE), na.rm=TRUE), nrow(x))
# the proportion of delays below the mean, then the lower and upper bounds of the confidence interval
b <- c(a["estimate"][[1]][[1]],a["conf.int"][[1]][[1]],a["conf.int"][[1]][[2]])
})
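# illustrative one-sample test (hypothetical counts): 300 of 500 delays below the mean gives
# an estimated rate of 0.6 with a 95% confidence interval of roughly 0.53 to 0.67
# poisson.test(300, 500)$estimate # 0.6
# poisson.test(300, 500)$conf.int # approx. 0.534 to 0.672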
# merge and rename columns
the.tweets.means <- merge(the.tweets.means, single.pt, by=c("group","tag"))
colnames(the.tweets.means) <- c("group","tag","favoriteCount","retweetCount","delay","length", "wordCount","tweetCount","estimate","lowerConfidence","upperConfidence")
# cap the upper confidence bound at 1 (100%)
the.tweets.means$upperConfidence[the.tweets.means$upperConfidence > 1] <- 1
# 2 - Benchmark all tags against the one with the smallest mean delay, in terms of % of tweets quicker than the mean, then use the PT to check the confidence intervals. If a confidence interval overlaps with that of the 'quickest' tag, then there is a chance the quickest was not actually the quickest
multiple.pt <- ddply(the.tweets, ~group+tag, function(x) {
a.mean <- mean(x$delay, na.rm=TRUE) # the mean delay of the tag
a.num.delays <- sum(x$delay <= a.mean, na.rm=TRUE) # number of delays less than the mean of the tag
a.nrow <- nrow(x) # the number of rows of the tag
# now let's compare every tag against these values
a.compare <- ddply(the.tweets, ~group+tag, function(y, b.mean=a.mean, b.num.delays = a.num.delays, b.row=a.nrow) {
b.test <- poisson.test(c(b.num.delays, # the tag being used as the baseline, from the outer ddply
sum(y$delay <= b.mean, na.rm=TRUE)), # number of delays of the tag being analyzed quicker than the delay of the tag from the outer ddply
c(b.row,nrow(y))) # nrow for the tag from the outer ddply and the inner ddply
b.result <- c(x$tag[1],b.test["estimate"][[1]][[1]],b.test["conf.int"][[1]][[1]],b.test["conf.int"][[1]][[2]])
})
})
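# illustrative two-sample comparison (hypothetical counts): if the baseline tag had 300 of 500
# delays below its mean and another tag only 240 of 500, the estimated rate ratio is
# (300/500)/(240/500) = 1.25, i.e. the baseline tag looks about 1.25x quicker
# poisson.test(c(300, 240), c(500, 500))$estimate # 1.25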
# rename the columns
colnames(multiple.pt) <- c("group","tag","compareTag","compareEst","compareLowerConf","compareUpperConf")
# the comparison columns come back as character (c() coerces the mixed tag/number vector), so convert them back to numeric
multiple.pt$compareEst <- as.numeric(multiple.pt$compareEst)
multiple.pt$compareLowerConf <- as.numeric(multiple.pt$compareLowerConf)
multiple.pt$compareUpperConf <- as.numeric(multiple.pt$compareUpperConf)
# this gets the comparisons of all tags against all others, but all we really want is the comparison of every tag against the tag with the lowest mean delay. So, pull those records out of the data frame and add the columns to the means data frame
the.tweets.means <- merge(the.tweets.means,
# pull out the records from the multiple.pt results, where the compare tag is that with the lowest delay
multiple.pt[multiple.pt$compareTag == the.tweets.means$tag[the.tweets.means$delay == min(the.tweets.means$delay)],],
by=c("group","tag"))
# reorder the data frame for plotting by ascending mean delay
the.tweets.means <- the.tweets.means[with(the.tweets.means, order(delay)),]
# reorder the factor for tag to reflect the ascending mean delay
the.tweets.means$tag <- factor(the.tweets.means$tag, as.character(the.tweets.means[order(the.tweets.means$delay),]$tag),ordered = TRUE)
# plot histogram of tweet delays and the mean delay
hist.mean.plot <- ggplot(the.tweets.means) + # base of plot, using means to ensure ascending mean ordering
geom_vline(aes(xintercept=delay), color="red") + # mean delay for each tag (the first tweet of each tag has NA delay and is excluded)
geom_histogram(data=the.tweets, aes(x=delay), binwidth=60, colour="black", fill="white") + # main histogram with 60-second bins
facet_wrap(~tag+group, ncol=1) + # layout by tag and then group - note the sorting is done in the order the facet is specified
scale_x_continuous(limits=c(0,3600),name="Delay in seconds (1 hour limit)") + # limit histogram to 1 hour
scale_y_continuous(name="# Tweets") + # relabel the Y
ggtitle(expression(atop("Histogram of delays between tweets", atop(italic("Red bar indicates the mean delay for each tag"), ""))))
print(hist.mean.plot)
# plot the % of tweets arriving before the mean delay, and the confidence interval based on an ideal poisson distro
conf.int.plot <- ggplot(the.tweets.means, aes(x=tag)) +
geom_point(aes(y=(estimate*100)), size=3) +
geom_errorbar(aes(ymin=(lowerConfidence*100), ymax=(upperConfidence*100)), width=.1) +
scale_y_continuous(limits=c(0,100), name="% Tweets quicker than the mean") +
scale_x_discrete("Tag") +
ggtitle(expression(atop("% of tweets quicker than the mean delay", atop(italic("95% confidence interval"), ""))))
print(conf.int.plot)
# plot the relative speeds of the tags compared to the tag with the smallest mean delay, with confidence intervals
upperSpeedLimit <- the.tweets.means$compareUpperConf[the.tweets.means$tag == the.tweets.means$compareTag] # find the upper confidence limit of the fastest tag
rel.slowness.plot <- ggplot(the.tweets.means, aes(x=tag)) + # using the means data
geom_point(aes(y=compareEst), size=3) + # plot the estimated N times slower
geom_errorbar(aes(ymin=compareLowerConf, ymax=compareUpperConf), width=.1) + # plot the confidence interval
geom_hline(aes(yintercept=upperSpeedLimit), color="red", linetype=2) + # reference line at the fastest tag's upper confidence bound
scale_y_continuous(name="N times") + # label Y
scale_x_discrete("Tag") + # label X
ggtitle(expression(atop("How slow are the tags compared to the fastest?", atop(italic("95% confidence interval"), ""))))
print(rel.slowness.plot)
#######################
# Task 5 - Text analysis of tweets
#######################
# this function removes all URLs, retweet labels, hashtags and user names, leaving plain words only
TweetsToCleanedWordFrequency <- function(tweets)
{
# first, let's clean all the tweets up
tweets$text <- str_replace_all(tweets$text,"https?:[./a-zA-Z0-9]*","") # remove http and https URLs, assuming alphanumeric link shortening
tweets$text <- str_replace_all(tweets$text,"RT @[a-zA-Z0-9]*","") # remove all retweet labels
tweets$text <- str_replace_all(tweets$text,"MT @[a-zA-Z0-9]*","") # remove all modified tweet labels
tweets$text <- str_replace_all(tweets$text,"#[a-zA-Z0-9]*","") # remove all hashtags
tweets$text <- str_replace_all(tweets$text,"@[a-zA-Z0-9]*","") # remove all screen names
tweets$text <- str_replace_all(tweets$text,"[^[:alpha:] ]", "") # keep only alpha characters and spaces - removes all non-western and strange unicode characters
tweets$text <- str_replace_all(tweets$text," {2,}"," ") # replace all multiple spaces with a single space
# now let's use the tm package to do some text mining
tweet.corpus <- Corpus(VectorSource(tweets$text)) # convert to corpus object
tweet.corpus <- tm_map(tweet.corpus, tolower) # force everything to lower case first
tweet.corpus <- tm_map(tweet.corpus, removeWords, stopwords('english')) # then remove stop words such as "the" "a" "at" etc (the stop word list is lower case)
# in many cases, words need to be stemmed to retrieve their radicals: "example" and "examples" both stem to "exampl". Completing the stems back to an original form makes the words look "normal" again, consolidating the variant forms of a word (thank + thanks + thanked etc)
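# illustrative example (hypothetical words): stemDocument reduces "thanks" and "thanked" to the
# stem "thank"; stemCompletion then looks the stem up in the pre-stemming dictionary corpus
# and, by default, restores the most frequent matching original form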
tweet.corpus.dict <- tweet.corpus
tweet.corpus <- tm_map(tweet.corpus, stemDocument)
tweet.corpus <- tm_map(tweet.corpus, stemCompletion, dictionary=tweet.corpus.dict) #, mc.cores=4)
# increasing the cores used for stem completion can speed things up - in my example case, from 3:37 down to 2:43
# we now have a bag of words; time to create a term-document matrix
tweet.tdm <- TermDocumentMatrix(tweet.corpus)
td.matrix <- as.matrix(tweet.tdm) # convert to matrix
td.matrix <- sort(rowSums(td.matrix), decreasing=TRUE) # sort the matrix
word.freq.frame <- data.frame(word=names(td.matrix),freq=td.matrix) # convert to a data frame for visualization
return(word.freq.frame)
}
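# illustrative cleaning (hypothetical tweet): "RT @acme: Great ride! http://t.co/abc #rides"
# reduces to "Great ride" before stop word removal and stemming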
# generate a separate word frequency count for each tag
word.freq.frame <- ddply(the.tweets[,1:3], ~group+tag, TweetsToCleanedWordFrequency)
# to plot multiple wordclouds on a single page, lay them out in a grid via par(mfrow)
pal <- brewer.pal(9,"GnBu")[-(1:2)] # generate a color palette, but lose the palest shades
old.par <- par(no.readonly=TRUE) # store current params before altering
par(mfrow = c(ceiling(nrow(the.tweets.means)/3),3), # set to 3 columns wide and however many rows are needed
oma=c(0, 0, 2, 0)) # ensure there is room at the top for an overall title
d_ply(word.freq.frame, ~group+tag, function(x){
wordcloud(x$word, x$freq, max.words=50, colors=pal)
title(str_c(x$group[1]," - ",x$tag[1]))
})
par(font.main=2, ps=30) # set font style for main title only
title(main="Twitter Word Clouds", outer = TRUE)
par(old.par) # reset the params
ghost commented Oct 28, 2013

I'm getting the following error:

the.tweets <- apply(my.tags,1,(function(x) TweetFrame(x[2],10)))
Error: Gone
Any ideas?

Thanks
