# bryangoodrich/TwitterTopics.r

## TwitterTopics.r
# Twitter Topic Modeling Using R
# Author: Bryan Goodrich
# Date Created: February 13, 2015
# Last Modified: April 3, 2015
#
# Use twitteR API to query Twitter, parse the search result, and
# perform a series of topic models for identifying potentially
# useful topics from your query content. This has applications for
# social media, research, or general curiosity
#
# Reference
# http://davetang.org/muse/2013/04/06/using-the-r_twitter-package/
# https://blog.credera.com/business-intelligence/twitter-analytics-using-r-part-2-create-word-cloud/


# Load relevant libraries
# Use install.packages("package") to install any you don't have
# If, like me, you're on Linux and cannot install the topicmodels
# package because of a missing GSL component, you'll need to install
# the GSL library and its development library. If, like me, you're on
# a Debian based system, it's pretty easy:
#   sudo apt-get install libgsl0ldbl libgsl0-dev
library(twitteR)
library(NLP)
library(tm)
library(RColorBrewer)
library(wordcloud)
library(topicmodels)
library(SnowballC)


# Register this R session with the Twitter API through twitteR's
# setup_twitter_oauth(). Get your application information from
# https://apps.twitter.com. No, you cannot see mine!
#
# Each argument defaults to an environment variable so that secrets do not
# have to be written into this script. When a variable is unset, the
# original placeholder text is used instead, so a bare my_setup() call
# behaves exactly as before (edit the placeholders or export the variables).
#
# Args:
#   ckey: consumer key         (env var TWITTER_CONSUMER_KEY)
#   csec: consumer secret      (env var TWITTER_CONSUMER_SECRET)
#   akey: access token         (env var TWITTER_ACCESS_TOKEN)
#   asec: access token secret  (env var TWITTER_ACCESS_SECRET)
#
# Returns: the value of setup_twitter_oauth().
my_setup <- function(ckey = Sys.getenv("TWITTER_CONSUMER_KEY",
                                       unset = "Your Consumer Key Here"),
                     csec = Sys.getenv("TWITTER_CONSUMER_SECRET",
                                       unset = "Your Consumer Secret Here"),
                     akey = Sys.getenv("TWITTER_ACCESS_TOKEN",
                                       unset = "Your Access Token Here"),
                     asec = Sys.getenv("TWITTER_ACCESS_SECRET",
                                       unset = "Your Access Token Secret Here")) {
    setup_twitter_oauth(ckey, csec, akey, asec)
}

# Pull the text out of a single tweet object returned by the twitteR API
# by calling the object's getText() method. Used by tweet_corpus() below.
tweet_text <- function(x) {
    text <- x$getText()
    text
}

# Submit a search query (terms separated by "+") and return the text of
# every tweet that came back (the corpus).
#
# Args:
#   search: query string handed to searchTwitter()
#   n:      maximum number of tweets to request (default 5000)
#   ...:    further arguments forwarded to searchTwitter(), e.g. lang
#
# Returns: a character vector with one element per returned tweet.
tweet_corpus <- function(search, n = 5000, ...) {
    payload <- searchTwitter(search, n = n, ...)
    # vapply() pins the result to a character vector. sapply() would hand
    # back an empty list() when the search returns nothing, which breaks
    # the string processing that follows.
    vapply(payload, tweet_text, character(1))
}


# Search for some key terms, try to grab a lot if you want. Twitter will
# limit you as it sees fit (can find). Also has spatial options.
# Try these Sacramento coordinates: '38.630404,-121.293535,50mi'
my_setup()  # Setup the login for this session
tweets <- tweet_corpus("energy+electricity", n = 10000, lang = 'en')

# Save your corpus (because you're limited in how often you can do this for free!)
# The target directory is created first: saveRDS() cannot open the file
# when the directory is missing, and the query above is too costly to repeat.
tweets_file <- "Workspace/energy_themes/tweets.Rds"
dir.create(dirname(tweets_file), recursive = TRUE, showWarnings = FALSE)
saveRDS(tweets, file = tweets_file, compress = 'xz')

# Okay, read that corpus back in from disk. I'm sure you have a
# different save location, right? (Change tweets_file above.)
tweets <- readRDS(tweets_file)


# Okay, here's where things get tricky. See references for examples.
# Problems? Depends if your system is using parallel processing. If
# it is, you'll need to use mc.cores parameters as shown later. That
# took me awhile to get! Thanks to the references for clearing that up.

# Pre-process the raw tweet text in some standard ways.
#
# Args:
#   x: character vector of raw tweets
#
# Returns: character vector of cleaned tweets with duplicates removed.
#
# Notes on the patterns:
#   * "rt" is removed only as a stand-alone word. A bare "rt" pattern also
#     removes the inside of words such as "smart" or "support".
#   * A link is removed up to the next whitespace. A greedy "http.+ "
#     pattern deletes everything up to the last space of the tweet.
#   * "&" arrives as the HTML entity "&amp;". It is removed before the
#     punctuation step so that the real word "amp" (and "example", "lamp",
#     ...) is left alone.
#   * Whitespace is collapsed before trimming, so no stray leading or
#     trailing blank keeps identical tweets from de-duplicating.
clean_tweets <- function(x) {
    x <- iconv(x, to = "ASCII", sub = " ")    # Basic ASCII only, to avoid silly characters
    x <- tolower(x)                           # Consistently lower case
    x <- gsub("\\brt\\b", " ", x)             # Remove the "RT" (retweet) marker
    x <- gsub("@\\w+", " ", x)                # Remove user names
    x <- gsub("http[^[:space:]]*", " ", x)    # Remove links
    x <- gsub("&amp;", " ", x, fixed = TRUE)  # Remove the HTML-escaped "&"
    x <- gsub("[[:punct:]]", " ", x)          # Remove punctuation
    x <- gsub("[[:space:]]+", " ", x)         # Collapse tabs/newlines/runs of blanks
    x <- gsub("^ | $", "", x)                 # Leading and lagging blank
    unique(x)                                 # Now get rid of duplicates!
}

tweets <- clean_tweets(tweets)


# Convert to tm corpus and use its API for some additional fun
corpus <- Corpus(VectorSource(tweets))  # Create corpus object

# Don't forget the mc.cores thing: keep tm single-threaded. It is set as a
# session option instead of passing mc.cores = 1 to every tm_map() call,
# because tm_map() forwards its extra arguments to the transformation and
# functions such as removeWords() have no mc.cores argument.
# NOTE(review): older tm releases are believed to run tm_map() through
# parallel::mclapply(), which reads this option -- confirm against the
# installed tm version.
options(mc.cores = 1)

# Remove English stop words. This could be greatly expanded!
corpus <- tm_map(corpus, removeWords, stopwords("en"))

# Remove numbers. This could have been done earlier, of course.
corpus <- tm_map(corpus, removeNumbers)

# Stem the words. Google if you don't understand
corpus <- tm_map(corpus, stemDocument)

# Remove the stems associated with our search terms!
corpus <- tm_map(corpus, removeWords, c("energi", "electr"))


# Why not visualize the corpus now?
# Mine had a lot to do with {solar, power, renew, new, can, save, suppl, wind, price, use}
pal <- brewer.pal(8, "Dark2")  # 8 colours from the "Dark2" palette
# The colour argument is spelled out in full (`colors`); `col` only reached
# it through R's partial argument matching.
wordcloud(corpus, min.freq = 2, max.words = 150, random.order = TRUE, colors = pal)


# Topic modeling starts here

# Total term count per tweet, so that the document-term matrix (DTM) used
# for modeling is built only from tweets with some actual content left.
doc.lengths <- rowSums(
    as.matrix(DocumentTermMatrix(corpus))
)
dtm <- DocumentTermMatrix(corpus[doc.lengths > 0])
# model <- LDA(dtm, 10)  # Quick sanity check with a simple model, if you like


# Shared settings for every model
SEED <- sample(1:1000000, 1)  # Random seed, kept so the fits can be replicated
k <- 10                       # Start with 10 topics

# Fit four models on the same DTM with the same seed and number of topics.
# This might take a minute!
models <- list(
    CTM = CTM(
        dtm, k = k,
        control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))
    ),
    VEM = LDA(
        dtm, k = k,
        control = list(seed = SEED)
    ),
    VEM_Fixed = LDA(
        dtm, k = k,
        control = list(estimate.alpha = FALSE, seed = SEED)
    ),
    Gibbs = LDA(
        dtm, k = k, method = "Gibbs",
        control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)
    )
)


# There you have it: `models` now holds 4 fitted topic models. See the
# topicmodels API documentation for details.

# Top 10 terms of each topic, for each model.
# Can you put a theme label on any of these "topics" (lists of words)?
lapply(models, function(model) terms(model, 10))

# Matrix of tweet assignments: the predominant topic of each tweet under
# each of the models, in case you want to categorize the tweets.
assignments <- sapply(models, function(model) topics(model))
	# Twitter Topic Modeling Using R
	# Author: Bryan Goodrich
	# Date Created: February 13, 2015
	# Last Modified: April 3, 2015
	#
	# Use twitteR API to query Twitter, parse the search result, and
	# perform a series of topic models for identifying potentially
	# useful topics from your query content. This has applications for
	# social media, research, or general curiosity
	#
	# Reference
	# http://davetang.org/muse/2013/04/06/using-the-r_twitter-package/
	# https://blog.credera.com/business-intelligence/twitter-analytics-using-r-part-2-create-word-cloud/



	# Load relevant libraries
	# Use install.packages("package") to install any you don't have
	# If, like me, you're on Linux and cannot install the topicmodels
	# package because of a missing GSL component, you'll need to install
	# the GSL library and its development library. If, like me, you're on
	# a Debian based system, it's pretty easy:
	# sudo apt-get install libgsl0ldbl libgsl0-dev
	library(twitteR)
	library(NLP)
	library(tm)
	library(RColorBrewer)
	library(wordcloud)
	library(topicmodels)
	library(SnowballC)


	# Register this R session with the Twitter API through twitteR's
	# setup_twitter_oauth(). Get your application information from
	# https://apps.twitter.com. No, you cannot see mine!
	#
	# Each argument defaults to an environment variable so that secrets do not
	# have to be written into this script. When a variable is unset, the
	# original placeholder text is used instead, so a bare my_setup() call
	# behaves exactly as before (edit the placeholders or export the variables).
	#
	# Args:
	#   ckey: consumer key         (env var TWITTER_CONSUMER_KEY)
	#   csec: consumer secret      (env var TWITTER_CONSUMER_SECRET)
	#   akey: access token         (env var TWITTER_ACCESS_TOKEN)
	#   asec: access token secret  (env var TWITTER_ACCESS_SECRET)
	#
	# Returns: the value of setup_twitter_oauth().
	my_setup <- function(ckey = Sys.getenv("TWITTER_CONSUMER_KEY",
	                                       unset = "Your Consumer Key Here"),
	                     csec = Sys.getenv("TWITTER_CONSUMER_SECRET",
	                                       unset = "Your Consumer Secret Here"),
	                     akey = Sys.getenv("TWITTER_ACCESS_TOKEN",
	                                       unset = "Your Access Token Here"),
	                     asec = Sys.getenv("TWITTER_ACCESS_SECRET",
	                                       unset = "Your Access Token Secret Here")) {
	    setup_twitter_oauth(ckey, csec, akey, asec)
	}

	# Pull the text out of a single tweet object returned by the twitteR API
	# by calling the object's getText() method. Used by tweet_corpus() below.
	tweet_text <- function(x) {
	    text <- x$getText()
	    text
	}

	# Submit a search query (terms separated by "+") and return the text of
	# every tweet that came back (the corpus).
	#
	# Args:
	#   search: query string handed to searchTwitter()
	#   n:      maximum number of tweets to request (default 5000)
	#   ...:    further arguments forwarded to searchTwitter(), e.g. lang
	#
	# Returns: a character vector with one element per returned tweet.
	tweet_corpus <- function(search, n = 5000, ...) {
	    payload <- searchTwitter(search, n = n, ...)
	    # vapply() pins the result to a character vector. sapply() would hand
	    # back an empty list() when the search returns nothing, which breaks
	    # the string processing that follows.
	    vapply(payload, tweet_text, character(1))
	}



	# Search for some key terms, try to grab a lot if you want. Twitter will
	# limit you as it sees fit (can find). Also has spatial options.
	# Try these Sacramento coordinates: '38.630404,-121.293535,50mi'
	my_setup() # Setup the login for this session
	tweets <- tweet_corpus("energy+electricity", n = 10000, lang = 'en')

	# Save your corpus (because you're limited in how often you can do this for free!)
	# The target directory is created first: saveRDS() cannot open the file
	# when the directory is missing, and the query above is too costly to repeat.
	tweets_file <- "Workspace/energy_themes/tweets.Rds"
	dir.create(dirname(tweets_file), recursive = TRUE, showWarnings = FALSE)
	saveRDS(tweets, file = tweets_file, compress = 'xz')

	# Okay, read that corpus back in from disk. I'm sure you have a
	# different save location, right? (Change tweets_file above.)
	tweets <- readRDS(tweets_file)



	# Okay, here's where things get tricky. See references for examples.
	# Problems? Depends if your system is using parallel processing. If
	# it is, you'll need to use mc.cores parameters as shown later. That
	# took me awhile to get! Thanks to the references for clearing that up.

	# Pre-process the raw tweet text in some standard ways.
	#
	# Args:
	#   x: character vector of raw tweets
	#
	# Returns: character vector of cleaned tweets with duplicates removed.
	#
	# Notes on the patterns:
	#   * No pattern contains a backslash-pipe sequence: that is not a valid
	#     escape inside an R string literal and stops the script from parsing.
	#   * "rt" is removed only as a stand-alone word. A bare "rt" pattern also
	#     removes the inside of words such as "smart" or "support".
	#   * A link is removed up to the next whitespace. A greedy "http.+ "
	#     pattern deletes everything up to the last space of the tweet.
	#   * "&" arrives as the HTML entity "&amp;". It is removed before the
	#     punctuation step so that the real word "amp" (and "example", "lamp",
	#     ...) is left alone.
	#   * Whitespace is collapsed before trimming, so no stray leading or
	#     trailing blank keeps identical tweets from de-duplicating.
	clean_tweets <- function(x) {
	    x <- iconv(x, to = "ASCII", sub = " ")    # Basic ASCII only, to avoid silly characters
	    x <- tolower(x)                           # Consistently lower case
	    x <- gsub("\\brt\\b", " ", x)             # Remove the "RT" (retweet) marker
	    x <- gsub("@\\w+", " ", x)                # Remove user names
	    x <- gsub("http[^[:space:]]*", " ", x)    # Remove links
	    x <- gsub("&amp;", " ", x, fixed = TRUE)  # Remove the HTML-escaped "&"
	    x <- gsub("[[:punct:]]", " ", x)          # Remove punctuation
	    x <- gsub("[[:space:]]+", " ", x)         # Collapse tabs/newlines/runs of blanks
	    x <- gsub("^ | $", "", x)                 # Leading and lagging blank
	    unique(x)                                 # Now get rid of duplicates!
	}

	tweets <- clean_tweets(tweets)



	# Convert to tm corpus and use its API for some additional fun
	corpus <- Corpus(VectorSource(tweets)) # Create corpus object

	# Don't forget the mc.cores thing: keep tm single-threaded. It is set as a
	# session option instead of passing mc.cores = 1 to every tm_map() call,
	# because tm_map() forwards its extra arguments to the transformation and
	# functions such as removeWords() have no mc.cores argument.
	# NOTE(review): older tm releases are believed to run tm_map() through
	# parallel::mclapply(), which reads this option -- confirm against the
	# installed tm version.
	options(mc.cores = 1)

	# Remove English stop words. This could be greatly expanded!
	corpus <- tm_map(corpus, removeWords, stopwords("en"))

	# Remove numbers. This could have been done earlier, of course.
	corpus <- tm_map(corpus, removeNumbers)

	# Stem the words. Google if you don't understand
	corpus <- tm_map(corpus, stemDocument)

	# Remove the stems associated with our search terms!
	corpus <- tm_map(corpus, removeWords, c("energi", "electr"))



	# Why not visualize the corpus now?
	# Mine had a lot to do with {solar, power, renew, new, can, save, suppl, wind, price, use}
	pal <- brewer.pal(8, "Dark2") # 8 colours from the "Dark2" palette
	# The colour argument is spelled out in full (`colors`); `col` only reached
	# it through R's partial argument matching.
	wordcloud(corpus, min.freq = 2, max.words = 150, random.order = TRUE, colors = pal)



	# Topic modeling starts here

	# Total term count per tweet, so that the document-term matrix (DTM) used
	# for modeling is built only from tweets with some actual content left.
	doc.lengths <- rowSums(
	    as.matrix(DocumentTermMatrix(corpus))
	)
	dtm <- DocumentTermMatrix(corpus[doc.lengths > 0])
	# model <- LDA(dtm, 10) # Quick sanity check with a simple model, if you like


	# Shared settings for every model
	SEED <- sample(1:1000000, 1) # Random seed, kept so the fits can be replicated
	k <- 10                      # Start with 10 topics

	# Fit four models on the same DTM with the same seed and number of topics.
	# This might take a minute!
	models <- list(
	    CTM = CTM(
	        dtm, k = k,
	        control = list(seed = SEED, var = list(tol = 10^-4), em = list(tol = 10^-3))
	    ),
	    VEM = LDA(
	        dtm, k = k,
	        control = list(seed = SEED)
	    ),
	    VEM_Fixed = LDA(
	        dtm, k = k,
	        control = list(estimate.alpha = FALSE, seed = SEED)
	    ),
	    Gibbs = LDA(
	        dtm, k = k, method = "Gibbs",
	        control = list(seed = SEED, burnin = 1000, thin = 100, iter = 1000)
	    )
	)



	# There you have it: `models` now holds 4 fitted topic models. See the
	# topicmodels API documentation for details.

	# Top 10 terms of each topic, for each model.
	# Can you put a theme label on any of these "topics" (lists of words)?
	lapply(models, function(model) terms(model, 10))

	# Matrix of tweet assignments: the predominant topic of each tweet under
	# each of the models, in case you want to categorize the tweets.
	assignments <- sapply(models, function(model) topics(model))