Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Analyzing Twitter data using R
# Install required packages (one-time setup; safe to skip if already installed).
# The original script referenced an undefined `toInstall` variable, which
# errors at runtime — it is now defined before use and replaces the
# redundant individual install.packages() calls.
toInstall <- c("twitteR", "RCurl", "httr", "devtools", "base64enc")
install.packages(toInstall, repos = "http://cran.r-project.org")
# Load necessary packages
library(devtools)
library(twitteR)
library(RCurl)
library(base64enc)
# XXX: Go to http://dev.twitter.com/apps/new to create an app and get values
# for these credentials, which you'll need to provide in place of these
# empty string values that are defined as placeholders.
# See https://dev.twitter.com/docs/auth/oauth for more information
# on Twitter's OAuth implementation.
# SECURITY NOTE(review): never commit real credentials — load them from
# environment variables or a local config file kept out of version control.
Access_token <- ""
Access_token_secret <- ""
consumer_key <- ""
consumer_secret <- ""
# Authenticate this R session with the Twitter API via twitteR's OAuth
# helper; requires network access and the credentials filled in above.
setup_twitter_oauth(consumer_key,consumer_secret,Access_token,Access_token_secret)
# Fetch up to 3200 tweets (the API's maximum) from the user's timeline,
# including retweets. Replace "name" with the actual screen name
# (the comment context suggests ma3Route was the intended account).
tweets <- userTimeline("name", n=3200, maxID=NULL, sinceID=NULL, includeRts=TRUE)
# Convert the list of status objects into a data frame (one row per tweet)
tweets.df <-twListToDF(tweets)
# rows x columns — rows = number of tweets actually returned (may be < 3200)
dim(tweets.df)
library(tm)
library(stringr)
# Build a corpus from the tweet text column; a corpus is tm's container
# for a collection of written texts.
myCorpus <- Corpus(VectorSource(tweets.df$text))
# Normalize encoding to UTF-8, replacing unconvertible bytes with their
# byte escapes so later tm transformations don't fail on bad characters
myCorpus <- tm_map(myCorpus, function(x) iconv(enc2utf8(x), sub = "byte"))
# Lowercase every document (content_transformer keeps the corpus structure)
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
# Strip punctuation characters
myCorpus <- tm_map(myCorpus, removePunctuation)
# Strip digits
myCorpus <- tm_map(myCorpus, removeNumbers)
# Clean a character vector of tweet text: strip URLs, RT markers, hashtags,
# mentions, control characters, digits, punctuation, and excess whitespace.
#
# BUG FIX: the original body discarded the result of every gsub() except the
# last one, so only the final whitespace-collapsing step ever took effect.
# Each substitution now feeds into the next so all of them actually apply.
#
# @param x character vector of raw tweet text
# @return character vector of the same length, cleaned
Textprocessing <- function(x) {
  x <- gsub("http[[:alnum:]]*", "", x)  # bare "http" + trailing alnums
  x <- gsub("http\\S+\\s*", "", x)      # remove URLs
  x <- gsub("\\b+RT", "", x)            # remove RT markers
  x <- gsub("#\\S+", "", x)             # remove hashtags
  x <- gsub("@\\S+", "", x)             # remove mentions
  x <- gsub("[[:cntrl:]]", "", x)       # remove control characters
  x <- gsub("\\d", "", x)               # remove digits
  x <- gsub("[[:punct:]]", "", x)       # remove punctuation
  x <- gsub("^[[:space:]]*", "", x)     # trim leading whitespace
  x <- gsub("[[:space:]]*$", "", x)     # trim trailing whitespace
  gsub(" +", " ", x)                    # collapse internal runs of spaces
}
# Apply the custom URL/RT/hashtag/mention cleaner to every document
myCorpus <- tm_map(myCorpus,Textprocessing)
# Collapse whitespace left behind by the removals above
myCorpus <- tm_map(myCorpus, stripWhitespace)
# Stopwords are words which do not carry much significance on their own;
# they are filtered out because they add noise rather than information.
# Extend the standard English list with corpus-specific noise words.
mystopwords <- c(stopwords("english"),"rt","íí","get","like","just","yes","know","will","good","day","people")
# Remove the stopwords from every document
myCorpus <- tm_map(myCorpus,removeWords,mystopwords)
# Keep an unstemmed copy: used below as the stem-completion dictionary
# and for previewing the cleaned (pre-stemming) text
myCorpus_copy <- myCorpus
# Reduce words to their stems (e.g. "driving" -> "drive")
myCorpus <- tm_map(myCorpus,stemDocument)
# Preview a few cleaned (pre-stemming) documents, wrapped at 60 characters.
# NOTE(review): index 3163 is hard-coded — this errors if fewer than 3163
# tweets were returned; verify against length(myCorpus_copy) first.
for (i in c(1:2, 3163)){
cat(paste0("[", i, "] "))
writeLines(strwrap(as.character(myCorpus_copy[[i]]), 60))}
# stemCompletion2 <- function(x, dictionary){
# x <- unlist(strsplit(as.character(x), " "))
# # Unexpectedly, stemCompletion completes an empty string to
# # a word in dictionary. Remove empty string to avoid above issue.
# x <- x[x != ""]
# x <- stemCompletion(x, dictionary=dictionary)
# x <- paste(x, sep="", collapse=" ")
# PlainTextDocument(stripWhitespace(x))
# }
# Complete stems back to full dictionary words using the pre-stemming corpus.
# NOTE(review): with tm >= 0.6, stemCompletion applied via tm_map can yield
# NA because whole documents (not individual tokens) are passed in; the
# stemCompletion2() helper above exists as a workaround — verify which is
# needed for your tm version.
myCorpus <- tm_map(myCorpus, stemCompletion, dictionary=myCorpus_copy)
#myCorpus <- lapply(myCorpus, stemCompletion2, dictionary=myCorpus_copy)
# Re-encode the cleaned text to ASCII, then rebuild the corpus from it.
# (The original ran iconv() on the Corpus object itself, which coerces it
# to character and corrupts the corpus structure.)
clean_text <- iconv(sapply(myCorpus, as.character), "latin1", "ASCII", sub = "")
myCorpus <- Corpus(VectorSource(clean_text))
# Build the term-document matrix from the CLEANED corpus — the original
# used myCorpus_copy, which silently discarded all preprocessing above.
# Also fixed: tm's control option is `wordLengths` (capital L); the
# misspelled `wordlengths` was silently ignored.
tdm <- TermDocumentMatrix(myCorpus, control = list(wordLengths = c(1, Inf)))
tdm
# Terms appearing at least 50 times across the corpus
freq.terms <- findFreqTerms(tdm, lowfreq = 50)
View(freq.terms)
# Total occurrence count per term (row sums of the term-document matrix)
termFreq <- rowSums(as.matrix(tdm))
# NOTE(review): threshold 20 here vs lowfreq 50 above — confirm which cutoff
# the plot is meant to use
termFreq <- subset(termFreq, termFreq >=20)
# Plot-ready data frame: one row per term with its frequency
df <- data.frame(term = names(termFreq), freq = termFreq)
View(df)
# Visualize frequent terms as a horizontal bar chart, ordered by frequency.
# Fixes from review:
#  - duplicated library(ggplot2) call removed
#  - bare column names inside aes() instead of df$term / df$freq
#    (df$ inside aes() is an anti-pattern: it bypasses ggplot's data masking)
#  - scale_fill_gradientn() instead of scale_colour_gradientn(): the bars
#    map the `fill` aesthetic, so the colour scale had no visible effect
library(ggplot2)
ggplot(df, aes(x = reorder(term, +freq), y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  scale_fill_gradientn(colors = terrain.colors(10)) +
  xlab("Terms") +
  ylab("Count") +
  coord_flip()
#load required libraries
library(wordcloud)
library(wordcloud2)
# Dense term-document matrix for frequency computation
m <- as.matrix(tdm)
# Green sequential palette, dropping the four lightest shades
pal <- brewer.pal(9, "BuGn")
pal <- pal[-(1:4)]
# Per-term frequencies sorted in decreasing order
word.freq <- sort(rowSums(m), decreasing = T)
# NOTE(review): pal and word.freq are computed but never used below —
# wordcloud2 draws from df with its own colour scheme; they look like
# leftovers from a wordcloud() call. Also, figPath requires "sev.png" to
# exist in the working directory — verify, or drop the argument.
wordcloud2(df, color = "random-dark", backgroundColor = "white",figPath = "sev.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment