Instantly share code, notes, and snippets.

Embed
What would you like to do?
Analyzing a South African financial news corpus from Twitter
library(httr)
library(httpuv)
library(RColorBrewer)
library(twitteR)
library(tm)
library(wordcloud)
library(base64enc)
library(devtools)
library(ROAuth)
# Twitter API credentials (placeholders to be replaced with real values).
# NOTE(review): never commit real keys to a public gist — prefer reading
# them from environment variables via Sys.getenv(); confirm before sharing.
api_key <- "API"
api_secret <- "API Secret"
access_token <- "ACCESS Token"
access_token_secret <- "ACCESS Token secret"
# Authenticate the twitteR session with the credentials above.
setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
# Pull up to 3,200 recent tweets (the API's per-timeline cap) from each of
# three South African financial news accounts.
fin24Tweets <- userTimeline('Fin24', n = 3200)
bdLiveTweets <- userTimeline('BDliveSA', n = 3200)
moneyWebTweets <- userTimeline('Moneyweb', n = 3200)
# Bug fix: the original concatenated moneyWebTweets twice and dropped
# fin24Tweets entirely, silently skewing the corpus toward one outlet.
tweets <- c(fin24Tweets, bdLiveTweets, moneyWebTweets)
# Extract the text body of each status object. vapply pins the result to
# a character vector, unlike sapply, whose return type varies with input.
texts_from_tweets <- vapply(tweets, function(x) x$getText(), character(1))
#' Clean raw tweet text for corpus analysis.
#'
#' Removes URLs, @mentions and the HTML "&amp;" entity, collapses
#' whitespace, trims the ends, and drops exact duplicates (e.g. retweets
#' of the same headline).
#'
#' @param tweets Character vector of raw tweet texts.
#' @return Character vector of cleaned, de-duplicated tweets.
cleanTweets <- function(tweets)
{
  # Remove whole URLs first. The original only deleted the literal
  # substring "https", leaving "://t.co/..." debris in the text.
  tweets <- gsub("http\\S+", " ", tweets)
  # Remove @mentions. This must run while "@" is still present: the
  # original stripped every "@" first, so "@\\w+" could never match and
  # the handle text leaked into the corpus.
  tweets <- gsub("@\\w+", " ", tweets)
  # Replace the HTML entity "&amp;" as a unit. The original deleted the
  # bare substring "amp", mangling words such as "campaign".
  tweets <- gsub("&amp;", " ", tweets, fixed = TRUE)
  # Collapse runs of spaces/tabs (the original repeated this step twice)
  # and trim leading/trailing whitespace.
  tweets <- gsub("[ \t]+", " ", tweets)
  tweets <- trimws(tweets)
  # De-duplicate identical tweets.
  unique(tweets)
}
# Clean the raw tweet texts (URLs, mentions, whitespace, duplicates).
texts_from_tweets=cleanTweets(texts_from_tweets)
# Build a tm corpus with one document per cleaned tweet.
tweets_corpus = Corpus(VectorSource(texts_from_tweets))
# First pass: remove the default English stopwords.
# NOTE(review): the second tm_map() further down restarts from
# tweets_corpus and overwrites this variable, so this pass is effectively
# discarded — though add_more_stopwords also includes stopwords("english"),
# so the end result is largely the same. Confirm which pass is intended.
tweets_corpus_cleaned<- tm_map(tweets_corpus, function(x)removeWords(x,stopwords()))
# create document term matrix applying some transformations
# Domain stopwords: generic news-desk / publication vocabulary that would
# otherwise dominate the word cloud. stopwords("english") is appended so
# this vector is a complete stopword list on its own.
add_more_stopwords <- c(
  "reports", "report", "opinion", "column", "can", "will", "still", "read",
  "wants", "says", "national", "world", "sport", "life", "video",
  "lunchbox", "comment", "must", "miss", "new", "editorial", "popular",
  "cartoon", "international", "politics", "companies",
  "business", "day", "top", "week", "markets", "economy", "subscribe",
  "case", "missed", "ahead", "editor", "premium", "tomorrow", "stories",
  "click", "keep", "needs", "interview", "moneyweb", "year", "soapbox",
  "news", "expo", "register", "today", "need", "now", "podcast", "lineup",
  "bafana", "line-up", "reader", "question", "wednesday", "conversation",
  "money", "company", "writes", "how",
  stopwords("english")
)
# Remove the domain stopwords from the already-cleaned corpus.
# Bug fix: the original mapped over the raw tweets_corpus, discarding the
# earlier English-stopword pass stored in tweets_corpus_cleaned.
tweets_corpus_cleaned <- tm_map(tweets_corpus_cleaned,
                                function(x) removeWords(x, add_more_stopwords))
# Build the term-document matrix, lower-casing and stripping punctuation,
# numbers and the extended stopword list.
term_document_matrix <- TermDocumentMatrix(
  tweets_corpus_cleaned,
  control = list(removePunctuation = TRUE, stopwords = add_more_stopwords,
                 removeNumbers = TRUE, tolower = TRUE)
)
# Convert to a plain matrix so row sums give per-term counts.
term_document_matrix <- as.matrix(term_document_matrix)
# Total frequency of each term across all tweets, most frequent first.
word_freqs <- sort(rowSums(term_document_matrix), decreasing = TRUE)
# Pair each term with its frequency for plotting.
word_freq_table <- data.frame(word = names(word_freqs), freq = word_freqs)
# Draw the word cloud, sizing words by frequency.
wordcloud(word_freq_table$word, word_freq_table$freq, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
library('topicmodels')
library(tm)
library(ggplot2)
library(dplyr)
library(tidytext)
# create the document term matrix
dtm <- DocumentTermMatrix(
  tweets_corpus_cleaned,
  control = list(removePunctuation = TRUE, stopwords = add_more_stopwords,
                 removeNumbers = TRUE, tolower = TRUE)
)
# Fit the LDA model with Gibbs sampling; fixing the seed makes the fitted
# topics reproducible between runs (the original was nondeterministic).
lda_model <- LDA(dtm, k = 3, method = "Gibbs",
                 control = list(seed = 1234)) # Use 3 topics
# Per-topic word probabilities (beta) in tidy long form.
the_topics <- tidy(lda_model, matrix = "beta")
# Keep the 10 highest-probability terms in each topic.
topic_terms <- the_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>% # slice_max supersedes top_n
  ungroup() %>%
  arrange(topic, -beta)
# reorder_within() orders terms independently inside each facet; a plain
# reorder() misorders terms that appear in more than one topic.
topic_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment