Instantly share code, notes, and snippets.

Embed
What would you like to do?
Analyzing a South African financial news corpus from Twitter
library(httr)
library(httpuv)
library(RColorBrewer)
library(twitteR)
library(tm)
library(wordcloud)
library(base64enc)
library(devtools)
library(ROAuth)
# Twitter API credentials (placeholders to be replaced with real values).
# NOTE(review): never commit real keys to a public gist — prefer reading
# them from environment variables via Sys.getenv(); confirm before sharing.
api_key <- "API"
api_secret <- "API Secret"
access_token <- "ACCESS Token"
access_token_secret <- "ACCESS Token secret"
# Authenticate the twitteR session with the credentials above.
setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
# Pull up to 3,200 recent tweets (the API's per-timeline cap) from each of
# three South African financial news accounts.
fin24Tweets <- userTimeline('Fin24', n = 3200)
bdLiveTweets <- userTimeline('BDliveSA', n = 3200)
moneyWebTweets <- userTimeline('Moneyweb', n = 3200)
# Bug fix: the original concatenated moneyWebTweets twice and dropped
# fin24Tweets entirely, silently skewing the corpus toward one outlet.
tweets <- c(fin24Tweets, bdLiveTweets, moneyWebTweets)
# Extract the text body of each status object. vapply pins the result to
# a character vector, unlike sapply, whose return type varies with input.
texts_from_tweets <- vapply(tweets, function(x) x$getText(), character(1))
#' Clean raw tweet text for corpus analysis.
#'
#' Removes URLs, @mentions and the HTML "&amp;" entity, collapses
#' whitespace, trims the ends, and drops exact duplicates (e.g. retweets
#' of the same headline).
#'
#' @param tweets Character vector of raw tweet texts.
#' @return Character vector of cleaned, de-duplicated tweets.
cleanTweets <- function(tweets)
{
  # Remove whole URLs first. The original only deleted the literal
  # substring "https", leaving "://t.co/..." debris in the text.
  tweets <- gsub("http\\S+", " ", tweets)
  # Remove @mentions. This must run while "@" is still present: the
  # original stripped every "@" first, so "@\\w+" could never match and
  # the handle text leaked into the corpus.
  tweets <- gsub("@\\w+", " ", tweets)
  # Replace the HTML entity "&amp;" as a unit. The original deleted the
  # bare substring "amp", mangling words such as "campaign".
  tweets <- gsub("&amp;", " ", tweets, fixed = TRUE)
  # Collapse runs of spaces/tabs (the original repeated this step twice)
  # and trim leading/trailing whitespace.
  tweets <- gsub("[ \t]+", " ", tweets)
  tweets <- trimws(tweets)
  # De-duplicate identical tweets.
  unique(tweets)
}
# Clean the raw tweet texts (URLs, mentions, whitespace, duplicates).
texts_from_tweets=cleanTweets(texts_from_tweets)
# Build a tm corpus with one document per cleaned tweet.
tweets_corpus = Corpus(VectorSource(texts_from_tweets))
# First pass: remove the default English stopwords.
# NOTE(review): the second tm_map() further down restarts from
# tweets_corpus and overwrites this variable, so this pass is effectively
# discarded — though add_more_stopwords also includes stopwords("english"),
# so the end result is largely the same. Confirm which pass is intended.
tweets_corpus_cleaned<- tm_map(tweets_corpus, function(x)removeWords(x,stopwords()))
# create document term matrix applying some transformations
# Domain stopwords: generic news-desk / publication vocabulary that would
# otherwise dominate the word cloud. stopwords("english") is appended so
# this vector is a complete stopword list on its own.
add_more_stopwords <- c(
  "reports", "report", "opinion", "column", "can", "will", "still", "read",
  "wants", "says", "national", "world", "sport", "life", "video",
  "lunchbox", "comment", "must", "miss", "new", "editorial", "popular",
  "cartoon", "international", "politics", "companies",
  "business", "day", "top", "week", "markets", "economy", "subscribe",
  "case", "missed", "ahead", "editor", "premium", "tomorrow", "stories",
  "click", "keep", "needs", "interview", "moneyweb", "year", "soapbox",
  "news", "expo", "register", "today", "need", "now", "podcast", "lineup",
  "bafana", "line-up", "reader", "question", "wednesday", "conversation",
  "money", "company", "writes", "how",
  stopwords("english")
)
# Remove the domain stopwords from the already-cleaned corpus.
# Bug fix: the original mapped over the raw tweets_corpus, discarding the
# earlier English-stopword pass stored in tweets_corpus_cleaned.
tweets_corpus_cleaned <- tm_map(tweets_corpus_cleaned,
                                function(x) removeWords(x, add_more_stopwords))
# Build the term-document matrix, lower-casing and stripping punctuation,
# numbers and the extended stopword list.
term_document_matrix <- TermDocumentMatrix(
  tweets_corpus_cleaned,
  control = list(removePunctuation = TRUE, stopwords = add_more_stopwords,
                 removeNumbers = TRUE, tolower = TRUE)
)
# Convert to a plain matrix so row sums give per-term counts.
term_document_matrix <- as.matrix(term_document_matrix)
# Total frequency of each term across all tweets, most frequent first.
word_freqs <- sort(rowSums(term_document_matrix), decreasing = TRUE)
# Pair each term with its frequency for plotting.
word_freq_table <- data.frame(word = names(word_freqs), freq = word_freqs)
# Draw the word cloud, sizing words by frequency.
wordcloud(word_freq_table$word, word_freq_table$freq, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
library('topicmodels')
library(tm)
library(ggplot2)
library(dplyr)
library(tidytext)
# create the document term matrix
dtm <- DocumentTermMatrix(
  tweets_corpus_cleaned,
  control = list(removePunctuation = TRUE, stopwords = add_more_stopwords,
                 removeNumbers = TRUE, tolower = TRUE)
)
# Fit the LDA model with Gibbs sampling; fixing the seed makes the fitted
# topics reproducible between runs (the original was nondeterministic).
lda_model <- LDA(dtm, k = 3, method = "Gibbs",
                 control = list(seed = 1234)) # Use 3 topics
# Per-topic word probabilities (beta) in tidy long form.
the_topics <- tidy(lda_model, matrix = "beta")
# Keep the 10 highest-probability terms in each topic.
topic_terms <- the_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>% # slice_max supersedes top_n
  ungroup() %>%
  arrange(topic, -beta)
# reorder_within() orders terms independently inside each facet; a plain
# reorder() misorders terms that appear in more than one topic.
topic_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment