Created
September 29, 2020 06:03
-
-
Save shedoesdatascience/53c7e58e1df6ac696e6672179249e20b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##### Twitter Analysis ####
#### 1. load packages ####
# Every package this script uses; kept in one vector so the install step
# below can check them all. (The original list omitted most of the
# libraries actually loaded.)
needed <- c("rtweet", "twitteR", "dplyr", "tidyr", "tidytext", "lubridate",
            "SentimentAnalysis", "quanteda", "tm", "devtools", "EGAnet",
            "forestmangr", "wordcloud", "syuzhet", "ggplot2")
# Install only the packages that are missing, instead of unconditionally
# reinstalling everything on every run of the script.
missing_pkgs <- needed[!needed %in% rownames(installed.packages())]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}
library(rtweet)
library(twitteR)
library(dplyr)
library(tidyr)
library(tidytext)
library(lubridate)
library(SentimentAnalysis)
library(quanteda)
library(ggplot2)
library(tm)
library(devtools)
library(EGAnet)
library(forestmangr)  # round_df function
library(wordcloud)
library(syuzhet)
#### 2. Set parameters for twitter app access ####
## SECURITY FIX: the original hard-coded live API keys and tokens in the
## script (and published them in a public gist). Credentials must never be
## committed; read them from environment variables instead — set these in
## ~/.Renviron or your shell profile.
api_key <- Sys.getenv("TWITTER_API_KEY")
api_secret_key <- Sys.getenv("TWITTER_API_SECRET_KEY")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET")
# Fail fast with a clear error if any credential is missing.
stopifnot(nzchar(api_key), nzchar(api_secret_key),
          nzchar(access_token), nzchar(access_token_secret))
## 2.1 authenticate via web browser
token <- create_token(
  app = "rexploretweets",
  consumer_key = api_key,
  consumer_secret = api_secret_key,
  access_token = access_token,
  access_secret = access_token_secret)
setup_twitter_oauth(api_key, api_secret_key, access_token, access_token_secret)
## 2.2 Check to see if token is loaded
get_token()
#### 3. Obtain tweets for a specific topic ####
# Query Twitter's REST search API for the hashtag. Note the API only
# returns statuses from roughly the past 6-9 days. Retweets are excluded
# so every row is an original post.
sd_cars <- rtweet::search_tweets(
  q = "#selfdrivingcars",
  n = 5000,
  include_rts = FALSE)
sd_cars  # print the resulting tweets data frame
#### 4. Transform tweets data ####
## 4.1 Remove retweets ##
sd_cars_organic <- sd_cars[sd_cars$is_retweet == FALSE, ]
## 4.2 Remove replies (a reply has a non-NA reply_to_status_id)
sd_cars_organic <- subset(sd_cars_organic, is.na(sd_cars_organic$reply_to_status_id))
# BUG FIX: the original called as.tweet_type_df.frame(), a function that
# does not exist (a botched find-and-replace of "data" across the file);
# as.data.frame() is the intended call.
sd_cars_df <- as.data.frame(sd_cars_organic)
saveRDS(sd_cars_df, file = "sd_cars_tweet.rds")
## 4.3 keeping only the retweets
sd_cars_retweets <- sd_cars[sd_cars$is_retweet == TRUE, ]
sd_cars_replies <- subset(sd_cars, !is.na(sd_cars$reply_to_status_id))
# 4.4 create a data frame with the number of rows for each type of tweet
tweet_type_df <- data.frame(
  category = c("Organic", "Retweets", "Replies"),
  count = c(nrow(sd_cars_organic), nrow(sd_cars_retweets), nrow(sd_cars_replies)))
## 4.5 Donut-chart geometry: each category occupies [ymin, ymax] of the ring.
tweet_type_df$fraction <- tweet_type_df$count / sum(tweet_type_df$count)
tweet_type_df$percentage <- tweet_type_df$fraction * 100
tweet_type_df$ymax <- cumsum(tweet_type_df$fraction)
tweet_type_df$ymin <- c(0, head(tweet_type_df$ymax, n = -1))
# 4.6 Round all numeric columns to two decimal points
tweet_type_df <- round_df(tweet_type_df, 2)
# 4.7 Legend label ("<category> <percentage> %"). Kept as a COLUMN of the
# plotted data frame rather than a free-floating global vector, so that
# aes() maps from the data and the plot is self-contained.
tweet_type_df$Type_of_tweet <- paste(tweet_type_df$category, tweet_type_df$percentage, "%")
ggplot(tweet_type_df, aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = Type_of_tweet)) +
  geom_rect() +
  coord_polar(theta = "y") +
  xlim(c(2, 4)) +   # the 2..3 band becomes the donut hole
  theme_void() +
  theme(legend.position = "right")
#### 5. Visualise tweets ####
# Time series of organic tweet counts in three-hour bins.
ts_plot(sd_cars_organic, by = "3 hours") +
  theme(plot.title = element_text(face = "bold")) +
  labs(x = NULL, y = NULL,
       title = "Frequency of self driving car tweets from past 9 days",
       subtitle = "Twitter status (tweet) counts aggregated using three hour intervals",
       # CAPTION FIX: the original read "tweet_type_df collected from
       # Twitter's rest API via retweet" — a botched find-and-replace of
       # "Data" plus a typo for the rtweet package name.
       caption = "\n Source: Data collected from Twitter's REST API via rtweet")
## 5.1 Who is tweeting ##
users <- search_users("#selfdrivingcars", n = 500)
# BUG FIX: as.tweet_type_df.frame() does not exist (botched find-and-replace
# of "data"); as.data.frame() is the intended call.
users_df <- as.data.frame(users)
saveRDS(users_df, file = "users_tweet.rds")
# just view the first 10 users
head(users, n = 10)
## 5.2 how many locations are represented
length(unique(users$location))
# Top 10 locations by user count. Blank locations ("") are dropped along
# with NAs — an empty string is not a usable location and would otherwise
# dominate the chart. slice_max() replaces the superseded top_n() and makes
# the ranking column explicit (no "Selecting by n" message).
users %>%
  count(location, sort = TRUE) %>%
  filter(!is.na(location), location != "") %>%
  mutate(location = reorder(location, n)) %>%
  slice_max(order_by = n, n = 10) %>%
  ggplot(aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Location",
       y = "Count",
       title = "Twitter users - unique locations - Top 10 by usage")
## 5.3 most frequent words ##
# Clean the raw tweet text before building the corpus. Fixes to the
# original patterns:
#   - "https\\s*" only stripped the literal "https" (\\s matches
#     whitespace), leaving "://t.co/..." fragments in the text;
#     "https\\S*" removes the entire URL.
#   - "@\\s*" only stripped the "@" sign itself; "@\\S*" removes the
#     whole mention.
#   - gsub("amp", "") deleted "amp" inside ordinary words (e.g. turning
#     "campaign" into "cign"); the HTML entity "&amp;" is what actually
#     appears in tweet text, so target it explicitly.
sd_cars_organic$text <- gsub("https\\S*", "", sd_cars_organic$text)
sd_cars_organic$text <- gsub("@\\S*", "", sd_cars_organic$text)
sd_cars_organic$text <- gsub("&amp;", "", sd_cars_organic$text)
sd_cars_organic$text <- gsub("[[:punct:]]", "", sd_cars_organic$text)
# Build a tm corpus from just the text column so it can be cleaned there.
text_corpus <- Corpus(VectorSource(sd_cars_organic$text))
# Remove terms trivially correlated with the search hashtag (car /
# selfdriving / autonomous variants), plus the "rt"/"re" tokens left over
# from retweets and replies.
domain_words <- c("selfdrivingcars", "driverlesscars", "autonomousvehicles",
                  "cars", "car", "rt", "re", "vehicle", "selfdriving",
                  "driverless", "autonomous")
text_corpus <- tm_map(text_corpus, removeWords, domain_words)
# Standard tm clean-up: stop words, punctuation, digits (order matters —
# stop words are matched before punctuation is stripped).
text_corpus <- tm_map(text_corpus, removeWords, stopwords("english"))
text_corpus <- tm_map(text_corpus, removePunctuation)
text_corpus <- tm_map(text_corpus, removeNumbers)
# Term frequencies: term-document matrix -> per-term row sums, descending.
dtm <- TermDocumentMatrix(text_corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)
# Visualise as a word cloud; the seed makes the layout reproducible.
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 10, max.words = 200,
          random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# Convert the cleansed corpus back into a data frame for sentiment analysis.
sd_cars_tweets <- data.frame(text_clean = get("content", text_corpus),
                             stringsAsFactors = FALSE)
#### 6. Sentiment analysis of tweets ####
## Convert tweets to ASCII to get rid of strange characters
sd_cars_tweets$text_clean <- iconv(sd_cars_tweets$text_clean, from = "UTF-8", to = "ASCII", sub = "")
# removing mentions, in case needed
sd_cars_tweets$text_clean <- gsub("@\\w+", "", sd_cars_tweets$text_clean)
# NRC emotion lexicon: one column per emotion/polarity, one row per tweet;
# column sums give the total score for each sentiment across all tweets.
tweet_sentiment <- get_nrc_sentiment(sd_cars_tweets$text_clean)
sentimentscores <- data.frame(colSums(tweet_sentiment))
names(sentimentscores) <- "Score"
sentimentscores <- cbind("sentiment" = rownames(sentimentscores), sentimentscores)
rownames(sentimentscores) <- NULL
# BUG FIX: the original applied theme_minimal() LAST; complete themes
# replace the whole theme and silently discarded the earlier
# theme(legend.position = "none") tweak. Complete theme first, tweaks after.
ggplot2::ggplot(data = sentimentscores) +
  geom_bar(mapping = aes(x = sentiment, y = Score), stat = "identity") +
  theme_minimal() +
  theme(legend.position = "none") +
  xlab("Sentiments") + ylab("Scores") +
  ggtitle("Total sentiment based on scores")
## Only look at negative sentiment ##
sd_cars_sentiments <- analyzeSentiment(sd_cars_tweets$text_clean)
## Keep the four dictionary polarity scores plus the word count.
sd_cars_sentiments_subset <- dplyr::select(
  sd_cars_sentiments,
  SentimentGI, SentimentHE, SentimentLM, SentimentQDAP, WordCount)
# Average the four dictionary scores (every column except WordCount,
# which sits in column 5) into a single sentiment value per tweet.
sd_cars_sentiments_subset$mean_sentiment <-
  rowMeans(sd_cars_sentiments_subset[, -5])
# Drop the individual dictionary scores; keep word count and the mean.
sd_cars_sentiments_subset <-
  sd_cars_sentiments_subset[, c("WordCount", "mean_sentiment")]
# Attach the sentiment columns to the cleaned tweet text.
sd_cars_df <- cbind.data.frame(sd_cars_tweets, sd_cars_sentiments_subset)
# only keep tweets whose mean sentiment is negative
sd_cars_df_negative <- filter(sd_cars_df, mean_sentiment < 0)
nrow(sd_cars_df_negative)
# Topic analysis of the negative-sentiment tweets.
sd_cars_tokenized_list <- tokens(sd_cars_df_negative$text_clean)
# Turn the token list into a quanteda document-feature matrix; column sums
# give the usage count of every word across the negative tweets.
sd_cars_dfm <- dfm(sd_cars_tokenized_list)
word_sums <- colSums(sd_cars_dfm)
length(word_sums)  # number of distinct words
# Most frequent words, sorted by descending frequency.
freq_negative_words <- data.frame(
  word = names(word_sums),
  freq = word_sums,
  row.names = NULL,
  stringsAsFactors = FALSE)
sorted_freq_neg <-
  freq_negative_words[order(freq_negative_words$freq, decreasing = TRUE), ]
# Strip characters that do not survive the latin1 -> ASCII conversion.
sd_cars_df_negative$text_clean <- sapply(
  sd_cars_df_negative$text_clean,
  function(row) iconv(row, "latin1", "ASCII", sub = ""))
# Document-term matrix for clustering; prune terms that appear in fewer
# than 2% of documents.
neg_corpus_tm <- Corpus(VectorSource(sd_cars_df_negative$text_clean))
neg_tm <- DocumentTermMatrix(neg_corpus_tm)
neg_tm <- removeSparseTerms(neg_tm, 0.98)
neg_df_cluster <- as.data.frame(as.matrix(neg_tm))
# Create clusters using exploratory graph analysis (EGAnet).
ega_neg_sd_cars <- EGA(neg_df_cluster)
View(as.data.frame(ega_neg_sd_cars$dim.variables))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment