Created
September 29, 2020 06:03
-
-
Save shedoesdatascience/53c7e58e1df6ac696e6672179249e20b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##### Twitter Analysis ####
#### 1. load packages ####
# Every package this script uses; kept in one vector so the install step
# below can check them all. (The original list omitted most of the
# libraries actually loaded.)
needed <- c("rtweet", "twitteR", "dplyr", "tidyr", "tidytext", "lubridate",
            "SentimentAnalysis", "quanteda", "tm", "devtools", "EGAnet",
            "forestmangr", "wordcloud", "syuzhet", "ggplot2")
# Install only the packages that are missing, instead of unconditionally
# reinstalling everything on every run of the script.
missing_pkgs <- needed[!needed %in% rownames(installed.packages())]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}
library(rtweet)
library(twitteR)
library(dplyr)
library(tidyr)
library(tidytext)
library(lubridate)
library(SentimentAnalysis)
library(quanteda)
library(ggplot2)
library(tm)
library(devtools)
library(EGAnet)
library(forestmangr)  # round_df function
library(wordcloud)
library(syuzhet)
#### 2. Set parameters for twitter app access ####
## SECURITY FIX: the original hard-coded live API keys and tokens in the
## script (and published them in a public gist). Credentials must never be
## committed; read them from environment variables instead — set these in
## ~/.Renviron or your shell profile.
api_key <- Sys.getenv("TWITTER_API_KEY")
api_secret_key <- Sys.getenv("TWITTER_API_SECRET_KEY")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_token_secret <- Sys.getenv("TWITTER_ACCESS_TOKEN_SECRET")
# Fail fast with a clear error if any credential is missing.
stopifnot(nzchar(api_key), nzchar(api_secret_key),
          nzchar(access_token), nzchar(access_token_secret))
## 2.1 authenticate via web browser
token <- create_token(
  app = "rexploretweets",
  consumer_key = api_key,
  consumer_secret = api_secret_key,
  access_token = access_token,
  access_secret = access_token_secret)
setup_twitter_oauth(api_key, api_secret_key, access_token, access_token_secret)
## 2.2 Check to see if token is loaded
get_token()
#### 3. Obtain tweets for a specific topic ####
# Query Twitter's REST search API for the hashtag. Note the API only
# returns statuses from roughly the past 6-9 days. Retweets are excluded
# so every row is an original post.
sd_cars <- rtweet::search_tweets(
  q = "#selfdrivingcars",
  n = 5000,
  include_rts = FALSE)
sd_cars  # print the resulting tweets data frame
#### 4. Transform tweets data ####
## 4.1 Remove retweets ##
sd_cars_organic <- sd_cars[sd_cars$is_retweet == FALSE, ]
## 4.2 Remove replies (a reply has a non-NA reply_to_status_id)
sd_cars_organic <- subset(sd_cars_organic, is.na(sd_cars_organic$reply_to_status_id))
# BUG FIX: the original called as.tweet_type_df.frame(), a function that
# does not exist (a botched find-and-replace of "data" across the file);
# as.data.frame() is the intended call.
sd_cars_df <- as.data.frame(sd_cars_organic)
saveRDS(sd_cars_df, file = "sd_cars_tweet.rds")
## 4.3 keeping only the retweets
sd_cars_retweets <- sd_cars[sd_cars$is_retweet == TRUE, ]
sd_cars_replies <- subset(sd_cars, !is.na(sd_cars$reply_to_status_id))
# 4.4 create a data frame with the number of rows for each type of tweet
tweet_type_df <- data.frame(
  category = c("Organic", "Retweets", "Replies"),
  count = c(nrow(sd_cars_organic), nrow(sd_cars_retweets), nrow(sd_cars_replies)))
## 4.5 Donut-chart geometry: each category occupies [ymin, ymax] of the ring.
tweet_type_df$fraction <- tweet_type_df$count / sum(tweet_type_df$count)
tweet_type_df$percentage <- tweet_type_df$fraction * 100
tweet_type_df$ymax <- cumsum(tweet_type_df$fraction)
tweet_type_df$ymin <- c(0, head(tweet_type_df$ymax, n = -1))
# 4.6 Round all numeric columns to two decimal points
tweet_type_df <- round_df(tweet_type_df, 2)
# 4.7 Legend label ("<category> <percentage> %"). Kept as a COLUMN of the
# plotted data frame rather than a free-floating global vector, so that
# aes() maps from the data and the plot is self-contained.
tweet_type_df$Type_of_tweet <- paste(tweet_type_df$category, tweet_type_df$percentage, "%")
ggplot(tweet_type_df, aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = Type_of_tweet)) +
  geom_rect() +
  coord_polar(theta = "y") +
  xlim(c(2, 4)) +   # the 2..3 band becomes the donut hole
  theme_void() +
  theme(legend.position = "right")
#### 5. Visualise tweets ####
# Time series of organic tweet counts in three-hour bins.
ts_plot(sd_cars_organic, by = "3 hours") +
  theme(plot.title = element_text(face = "bold")) +
  labs(x = NULL, y = NULL,
       title = "Frequency of self driving car tweets from past 9 days",
       subtitle = "Twitter status (tweet) counts aggregated using three hour intervals",
       # CAPTION FIX: the original read "tweet_type_df collected from
       # Twitter's rest API via retweet" — a botched find-and-replace of
       # "Data" plus a typo for the rtweet package name.
       caption = "\n Source: Data collected from Twitter's REST API via rtweet")
## 5.1 Who is tweeting ##
users <- search_users("#selfdrivingcars", n = 500)
# BUG FIX: as.tweet_type_df.frame() does not exist (botched find-and-replace
# of "data"); as.data.frame() is the intended call.
users_df <- as.data.frame(users)
saveRDS(users_df, file = "users_tweet.rds")
# just view the first 10 users
head(users, n = 10)
## 5.2 how many locations are represented
length(unique(users$location))
# Top 10 locations by user count. Blank locations ("") are dropped along
# with NAs — an empty string is not a usable location and would otherwise
# dominate the chart. slice_max() replaces the superseded top_n() and makes
# the ranking column explicit (no "Selecting by n" message).
users %>%
  count(location, sort = TRUE) %>%
  filter(!is.na(location), location != "") %>%
  mutate(location = reorder(location, n)) %>%
  slice_max(order_by = n, n = 10) %>%
  ggplot(aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Location",
       y = "Count",
       title = "Twitter users - unique locations - Top 10 by usage")
## 5.3 most frequent words ##
# Clean the raw tweet text before building the corpus. Fixes to the
# original patterns:
#   - "https\\s*" only stripped the literal "https" (\\s matches
#     whitespace), leaving "://t.co/..." fragments in the text;
#     "https\\S*" removes the entire URL.
#   - "@\\s*" only stripped the "@" sign itself; "@\\S*" removes the
#     whole mention.
#   - gsub("amp", "") deleted "amp" inside ordinary words (e.g. turning
#     "campaign" into "cign"); the HTML entity "&amp;" is what actually
#     appears in tweet text, so target it explicitly.
sd_cars_organic$text <- gsub("https\\S*", "", sd_cars_organic$text)
sd_cars_organic$text <- gsub("@\\S*", "", sd_cars_organic$text)
sd_cars_organic$text <- gsub("&amp;", "", sd_cars_organic$text)
sd_cars_organic$text <- gsub("[[:punct:]]", "", sd_cars_organic$text)
# Build a tm corpus from just the text column so it can be cleaned there.
text_corpus <- Corpus(VectorSource(sd_cars_organic$text))
# Remove terms trivially correlated with the search hashtag (car /
# selfdriving / autonomous variants), plus the "rt"/"re" tokens left over
# from retweets and replies.
domain_words <- c("selfdrivingcars", "driverlesscars", "autonomousvehicles",
                  "cars", "car", "rt", "re", "vehicle", "selfdriving",
                  "driverless", "autonomous")
text_corpus <- tm_map(text_corpus, removeWords, domain_words)
# Standard tm clean-up: stop words, punctuation, digits (order matters —
# stop words are matched before punctuation is stripped).
text_corpus <- tm_map(text_corpus, removeWords, stopwords("english"))
text_corpus <- tm_map(text_corpus, removePunctuation)
text_corpus <- tm_map(text_corpus, removeNumbers)
# Term frequencies: term-document matrix -> per-term row sums, descending.
dtm <- TermDocumentMatrix(text_corpus)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)
head(d, 10)
# Visualise as a word cloud; the seed makes the layout reproducible.
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 10, max.words = 200,
          random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# Convert the cleansed corpus back into a data frame for sentiment analysis.
sd_cars_tweets <- data.frame(text_clean = get("content", text_corpus),
                             stringsAsFactors = FALSE)
#### 6. Sentiment analysis of tweets ####
## Convert tweets to ASCII to get rid of strange characters
sd_cars_tweets$text_clean <- iconv(sd_cars_tweets$text_clean, from = "UTF-8", to = "ASCII", sub = "")
# removing mentions, in case needed
sd_cars_tweets$text_clean <- gsub("@\\w+", "", sd_cars_tweets$text_clean)
# NRC emotion lexicon: one column per emotion/polarity, one row per tweet;
# column sums give the total score for each sentiment across all tweets.
tweet_sentiment <- get_nrc_sentiment(sd_cars_tweets$text_clean)
sentimentscores <- data.frame(colSums(tweet_sentiment))
names(sentimentscores) <- "Score"
sentimentscores <- cbind("sentiment" = rownames(sentimentscores), sentimentscores)
rownames(sentimentscores) <- NULL
# BUG FIX: the original applied theme_minimal() LAST; complete themes
# replace the whole theme and silently discarded the earlier
# theme(legend.position = "none") tweak. Complete theme first, tweaks after.
ggplot2::ggplot(data = sentimentscores) +
  geom_bar(mapping = aes(x = sentiment, y = Score), stat = "identity") +
  theme_minimal() +
  theme(legend.position = "none") +
  xlab("Sentiments") + ylab("Scores") +
  ggtitle("Total sentiment based on scores")
## Only look at negative sentiment ##
sd_cars_sentiments <- analyzeSentiment(sd_cars_tweets$text_clean)
## Keep the four dictionary polarity scores plus the word count.
sd_cars_sentiments_subset <- dplyr::select(
  sd_cars_sentiments,
  SentimentGI, SentimentHE, SentimentLM, SentimentQDAP, WordCount)
# Average the four dictionary scores (every column except WordCount,
# which sits in column 5) into a single sentiment value per tweet.
sd_cars_sentiments_subset$mean_sentiment <-
  rowMeans(sd_cars_sentiments_subset[, -5])
# Drop the individual dictionary scores; keep word count and the mean.
sd_cars_sentiments_subset <-
  sd_cars_sentiments_subset[, c("WordCount", "mean_sentiment")]
# Attach the sentiment columns to the cleaned tweet text.
sd_cars_df <- cbind.data.frame(sd_cars_tweets, sd_cars_sentiments_subset)
# only keep tweets whose mean sentiment is negative
sd_cars_df_negative <- filter(sd_cars_df, mean_sentiment < 0)
nrow(sd_cars_df_negative)
# Topic analysis of the negative-sentiment tweets.
sd_cars_tokenized_list <- tokens(sd_cars_df_negative$text_clean)
# Turn the token list into a quanteda document-feature matrix; column sums
# give the usage count of every word across the negative tweets.
sd_cars_dfm <- dfm(sd_cars_tokenized_list)
word_sums <- colSums(sd_cars_dfm)
length(word_sums)  # number of distinct words
# Most frequent words, sorted by descending frequency.
freq_negative_words <- data.frame(
  word = names(word_sums),
  freq = word_sums,
  row.names = NULL,
  stringsAsFactors = FALSE)
sorted_freq_neg <-
  freq_negative_words[order(freq_negative_words$freq, decreasing = TRUE), ]
# Strip characters that do not survive the latin1 -> ASCII conversion.
sd_cars_df_negative$text_clean <- sapply(
  sd_cars_df_negative$text_clean,
  function(row) iconv(row, "latin1", "ASCII", sub = ""))
# Document-term matrix for clustering; prune terms that appear in fewer
# than 2% of documents.
neg_corpus_tm <- Corpus(VectorSource(sd_cars_df_negative$text_clean))
neg_tm <- DocumentTermMatrix(neg_corpus_tm)
neg_tm <- removeSparseTerms(neg_tm, 0.98)
neg_df_cluster <- as.data.frame(as.matrix(neg_tm))
# Create clusters using exploratory graph analysis (EGAnet).
ega_neg_sd_cars <- EGA(neg_df_cluster)
View(as.data.frame(ega_neg_sd_cars$dim.variables))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment