##### Twitter Analysis ####
#### 1. load packages ####
Needed <- c("rtweet", "twitteR", "SentimentAnalysis", "quanteda", "tm", "EGAnet",
            "tidytext", "wordcloud", "dplyr", "tidyr", "lubridate", "ggplot2",
            "devtools", "forestmangr", "syuzhet")
install.packages(Needed, dependencies = TRUE)
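# alternatively, install only the packages that are missing, so repeated runs of
# this script do not reinstall everything (a minimal sketch):
# new_pkgs <- Needed[!(Needed %in% installed.packages()[, "Package"])]
# if (length(new_pkgs) > 0) install.packages(new_pkgs, dependencies = TRUE)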
library(rtweet)
library(twitteR)
library(dplyr)
library(tidyr)
library(tidytext)
library(lubridate)
library(SentimentAnalysis)
library(quanteda)
library(ggplot2)
library(tm)
library(devtools)
library(EGAnet)
library(forestmangr)#round_df function
library(wordcloud)
library(syuzhet)
#### 2. Set parameters for twitter app access ####
## store API keys (placeholders shown here; substitute your own credentials and
## never commit real keys to a public gist)
api_key <- "YOUR_API_KEY"
api_secret_key <- "YOUR_API_SECRET_KEY"
access_token <- "YOUR_ACCESS_TOKEN"
access_token_secret <- "YOUR_ACCESS_TOKEN_SECRET"
## 2.1 authenticate via web browser
token <- create_token(
  app = "rexploretweets",
  consumer_key = api_key,
  consumer_secret = api_secret_key,
  access_token = access_token,
  access_secret = access_token_secret)
# authenticate the twitteR package as well, since both rtweet and twitteR are loaded
setup_twitter_oauth(api_key, api_secret_key, access_token, access_token_secret)
## 2.2 Check to see if token is loaded
get_token()
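# optionally check the remaining API quota before searching (a minimal sketch;
# rate_limit() with no arguments lists the limits for every endpoint)
rate_limit()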
#### 3. Obtain tweets for a specific topic ####
sd_cars <- rtweet::search_tweets(
  q = "#selfdrivingcars",
  n = 5000,
  include_rts = FALSE) # exclude retweets
sd_cars # view the resulting data frame; the Twitter REST API limits search to roughly the past 6-9 days
#### 4. Transform tweets data ####
## 4.1 Remove retweets ##
sd_cars_organic <- sd_cars[sd_cars$is_retweet==FALSE,]
## 4.2 Remove replies
sd_cars_organic <- subset(sd_cars_organic,is.na(sd_cars_organic$reply_to_status_id))
sd_cars_df <- as.data.frame(sd_cars_organic)
saveRDS(sd_cars_df,file="sd_cars_tweet.rds")
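# the saved data frame can be reloaded later without re-querying the API, e.g.:
# sd_cars_df <- readRDS("sd_cars_tweet.rds")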
## 4.3 keeping only the retweets and replies
sd_cars_retweets<-sd_cars[sd_cars$is_retweet==TRUE,]
sd_cars_replies<-subset(sd_cars,!is.na(sd_cars$reply_to_status_id))
# 4.4 creating a data frame with the number of rows for each type of tweet
tweet_type_df <- data.frame(
  category = c("Organic", "Retweets", "Replies"),
  count = c(nrow(sd_cars_organic), nrow(sd_cars_retweets), nrow(sd_cars_replies)))
## 4.5 Setting up tweet_type_df for visualisation
tweet_type_df$fraction = tweet_type_df$count/sum(tweet_type_df$count)
tweet_type_df$percentage = tweet_type_df$count/sum(tweet_type_df$count)*100
tweet_type_df$ymax = cumsum(tweet_type_df$fraction)
tweet_type_df$ymin = c(0,head(tweet_type_df$ymax,n=-1))
# 4.6 Rounding the tweet_type_df values to two decimal places
tweet_type_df<-round_df(tweet_type_df,2)
# 4.7 Specify what the legend should say
Type_of_tweet <- paste(tweet_type_df$category, tweet_type_df$percentage, "%")
ggplot(tweet_type_df, aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = Type_of_tweet)) +
  geom_rect() +
  coord_polar(theta = "y") +
  xlim(c(2, 4)) +
  theme_void() +
  theme(legend.position = "right")
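# optionally save the donut chart to disk (a minimal sketch; the filename and
# dimensions are arbitrary, and ggsave() writes the most recent plot by default)
ggsave("tweet_types_donut.png", width = 7, height = 5)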
#### 5. Visualise tweets ####
ts_plot(sd_cars_organic, by = "3 hours") +
  theme(plot.title = element_text(face = "bold")) +
  labs(x = NULL, y = NULL,
       title = "Frequency of self-driving car tweets from the past 9 days",
       subtitle = "Twitter status (tweet) counts aggregated using three-hour intervals",
       caption = "\nSource: data collected from Twitter's REST API via rtweet")
## 5.1 Who is tweeting ##
users <- search_users("#selfdrivingcars", n = 500)
users_df <- as.data.frame(users)
saveRDS(users_df,file="users_tweet.rds")
#just view the first 10 users
head(users, n=10)
## 5.2 how many locations are represented
length(unique(users$location))
users %>%
  count(location, sort = TRUE) %>%
  mutate(location = reorder(location, n)) %>%
  na.omit() %>%
  top_n(10) %>%
  ggplot(aes(x = location, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Location",
       y = "Count",
       title = "Twitter users - unique locations - top 10 by usage")
## 5.3 most frequent words ##
sd_cars_organic$text <- gsub("https\\s*","",sd_cars_organic$text)
sd_cars_organic$text <- gsub("@\\s*","",sd_cars_organic$text)
sd_cars_organic$text <-gsub("amp","",sd_cars_organic$text)
sd_cars_organic$text <-gsub("[[:punct:]]","",sd_cars_organic$text)
sd_cars_organic$text <-gsub("@[[:alpha:]]*","",sd_cars_organic$text)
#draw out only the text column because we are interested in cleaning it with tm
text_corpus<-Corpus(VectorSource(sd_cars_organic$text))
# remove topic words (selfdriving, car, autonomous vehicle, etc.) as they are
# trivially correlated with the #selfdrivingcars search term; rt = retweet, re = reply
text_corpus <- tm_map(text_corpus, removeWords,
                      c("selfdrivingcars", "driverlesscars", "autonomousvehicles",
                        "cars", "car", "rt", "re", "vehicle", "selfdriving",
                        "driverless", "autonomous"))
#remove stop words
text_corpus<-tm_map(text_corpus,removeWords,stopwords("english"))
#remove punctuation
text_corpus<-tm_map(text_corpus,removePunctuation)
text_corpus<-tm_map(text_corpus,removeNumbers)
#find most frequent words
dtm<-TermDocumentMatrix(text_corpus)
m <-as.matrix(dtm)
v <-sort(rowSums(m),decreasing=TRUE)
d<-data.frame(word=names(v),freq=v)
head(d,10)
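# a quick base-R alternative view of the same counts (a minimal sketch using the
# top 10 rows of d computed above)
barplot(d$freq[1:10], names.arg = d$word[1:10], las = 2,
        main = "Top 10 most frequent words")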
#visualize
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 10, max.words = 200,
          random.order = FALSE, rot.per = 0.35, colors = brewer.pal(8, "Dark2"))
# convert the cleaned corpus back into a data frame
sd_cars_tweets <- data.frame(text_clean = sapply(text_corpus, as.character),
                             stringsAsFactors = FALSE)
#### 6. Sentiment analysis of tweets ####
## convert tweets to ASCII to strip non-standard characters
sd_cars_tweets$text_clean<-iconv(sd_cars_tweets$text_clean,from="UTF-8",to="ASCII",sub="")
#removing mentions, in case needed
sd_cars_tweets$text_clean<-gsub("@\\w+","",sd_cars_tweets$text_clean)
tweet_sentiment<-get_nrc_sentiment((sd_cars_tweets$text_clean))
sentimentscores<-data.frame(colSums(tweet_sentiment[,]))
names(sentimentscores)<-"Score"
sentimentscores<-cbind("sentiment"=rownames(sentimentscores),sentimentscores)
rownames(sentimentscores)<-NULL
ggplot2::ggplot(data = sentimentscores) +
  geom_bar(mapping = aes(x = sentiment, y = Score), stat = "identity") +
  xlab("Sentiments") + ylab("Scores") +
  ggtitle("Total sentiment based on scores") +
  theme_minimal() + # complete themes reset legend.position, so set it afterwards
  theme(legend.position = "none")
## Only look at negative sentiment ##
sd_cars_sentiments<-analyzeSentiment(sd_cars_tweets$text_clean)
##select subset of measures
sd_cars_sentiments_subset <- dplyr::select(sd_cars_sentiments,
                                           SentimentGI, SentimentHE,
                                           SentimentLM, SentimentQDAP,
                                           WordCount)
# average the four dictionary scores so each tweet gets a single sentiment value
sd_cars_sentiments_subset <- dplyr::mutate(sd_cars_sentiments_subset,
                                           mean_sentiment = rowMeans(sd_cars_sentiments_subset[, -5]))
# remove the individual sentiment measures, keeping word count and the mean
sd_cars_sentiments_subset <- dplyr::select(sd_cars_sentiments_subset,
                                           WordCount,
                                           mean_sentiment)
sd_cars_df<-cbind.data.frame(sd_cars_tweets,sd_cars_sentiments_subset)
#only keep negative sentiments
sd_cars_df_negative<-filter(sd_cars_df,mean_sentiment<0)
nrow(sd_cars_df_negative)
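# share of tweets whose mean sentiment is negative (uses the sd_cars_df built above)
nrow(sd_cars_df_negative) / nrow(sd_cars_df)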
#topic analysis of negative sentiments
sd_cars_tokenized_list <- tokens(sd_cars_df_negative$text_clean)
# turn the tokens object into a document-feature matrix with the dfm command
sd_cars_dfm <- dfm(sd_cars_tokenized_list)
# use colSums to get the usage count of every word, assigned to the vector "word_sums"
word_sums <- colSums(sd_cars_dfm)
length(word_sums)#number of words
#which are the most frequent words
freq_negative_words <- data.frame(word = names(word_sums),
                                  freq = word_sums,
                                  row.names = NULL,
                                  stringsAsFactors = FALSE)
sorted_freq_neg<-freq_negative_words[order(freq_negative_words$freq,decreasing=TRUE),]
#remove odd characters
sd_cars_df_negative$text_clean <- sapply(sd_cars_df_negative$text_clean,
                                         function(row) iconv(row, "latin1", "ASCII", sub = ""))
neg_corpus_tm<-Corpus(VectorSource(sd_cars_df_negative$text_clean))
neg_tm<-DocumentTermMatrix(neg_corpus_tm)
neg_tm<-removeSparseTerms(neg_tm,0.98)
neg_df_cluster<-as.data.frame(as.matrix(neg_tm))
#Create clusters using exploratory graph analysis
ega_neg_sd_cars<-EGA(neg_df_cluster)
View(as.data.frame(ega_neg_sd_cars$dim.variables))
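# the EGA object can also be inspected directly: n.dim reports the number of
# estimated dimensions (topics) and plot() draws the network (a minimal sketch,
# assuming the default EGA output structure)
ega_neg_sd_cars$n.dim
plot(ega_neg_sd_cars)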