Skip to content

Instantly share code, notes, and snippets.

View wandabwa2004's full-sized avatar

Herman Wandabwa wandabwa2004

View GitHub Profile
# Remove the four listed columns from the frame in a single in-place call.
df.drop(
    columns=['hotel name', 'review title', 'helpful vote', 'user name'],
    inplace=True,
)
# Divide every rating by ten.
df['rating'] = df['rating'] / 10
# Keep only the rows whose review_body is not null.
df = df.loc[~df['review_body'].isnull()]
# A little bit of data clean up to get commas and ASCII characters out
def preprocess(review_body):
review_body = review_body.str.replace("(<br/>)", "")
# Print a random handful (at most five) of the reviews whose polarity is exactly 1.
print('5 random reviews with the highest positive sentiment polarity: \n')
cl = df.loc[df.polarity == 1, ['review_body']]
# sample() raises ValueError when asked for more rows than are available,
# so cap the sample size at the number of matching reviews.
cl = cl.sample(min(5, len(cl))).values
for c in cl:
    # Each row of the array holds a single value: the review text.
    print(c[0])
# Histogram of the polarity column, grouped into 50 bins.
df['polarity'].iplot(
    kind='hist', bins=50, linecolor='black',
    title='Sentiment Polarity Distribution',
    xTitle='polarity', yTitle='count',
)
# Tepapa seems to be doing well: the majority of the reviews are quite
# positive, i.e. their polarity is above 0.
# Histogram of the review_len column (review length in characters, per the axis title).
df['review_len'].iplot(
    kind='hist', linecolor='black',
    title='Review Text Lengths Distribution',
    xTitle='Review Length - Number of Characters', yTitle='count',
)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score every review body with VADER; each call yields one mapping of scores.
analyzer = SentimentIntensityAnalyzer()
sentiment_vader = df['review_body'].apply(lambda x: analyzer.polarity_scores(x))
# Expand the per-review score mappings into columns and append them to df.
# axis must be passed by keyword: pandas 2.x no longer accepts it positionally.
df = pd.concat([df, sentiment_vader.apply(pd.Series)], axis=1)
df.describe()
# Index the frame by review date and put the rows in index order;
# the time-based rolling window below is computed over this index.
df.index = pd.DatetimeIndex(df["review date"])
df = df.sort_index()

# Cumulative (expanding) mean of the compound score.
df['mean'] = df['compound'].expanding().mean()
# Mean of the compound score over a trailing 24-hour window.
df['rolling'] = df['compound'].rolling(window='24h').mean()
import matplotlib.pyplot as plt

# Scatter each review's compound score against its review date on a 20x5 figure.
fig = plt.figure(figsize=(20, 5))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(x=df['review date'], y=df['compound'], label='Tweet Sentiment')
# Take a reproducible (fixed seed) 10% random sample of the rows, in index order.
ot = df.sample(frac=0.1, random_state=1111)
ot = ot.sort_index()

# Recompute both smoothed series on the sample: cumulative mean and
# trailing 12-hour mean of the compound score.
ot['mean'] = ot['compound'].expanding().mean()
ot['rolling'] = ot['compound'].rolling(window='12h').mean()

# Plot the sampled scores with both trend lines on a 20x5 figure.
fig = plt.figure(figsize=(20, 5))
ax = fig.add_subplot(1, 1, 1)
ax.scatter(x=ot['review date'], y=ot['compound'], label='Tweet Sentiment')
ax.plot(ot['review date'], ot['rolling'], label='Rolling Mean', color='g')
ax.plot(ot['review date'], ot['mean'], label='Expanding Mean', color='r')
library(dplyr)
library(tm)
library(readr)
library(lubridate)
library(ggplot2)
library(tidytext)
library(tidyverse)
library(stringr)
library(tidyr)
library(scales)
# Extra words removed on top of the standard English stop words.
wordToRemove <- c('the','mister','honourable','also','will','speaker') ## Found to be repetitive with no semantic sense.

# tolower is a plain base-R function rather than a tm transformation, so it is
# wrapped in content_transformer(); this keeps every document a valid text
# document and makes a later PlainTextDocument re-wrapping step unnecessary.
docs <- tm_map(docs, content_transformer(tolower)) # Lower-case all words
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeWords, stopwords("english")) # tm stop-word list, e.g. "is", "the"
docs <- tm_map(docs, removeWords, wordToRemove)
docs <- tm_map(docs, stripWhitespace)

# Convert the cleaned corpus to a document-term matrix for easier computations.
dtm <- DocumentTermMatrix(docs)
dim(dtm)
# Word frequency and topic models
# Total count of each term: column sums of the dense document-term matrix.
freq <- colSums(as.matrix(dtm))
# Term positions ordered from most to least frequent.
ord <- order(-freq)
# The six most frequent terms, then the six least frequent ones.
head(freq[ord])
tail(freq[ord])