Herman Wandabwa wandabwa2004

## cleanupfix.py
# Remove the columns that the rest of the analysis does not use.
for unused_column in ('hotel name', 'review title', 'helpful vote', 'user name'):
    df.drop(unused_column, axis=1, inplace=True)

# Divide the stored rating by ten.
# NOTE(review): presumably ratings are stored as 10-50 and this yields 1-5 - confirm.
df['rating'] = df['rating'] / 10

# Keep only the rows that actually carry review text.
has_review_text = df['review_body'].notnull()
df = df[has_review_text]

# A little bit of data clean up to get commas and ASCII characters out
def preprocess(review_body):
    """Remove HTML ``<br/>`` line-break tags from review texts.

    Args:
        review_body: pandas Series of strings (it is accessed through the
            ``.str`` accessor).

    Returns:
        A new Series in which every ``<br/>`` occurrence has been removed.
        The input Series is left unchanged.
    """
    # Match the tag literally: since pandas 2.0 ``str.replace`` no longer
    # treats the pattern as a regex by default, so being explicit keeps the
    # behaviour identical across pandas versions.
    return review_body.str.replace("<br/>", "", regex=False)

## random_positive.py
# Show the text of five randomly chosen reviews whose polarity is exactly 1.
print('5 random reviews with the highest positive sentiment polarity: \n')
most_positive = df.polarity == 1
cl = df.loc[most_positive, ['review_body']].sample(5).values
for (review_text,) in cl:  # each row holds the single 'review_body' value
    print(review_text)

## sentiment_polarity.py
# Histogram (50 bins) of the per-review polarity scores.
# NOTE(review): .iplot is presumably provided by cufflinks/plotly - confirm.
polarity_hist_options = dict(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution',
)
df['polarity'].iplot(**polarity_hist_options)
## Te Papa must be doing well, as the majority of the reviews are quite positive, i.e. their polarity is above 0

## review_lengths.py
# Histogram of review lengths, measured in characters per the axis title.
# NOTE(review): .iplot is presumably provided by cufflinks/plotly - confirm.
review_len_hist_options = dict(
    kind='hist',
    xTitle='Review Length - Number of Characters',
    linecolor='black',
    yTitle='count',
    title='Review Text Lengths Distribution',
)
df['review_len'].iplot(**review_len_hist_options)

## vader_sentiments.py
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Score every review text; each result is a dict of VADER scores
# (the 'compound' entry is the one used further down in the analysis).
sentiment_vader = df['review_body'].apply(analyzer.polarity_scores)
# Expand the score dicts into columns and attach them to the reviews.
# `axis` must be passed by keyword: pandas 2.0 removed the positional form.
df = pd.concat([df, sentiment_vader.apply(pd.Series)], axis=1)

df.describe()

## means.py
# Index the reviews by their date so that time-based windows can be used.
df.index = pd.DatetimeIndex(df["review date"])
# Time-offset rolling windows require a monotonic datetime index.
df = df.sort_index()
# Running average of the compound score over all reviews seen so far.
df['mean'] = df['compound'].expanding().mean()
# Average compound score over a trailing 24-hour window.
df['rolling'] = df['compound'].rolling('24h').mean()

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(20,5))
ax = fig.add_subplot(111)
# The data are reviews, not tweets, so label the series accordingly.
ax.scatter(df['review date'], df['compound'], label='Review Sentiment')
# Without a legend the label above is never rendered.
ax.legend(loc='best')

## sample_for_graph.py
# Plot a reproducible 10% random sample of the reviews
# (presumably to keep the chart uncluttered - confirm).
ot = df.sample(frac=.1, random_state=1111)
# Restore chronological order; the time-offset rolling window needs it.
ot.sort_index(inplace=True)
ot['mean'] = ot['compound'].expanding().mean()
ot['rolling'] = ot['compound'].rolling('12h').mean()

fig = plt.figure(figsize=(20,5))
ax = fig.add_subplot(111)
# The data are reviews, not tweets, so label the series accordingly.
ax.scatter(ot['review date'], ot['compound'], label='Review Sentiment')
ax.plot(ot['review date'], ot['rolling'], color='g', label='Rolling Mean')
ax.plot(ot['review date'], ot['mean'], color='r', label='Expanding Mean')
# Render the labels given above; without this call they are never shown.
ax.legend(loc='best')

## libraries_.R
library(dplyr)
library(tm)
library(readr)
library(lubridate)
library(ggplot2)
library(tidytext)
library(tidyverse)
library(stringr)
library(tidyr)
library(scales)

## dtm_conversion.R
# Clean the corpus `docs` and convert it to a document-term matrix.
wordToRemove <- c('the','mister','honourable','also','will','speaker') ##Found to be repetitive with no semantic sense.
# tolower() is a plain character function; wrapping it in content_transformer()
# keeps every document a proper TextDocument. Applying it bare strips the
# document class and breaks DocumentTermMatrix() on current tm versions.
docs <- tm_map(docs, content_transformer(tolower)) #Lower case all words
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeWords, stopwords("english")) #TM function to remove stop words e.g. "is","the" etc
docs <- tm_map(docs, removeWords, wordToRemove)
docs <- tm_map(docs, stripWhitespace)
# The former tm_map(docs, PlainTextDocument) step only repaired the damage done
# by the bare tolower call; it is no longer needed and would discard document
# ids and metadata.
dtm <- DocumentTermMatrix(docs) #Convert text to term matrix format for easier computations
dim(dtm)

## frequency_.R
#Word frequency and topic models
# Total number of occurrences of every term, summed over all documents.
freq <- colSums(as.matrix(dtm))
# Term positions ordered from the highest count to the lowest.
ord <- order(-freq)
freq[head(ord, n = 6L)]  # the six most frequent terms
freq[tail(ord, n = 6L)]  # the six least frequent terms
	# Remove the columns that the rest of the analysis does not use.
	for unused_column in ('hotel name', 'review title', 'helpful vote', 'user name'):
	    df.drop(unused_column, axis=1, inplace=True)

	# Divide the stored rating by ten.
	# NOTE(review): presumably ratings are stored as 10-50 and this yields 1-5 - confirm.
	df['rating'] = df['rating'] / 10

	# Keep only the rows that actually carry review text.
	has_review_text = df['review_body'].notnull()
	df = df[has_review_text]

	# A little bit of data clean up to get commas and ASCII characters out
	def preprocess(review_body):
	    """Remove HTML ``<br/>`` line-break tags from review texts.

	    Args:
	        review_body: pandas Series of strings (it is accessed through the
	            ``.str`` accessor).

	    Returns:
	        A new Series in which every ``<br/>`` occurrence has been removed.
	        The input Series is left unchanged.
	    """
	    # Match the tag literally: since pandas 2.0 ``str.replace`` no longer
	    # treats the pattern as a regex by default, so being explicit keeps the
	    # behaviour identical across pandas versions.
	    return review_body.str.replace("<br/>", "", regex=False)
	# Show the text of five randomly chosen reviews whose polarity is exactly 1.
	print('5 random reviews with the highest positive sentiment polarity: \n')
	most_positive = df.polarity == 1
	cl = df.loc[most_positive, ['review_body']].sample(5).values
	for (review_text,) in cl:  # each row holds the single 'review_body' value
	    print(review_text)
	# Histogram (50 bins) of the per-review polarity scores.
	# NOTE(review): .iplot is presumably provided by cufflinks/plotly - confirm.
	polarity_hist_options = dict(
	    kind='hist',
	    bins=50,
	    xTitle='polarity',
	    linecolor='black',
	    yTitle='count',
	    title='Sentiment Polarity Distribution',
	)
	df['polarity'].iplot(**polarity_hist_options)
	## Te Papa must be doing well, as the majority of the reviews are quite positive, i.e. their polarity is above 0
	# Histogram of review lengths, measured in characters per the axis title.
	# NOTE(review): .iplot is presumably provided by cufflinks/plotly - confirm.
	review_len_hist_options = dict(
	    kind='hist',
	    xTitle='Review Length - Number of Characters',
	    linecolor='black',
	    yTitle='count',
	    title='Review Text Lengths Distribution',
	)
	df['review_len'].iplot(**review_len_hist_options)
	from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
	analyzer = SentimentIntensityAnalyzer()

	# Score every review text; each result is a dict of VADER scores
	# (the 'compound' entry is the one used further down in the analysis).
	sentiment_vader = df['review_body'].apply(analyzer.polarity_scores)
	# Expand the score dicts into columns and attach them to the reviews.
	# `axis` must be passed by keyword: pandas 2.0 removed the positional form.
	df = pd.concat([df, sentiment_vader.apply(pd.Series)], axis=1)

	df.describe()
	# Index the reviews by their date so that time-based windows can be used.
	df.index = pd.DatetimeIndex(df["review date"])
	# Time-offset rolling windows require a monotonic datetime index.
	df = df.sort_index()
	# Running average of the compound score over all reviews seen so far.
	df['mean'] = df['compound'].expanding().mean()
	# Average compound score over a trailing 24-hour window.
	df['rolling'] = df['compound'].rolling('24h').mean()

	import matplotlib.pyplot as plt

	fig = plt.figure(figsize=(20,5))
	ax = fig.add_subplot(111)
	# The data are reviews, not tweets, so label the series accordingly.
	ax.scatter(df['review date'], df['compound'], label='Review Sentiment')
	# Without a legend the label above is never rendered.
	ax.legend(loc='best')
	# Plot a reproducible 10% random sample of the reviews
	# (presumably to keep the chart uncluttered - confirm).
	ot = df.sample(frac=.1, random_state=1111)
	# Restore chronological order; the time-offset rolling window needs it.
	ot.sort_index(inplace=True)
	ot['mean'] = ot['compound'].expanding().mean()
	ot['rolling'] = ot['compound'].rolling('12h').mean()

	fig = plt.figure(figsize=(20,5))
	ax = fig.add_subplot(111)
	# The data are reviews, not tweets, so label the series accordingly.
	ax.scatter(ot['review date'], ot['compound'], label='Review Sentiment')
	ax.plot(ot['review date'], ot['rolling'], color='g', label='Rolling Mean')
	ax.plot(ot['review date'], ot['mean'], color='r', label='Expanding Mean')
	# Render the labels given above; without this call they are never shown.
	ax.legend(loc='best')
	library(dplyr)
	library(tm)
	library(readr)
	library(lubridate)
	library(ggplot2)
	library(tidytext)
	library(tidyverse)
	library(stringr)
	library(tidyr)
	library(scales)
	# Clean the corpus `docs` and convert it to a document-term matrix.
	wordToRemove <- c('the','mister','honourable','also','will','speaker') ##Found to be repetitive with no semantic sense.
	# tolower() is a plain character function; wrapping it in content_transformer()
	# keeps every document a proper TextDocument. Applying it bare strips the
	# document class and breaks DocumentTermMatrix() on current tm versions.
	docs <- tm_map(docs, content_transformer(tolower)) #Lower case all words
	docs <- tm_map(docs, removeNumbers)
	docs <- tm_map(docs, removePunctuation)
	docs <- tm_map(docs, removeWords, stopwords("english")) #TM function to remove stop words e.g. "is","the" etc
	docs <- tm_map(docs, removeWords, wordToRemove)
	docs <- tm_map(docs, stripWhitespace)
	# The former tm_map(docs, PlainTextDocument) step only repaired the damage done
	# by the bare tolower call; it is no longer needed and would discard document
	# ids and metadata.
	dtm <- DocumentTermMatrix(docs) #Convert text to term matrix format for easier computations
	dim(dtm)
	#Word frequency and topic models
	# Total number of occurrences of every term, summed over all documents.
	freq <- colSums(as.matrix(dtm))
	# Term positions ordered from the highest count to the lowest.
	ord <- order(-freq)
	freq[head(ord, n = 6L)]  # the six most frequent terms
	freq[tail(ord, n = 6L)]  # the six least frequent terms