# Author: Kiprono Elijah Koech (kipronokoech)

## review_class.py
#Import necessary packages
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score #f1 score to use it as and evaluation metric
import ast #to convert string into dictionary
from IPython.display import clear_output
from sklearn import svm #support vector machine classifier

## load.py
# Most of the cleaning was done during the data web scraping.
# Find the notebook here:
# https://github.com/kipronokoech/Reviews-Classification/blob/master/data_collection.ipynb
#
# Load the scraped reviews. Each non-blank line of the file is parsed as a
# Python dict literal from which the keys 'reviewBody' and 'stars' are read.
reviews = []
with open("./data/reviews.txt") as fp:
    for line in fp:
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # ast.literal_eval would raise SyntaxError on them.
        if not line.strip():
            continue
        # The line is a string on loading, so convert it to a dictionary.
        # literal_eval only accepts Python literals; it does not run code.
        review = ast.literal_eval(line)
        # Build a Review from the text and star rating and collect it.
        # NOTE(review): Review is not defined in the visible part of this file;
        # presumably it comes from review_class.py -- confirm it is in scope.
        reviews.append(Review(review['reviewBody'], review['stars']))

## split_data.py
# Hold out 30% of the reviews for testing and train on the remaining 70%.
# random_state is fixed so the same split is produced on every run.
training, test = train_test_split(reviews, test_size=0.30, random_state=42)

# Define the independent variable (X: the review text) and the target
# (y: the sentiment attribute) for each split.
train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]
test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

print("Size of train set: ",len(training))
# This line used to say "train set" as well, mislabelling the test-set size.
print("Size of test set: ",len(test))

## vectorize2.py
# Convert the raw review text into TF-IDF feature vectors. The vectorizer is
# fitted on the training text only; the test text is transformed with that
# already-fitted vectorizer.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

## svm.py
# Train a support vector machine classifier with a linear kernel.
# fit() returns the fitted estimator, so construction and training chain.
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, train_y)

# Spot-check the classifier on one randomly chosen test review.
# randint(n) draws an index from 0 (inclusive) to n (exclusive).
i = np.random.randint(len(test_x))
print("Review Message: ",test_x[i])
print("Actual: ",test_y[i])
print("Prediction: ",clf_svm.predict(test_x_vectors[i]))

## load_data.py
# Dependencies for the chat-message analysis that follows.
import pandas as pd
import datetime
# Same pyplot module that is already imported as plt near the top of the file.
from matplotlib import pyplot as plt

# Read the message log into a DataFrame. The snippets below read the columns
# Date, Time, Sender and Message from it.
df = pd.read_csv("messages.csv")
# Preview the first rows. The result is not stored, so this only shows output
# in an interactive/notebook session.
df.head()

## active_days.py
# Parse the Date column into pandas datetimes.
df["Date"] = pd.to_datetime(df["Date"])

# Active days: the number of distinct dates present in the data.
print("Active days: ",len(df["Date"].unique()))

# First and last day, read from the first and last rows of the frame.
# NOTE(review): this assumes the rows are in chronological order -- confirm.
dates = df["Date"]
start = dates.iloc[0].date()
print("First Day:", start)
end = dates.iloc[-1].date()
print("Last Day: ",end)

## most.py
# Count the occurrences of each Sender.
print("Total Messages: ", len(df))
# value_counts() yields one row per sender, ordered from most to fewest messages.
messages_count = pd.DataFrame(df["Sender"].value_counts())
print(messages_count)


# Data for the plot: sender names and their message counts.
sender = list(messages_count.index)
# Select the single counts column by position rather than by name: pandas < 2.0
# names it "Sender" but pandas >= 2.0 names it "count", so the previous
# `messages_count.Sender` raises AttributeError on pandas >= 2.0.
data = messages_count.iloc[:, 0]
# NOTE(review): the figure is created here but nothing is drawn on it in this
# snippet; the plotting call appears to be missing.
plt.figure(figsize=(6,6))

## morning_and_night.py
# Morning messaging: count, per sender, how often they sent the first message
# of a day. Sorting by Date then Time (ascending) puts each day's earliest row
# first, and drop_duplicates keeps only that first row per Date.
first = (
    df.sort_values(["Date", "Time"])
    .drop_duplicates(subset="Date")["Sender"]
    .value_counts()
)
print(first)
plt.figure(figsize=(6, 6))
plt.title("Morning Messages Count")
plt.bar(first.index, first)
plt.show()

# Late-night messaging: count, per sender, how often they sent the last message
# of a day. Sorting by Time descending puts later times first, so the row kept
# for each Date is the one with that day's largest Time value.
# NOTE(review): relies on Time values sorting chronologically as stored -- confirm.
last = (
    df.sort_values("Time", ascending=False)
    .drop_duplicates(subset="Date")["Sender"]
    .value_counts()
)

## number_of_messages_a_day.py
# Messages per day: group the rows by Date and count the Message entries,
# keeping the result as a one-column DataFrame indexed by Date.
gk = df.groupby("Date")["Message"].count().to_frame()

# Line plot of the daily message counts over time.
time = list(gk.index)
number_of_messages = list(gk["Message"])
plt.figure(figsize=(12, 10))
plt.axis("on")
# The y-axis is fixed to 0..600, so any day above 600 messages is cut off.
plt.ylim(0, 600)
plt.plot(time, number_of_messages)
	#Import necessary packages
	import numpy as np
	import random
	import matplotlib.pyplot as plt
	from sklearn.model_selection import train_test_split
	from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
	from sklearn.metrics import f1_score #f1 score to use it as and evaluation metric
	import ast #to convert string into dictionary
	from IPython.display import clear_output
	from sklearn import svm #support vector machine classifier
	# Most of the cleaning was done during the data web scraping.
	# Find the notebook here:
	# https://github.com/kipronokoech/Reviews-Classification/blob/master/data_collection.ipynb
	#
	# Load the scraped reviews. Each non-blank line of the file is parsed as a
	# Python dict literal from which the keys 'reviewBody' and 'stars' are read.
	# The nested indentation of the with/for bodies had been lost in this copy
	# and is restored here.
	reviews = []
	with open("./data/reviews.txt") as fp:
		for line in fp:
			# Skip blank lines (e.g. a trailing newline at the end of the file);
			# ast.literal_eval would raise SyntaxError on them.
			if not line.strip():
				continue
			# The line is a string on loading, so convert it to a dictionary.
			# literal_eval only accepts Python literals; it does not run code.
			review = ast.literal_eval(line)
			# Build a Review from the text and star rating and collect it.
			# NOTE(review): Review is not defined in the visible part of this
			# file -- confirm it is in scope before this runs.
			reviews.append(Review(review['reviewBody'], review['stars']))
	# Hold out 30% of the reviews for testing and train on the remaining 70%.
	# random_state is fixed so the same split is produced on every run.
	training, test = train_test_split(reviews, test_size=0.30, random_state=42)

	# Define the independent variable (X: the review text) and the target
	# (y: the sentiment attribute) for each split.
	train_x = [x.text for x in training]
	train_y = [x.sentiment for x in training]
	test_x = [x.text for x in test]
	test_y = [x.sentiment for x in test]

	print("Size of train set: ",len(training))
	# This line used to say "train set" as well, mislabelling the test-set size.
	print("Size of test set: ",len(test))
	# Convert the raw review text into TF-IDF feature vectors. The vectorizer is
	# fitted on the training text only; the test text is transformed with that
	# already-fitted vectorizer.
	vectorizer = TfidfVectorizer()
	train_x_vectors = vectorizer.fit_transform(train_x)
	test_x_vectors = vectorizer.transform(test_x)

	# Train a support vector machine classifier with a linear kernel.
	# fit() returns the fitted estimator, so construction and training chain.
	clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, train_y)

	# Spot-check the classifier on one randomly chosen test review.
	# randint(n) draws an index from 0 (inclusive) to n (exclusive).
	i = np.random.randint(len(test_x))
	print("Review Message: ",test_x[i])
	print("Actual: ",test_y[i])
	print("Prediction: ",clf_svm.predict(test_x_vectors[i]))
	# Dependencies for the chat-message analysis that follows.
	import pandas as pd
	import datetime
	# Same pyplot module that is already imported as plt earlier in the file.
	from matplotlib import pyplot as plt

	# Read the message log into a DataFrame. The statements below read the
	# columns Date, Time, Sender and Message from it.
	df = pd.read_csv("messages.csv")
	# Preview the first rows. The result is not stored, so this only shows
	# output in an interactive/notebook session.
	df.head()
	# Parse the Date column into pandas datetimes.
	df["Date"] = pd.to_datetime(df["Date"])

	# Active days: the number of distinct dates present in the data.
	print("Active days: ",len(df["Date"].unique()))

	# First and last day, read from the first and last rows of the frame.
	# NOTE(review): this assumes the rows are in chronological order -- confirm.
	dates = df["Date"]
	start = dates.iloc[0].date()
	print("First Day:", start)
	end = dates.iloc[-1].date()
	print("Last Day: ",end)
	# Count the occurrences of each Sender.
	print("Total Messages: ", len(df))
	# value_counts() yields one row per sender, ordered from most to fewest messages.
	messages_count = pd.DataFrame(df["Sender"].value_counts())
	print(messages_count)


	# Data for the plot: sender names and their message counts.
	sender = list(messages_count.index)
	# Select the single counts column by position rather than by name: pandas < 2.0
	# names it "Sender" but pandas >= 2.0 names it "count", so the previous
	# `messages_count.Sender` raises AttributeError on pandas >= 2.0.
	data = messages_count.iloc[:, 0]
	# NOTE(review): the figure is created here but nothing is drawn on it by
	# these statements; the plotting call appears to be missing.
	plt.figure(figsize=(6,6))
	# Morning messaging: count, per sender, how often they sent the first message
	# of a day. Sorting by Date then Time (ascending) puts each day's earliest
	# row first, and drop_duplicates keeps only that first row per Date.
	first = (
		df.sort_values(["Date", "Time"])
		.drop_duplicates(subset="Date")["Sender"]
		.value_counts()
	)
	print(first)
	plt.figure(figsize=(6, 6))
	plt.title("Morning Messages Count")
	plt.bar(first.index, first)
	plt.show()

	# Late-night messaging: count, per sender, how often they sent the last
	# message of a day. Sorting by Time descending puts later times first, so the
	# row kept for each Date is the one with that day's largest Time value.
	# NOTE(review): relies on Time values sorting chronologically as stored -- confirm.
	last = (
		df.sort_values("Time", ascending=False)
		.drop_duplicates(subset="Date")["Sender"]
		.value_counts()
	)
	# Messages per day: group the rows by Date and count the Message entries,
	# keeping the result as a one-column DataFrame indexed by Date.
	gk = df.groupby("Date")["Message"].count().to_frame()

	# Line plot of the daily message counts over time.
	time = list(gk.index)
	number_of_messages = list(gk["Message"])
	plt.figure(figsize=(12, 10))
	plt.axis("on")
	# The y-axis is fixed to 0..600, so any day above 600 messages is cut off.
	plt.ylim(0, 600)
	plt.plot(time, number_of_messages)