This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Import necessary packages | |
import numpy as np | |
import random | |
import matplotlib.pyplot as plt | |
from sklearn.model_selection import train_test_split | |
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer | |
from sklearn.metrics import f1_score #f1 score to use it as and evaluation metric | |
import ast #to convert string into dictionary | |
from IPython.display import clear_output | |
from sklearn import svm #support vector machine classifier |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Most of the cleaning was done during the data web scraping.
# Find the notebook here:
# https://github.com/kipronokoech/Reviews-Classification/blob/master/data_collection.ipynb
reviews = []
with open("./data/reviews.txt") as fp:
    for line in fp:
        # Skip blank lines (e.g. a trailing empty line at the end of the
        # file); ast.literal_eval raises SyntaxError on empty input.
        if not line.strip():
            continue
        # Each line is the string form of a dict, so parse it back into one.
        review = ast.literal_eval(line)
        # Build a Review (defined elsewhere) from the body text and the star
        # rating, and collect it.
        reviews.append(Review(review['reviewBody'], review['stars']))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 70% for training and 30% for testing; random_state is fixed so the split
# is the same on every run.
training, test = train_test_split(reviews, test_size=0.30, random_state=42)
# Define the independent (X) and target (y) values for each split.
train_x, train_y = [x.text for x in training], [x.sentiment for x in training]
test_x, test_y = [x.text for x in test], [x.sentiment for x in test]
print("Size of train set: ", len(training))
# Label corrected: this line reports the test split, not the train split.
print("Size of test set: ", len(test))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit the TF-IDF vectorizer on the training text only, then encode both
# splits with that same fitted vectorizer.
vectorizer = TfidfVectorizer().fit(train_x)
train_x_vectors = vectorizer.transform(train_x)
test_x_vectors = vectorizer.transform(test_x)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Train a linear-kernel SVM classifier on the TF-IDF training vectors.
# fit() returns the fitted estimator, so construction and training chain.
clf_svm = svm.SVC(kernel='linear').fit(train_x_vectors, train_y)

# Spot-check the classifier on one randomly chosen test review.
i = np.random.randint(0, len(test_x))
sample_text = test_x[i]
sample_label = test_y[i]
print("Review Message: ", sample_text)
print("Actual: ", sample_label)
print("Prediction: ", clf_svm.predict(test_x_vectors[i]))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import datetime | |
from matplotlib import pyplot as plt | |
# Load the exported messages (one row per message) and preview the first rows.
MESSAGES_CSV = "messages.csv"
df = pd.read_csv(MESSAGES_CSV)
df.head()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parse the Date column into pandas datetimes.
df["Date"] = pd.to_datetime(df["Date"])

# Active days: the number of distinct Date values in the data.
active_days = len(pd.unique(df["Date"]))
print("Active days: ", active_days)

# First and last day, read from the first and last rows of the frame
# (this relies on the rows already being in chronological order).
start = df["Date"].iloc[0].date()
end = df["Date"].iloc[-1].date()
print("First Day:", start)
print("Last Day: ", end)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count the occurrences of each Sender.
print("Total Messages: ", len(df))
messages_count = pd.DataFrame(df["Sender"].value_counts())
print(messages_count)
# Plot inputs: the sender names and their message counts.
sender = list(messages_count.index)
# Select the single counts column by position. Its label is "Sender" on
# pandas < 2.0 but "count" on pandas >= 2.0, so attribute access by name
# (messages_count.Sender) raises AttributeError on newer versions.
data = messages_count.iloc[:, 0]
plt.figure(figsize=(6,6))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Morning messaging: who sends the first message of each day.
# Sort by Date then Time, keep the first row for every Date, count senders.
chronological = df.sort_values(by=["Date", 'Time'], ascending=True)
first_of_day = chronological.drop_duplicates(subset=['Date'])
first = first_of_day["Sender"].value_counts()
print(first)
plt.figure(figsize=(6,6))
plt.title("Morning Messages Count")
plt.bar(first.index, first)
plt.show()

# Late-night messaging: who sends the last message of each day.
# Sorting by Time descending puts each Date's latest row first, so that is
# the row drop_duplicates keeps (assumes Time values sort in clock order).
latest_first = df.sort_values(by=['Time'], ascending=False)
last_of_day = latest_first.drop_duplicates(subset=['Date'])
last = last_of_day["Sender"].value_counts()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Messages per day: group by Date and count the Message entries in each group.
daily_counts = df.groupby("Date")["Message"].count()
gk = pd.DataFrame(daily_counts)

# Plot the daily totals as a line chart with a fixed 0-600 y-axis.
time = list(gk.index)
number_of_messages = list(gk["Message"])
plt.figure(figsize=(12, 10))
plt.axis("on")
plt.ylim(0, 600)
plt.plot(time, number_of_messages)