Skip to content

Instantly share code, notes, and snippets.

View kipronokoech's full-sized avatar
🎯
Focusing

Kiprono Elijah Koech kipronokoech

🎯
Focusing
View GitHub Profile
# import library to use to vectorize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Minimal bag-of-words demo: turn two short reviews into a token-count matrix.
corpus = [
    "Excellent Services by the ABC remit team.Recommend.",
    "Bad Services. Transaction delayed for three days.Don't recommend.",
]
# Learn the vocabulary from the corpus and count each token per document.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# X is a sparse matrix; print(X) lists (row, column) positions with their counts.
#Import necessary packages
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score #f1 score to use it as and evaluation metric
import ast #to convert string into dictionary
from IPython.display import clear_output
from sklearn import svm #support vector machine classifier
# Most of the cleaning was done during the data web scraping
# Find the notebook here
# https://github.com/kipronokoech/Reviews-Classification/blob/master/data_collection.ipynb
# Load the scraped reviews; each non-blank line is parsed as a Python dict literal.
# NOTE(review): `Review` is not defined in this section - it must already be in
# scope and expose the `.text` / `.sentiment` attributes read further down.
reviews = []
with open("./data/reviews.txt") as fp:
    for line in fp:
        # Skip blank lines (e.g. a trailing newline) that literal_eval would reject.
        if not line.strip():
            continue
        # Each line is read as a string, so parse it back into a dictionary.
        review = ast.literal_eval(line)
        # Wrap the review body and star rating in a Review and collect it.
        reviews.append(Review(review['reviewBody'], review['stars']))
# Hold out 30% of the reviews for testing; the fixed random_state keeps the
# split reproducible between runs.
training, test = train_test_split(reviews, test_size=0.30, random_state=42)
# Separate the inputs (review text) from the targets (sentiment label).
train_x = [review.text for review in training]
train_y = [review.sentiment for review in training]
test_x = [review.text for review in test]
test_y = [review.sentiment for review in test]
print("Size of train set: ",len(training))
# This line reports the held-out split, so it is labelled as the test set.
print("Size of test set: ",len(test))
# Convert the raw text into TF-IDF features. The vectorizer is fitted on the
# training text only and then reused, unchanged, to transform the test text.
vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)
# Build a support vector classifier with a linear kernel and fit it on the
# training features and labels.
clf_svm = svm.SVC(kernel="linear")
clf_svm.fit(train_x_vectors, train_y)
# Spot-check the classifier: pick one test review at random and compare the
# true label with the label the SVM predicts for it.
i = np.random.randint(len(test_x))
print("Review Message: ",test_x[i])
print("Actual: ",test_y[i])
# test_x_vectors[i] selects the feature row that corresponds to test_x[i].
print("Prediction: ",clf_svm.predict(test_x_vectors[i]))
import pandas as pd
data = []
with open("whatsapp.txt","r") as fp:
for index,line in enumerate(fp):
try:
send_time, message_section = line.strip().split("-")
sender, message = message_section.split(":",maxsplit=1)
date,time = send_time.split(",")
r = {
"Date": date,
import pandas as pd
import datetime
from matplotlib import pyplot as plt
# Load the exported chat messages and parse the Date column into datetimes.
df = pd.read_csv("messages.csv")
df.head()  # preview only; the result is displayed when run as a notebook cell
df["Date"] = pd.to_datetime(df["Date"])
# Number of distinct values in the Date column.
print("Active days: ",len(df["Date"].unique()))
# Dates taken from the first and last rows.
# NOTE(review): assumes the rows are in chronological order - confirm.
start = df["Date"].iloc[0].date()
print("First Day:", start)
end = df["Date"].iloc[-1].date()
print("Last Day: ",end)
# Count the occurrences of each Sender.
print("Total Messages: ", len(df))
# Name the counts column explicitly: pandas >= 2.0 names the value_counts()
# result "count" instead of "Sender", which would break the lookup below.
messages_count = df["Sender"].value_counts().to_frame(name="Sender")
print(messages_count)
# Plot inputs: the sender names and their message counts.
sender = list(messages_count.index)
data = messages_count["Sender"]
plt.figure(figsize=(6,6))