SMS Spam Detection ML (Explained)
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 23:27:45 2018
@author: spand
"""
import numpy as np
import pandas as pd
import pickle, string, time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
# Loading the CSV file with latin-1 encoding.
# latin-1 (ISO-8859-1) is a single-byte encoding; this dataset
# is not valid UTF-8, so the encoding must be given explicitly.
# pd.read_csv() returns a DataFrame object.
sms = pd.read_csv("bin/spam.csv", encoding="latin-1")
# The CSV file has five columns:
# v1, v2, Unnamed: 2, Unnamed: 3 and Unnamed: 4.
# v1 is the label column and v2 is the feature column
# (the messages, in our case).
# The remaining columns are not needed, hence dropped.
sms = sms.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
sms = sms.rename(columns={"v1": "label", "v2": "message"})
# Apart from the message itself we'll also add the
# length of the message as a feature. The dataset will
# then carry two features: the message and its length.
# .apply() is a DataFrame/Series method that takes
# another function as its argument and applies it to
# each item.
sms["length"] = sms["message"].apply(len)
# We cannot feed the raw text of the messages to the ML
# algorithm directly, as it only works with numbers.
# So each message must be converted into numeric weights
# that reflect the importance of the words it contains.
# This is where Natural Language Processing comes into play.
# We are using the "nltk" (Natural Language Toolkit) library
# for this purpose.
# First, take a copy of all the messages in a separate Series.
texts = sms["message"].copy()
# Next we remove all punctuation from the messages.
# To do this we'll create a translation "map" that sends every
# punctuation character to None (None => null in Python).
# This is done with str.maketrans and string.punctuation.
punctuationToNone = str.maketrans('', '', string.punctuation)
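# A quick sanity check of the translation table (illustrative only):
# every punctuation character is mapped to None, so the comma and
# the exclamation marks disappear.
assert "Hello, world!!!".translate(punctuationToNone) == "Hello world"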
# Now, for each message, strip the punctuation using
# the .translate() method.
for i in range(len(texts)):
    texts[i] = texts[i].translate(punctuationToNone)
# Next we remove the stopwords from the messages.
# Stopwords are the most common words in a language,
# ones that convey little information on their own,
# like {the, as, is, a, ...}.
# This is done using stopwords from nltk.corpus.
for i in range(len(texts)):
    txt = [word for word in texts[i].split() if word.lower() not in stopwords.words("english")]
    texts[i] = " ".join(txt)
# Now we stem the words in the texts.
# Stemming reduces inflected forms of a word to a common
# stem, so that e.g. "fly" and "flying" are treated as the
# same token. This is done with SnowballStemmer from nltk.
stemmer = SnowballStemmer("english")
for i in range(len(texts)):
    words = map(stemmer.stem, texts[i].split())
    texts[i] = " ".join(words)
# Now we use a TfidfVectorizer to transform the texts into
# feature vectors, a form the ML algorithm can work with.
# Note: the first positional argument of TfidfVectorizer is
# "input", not "stop_words", so the stop-word list must be
# passed by keyword (we already stripped stopwords above;
# this also catches any that slipped through).
vectorizer = TfidfVectorizer(stop_words="english")
# We not only obtain the feature vectors but also fit the
# vectorizer object, so that the same object can be used to
# evaluate texts in the future and produce the same number
# of features as now. This is important because the ML model
# we'll generate will learn to predict with a set number of
# features; if the number of features differs, the model will fail.
features = vectorizer.fit_transform(texts)
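# Illustration (hypothetical message): a fitted vectorizer always
# emits the same number of columns, the size of its learned
# vocabulary, no matter what text it is given.
demo_vec = vectorizer.transform(["free prize claim now"])
assert demo_vec.shape[1] == features.shape[1]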
# Remember we computed the length of each message earlier.
# That is also an important feature and needs to be added.
# To do this we concatenate a column onto "features".
# (.todense() converts the sparse TF-IDF matrix to a dense
# one so it can be stacked with the length column.)
lengths = sms["length"].values
features = np.hstack((features.todense(), lengths[:, None]))
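# Toy shape check of the same trick: a (2, 3) matrix hstacked with a
# (2,) vector reshaped to (2, 1) gives a (2, 4) matrix.
demo_a = np.ones((2, 3))
demo_b = np.array([7, 9])
assert np.hstack((demo_a, demo_b[:, None])).shape == (2, 4)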
# Train/test split: hold out 20% of the data for evaluation.
f_train, f_test, l_train, l_test = train_test_split(
    features, sms['label'],
    test_size=0.2,
    random_state=int(time.time())
)
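# Note: seeding with time.time() gives a different split (and thus a
# slightly different accuracy) on every run; a fixed seed, e.g.
# random_state=42, would make the experiment reproducible.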
# Our list of features is now complete.
# We only need to train the machine to classify messages.
# We'll use a Naive Bayes classifier for this.
model = MultinomialNB(alpha=0.2)
# In Multinomial Naive Bayes, the alpha parameter is what
# is known as a hyperparameter, i.e. a parameter that controls
# the form of the model itself. In most cases, the best way to
# determine optimal values for hyperparameters is through a grid
# search over possible parameter values, using cross-validation
# to evaluate the performance of the model on your data at each value.
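# A sketch of such a grid search, kept as a non-executed string in
# the same style as the demo block at the end of this file (the
# alpha grid below is an assumption, not a tuned choice):
"""
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(
    MultinomialNB(),
    param_grid={"alpha": [0.01, 0.1, 0.2, 0.5, 1.0]},
    scoring="accuracy",
    cv=5
)
grid.fit(f_train, l_train)
print(grid.best_params_)
"""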
# Now we fit the model using the features we have processed
# and the labels that were provided.
model.fit(f_train, l_train)
# Test and evaluate the accuracy score.
predicted = model.predict(f_test)
accuracyScore = accuracy_score(l_test, predicted)
print(accuracyScore)
# The model has been created.
# We now save it so that it can be used in the future
# without having to repeat the training process.
# We'll use pickle to save both the model and the vectorizer:
# the vectorizer will be used to process future texts and
# the model to classify them.
pickle.dump(model, open("model.sav", "wb"))
pickle.dump(vectorizer, open("vectorizer.sav", "wb"))
# pickle serializes the object to a raw binary file.
# It can then be loaded into any Python script and used as
# the same object as in this script.
""" | |
# Let us predict some texts | |
# Loading up the model and the vectorizer using pickle | |
model = pickle.load(open("model.sav", "rb")) | |
vectorizer = pickle.load(open("vectorizer.sav", "rb")) | |
# Reading a text | |
message = str(input("Enter the message: ")) | |
# Creating a DataFrame to store the message and it's length | |
sms = pd.DataFrame({"message": [message]}) | |
sms["length"] = sms["message"].apply(len) | |
# Create a copy DataFrame of the message | |
texts = sms["message"].copy() | |
# Removing punctuations | |
for i in range(len(texts)): | |
texts[i] = texts[i].translate(punctuationToNone) | |
# Removing stopwords | |
for i in range(len(texts)): | |
txt = [word for word in texts[i].split() if word.lower() not in stopwords.words("english")] | |
texts[i] = " ".join(txt) | |
# Removing stemmers | |
for i in range(len(texts)): | |
txt = "" | |
for c in texts[i].split(): | |
stemmer = SnowballStemmer("english") | |
txt += (stemmer.stem(c)) + " " | |
texts[i] = txt | |
# Using the loaded vectorizer to transform the text | |
features = vectorizer.transform(texts) | |
# Adding length to the features | |
lengths = sms["length"].values | |
features = np.hstack((features.todense(), lengths[:, None])) | |
# Classifying the sms | |
label = model.predict(features) | |
# Displaying the classification of the message | |
print(label) | |
# Done | |
""" |