SMS Spam Detection ML (Explained)
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 23:27:45 2018
@author: spand
"""
import numpy as np
import pandas as pd
import pickle, string, time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
# Loading the CSV file with latin-1 encoding.
# latin-1 is a character encoding distinct from utf-8;
# this dataset needs it to be decoded correctly.
# pd.read_csv() method returns a DataFrame object
sms = pd.read_csv("bin/spam.csv", encoding="latin-1")
# The CSV file has five columns:
# v1, v2, Unnamed: 2, Unnamed: 3 and Unnamed: 4
# v1 is actually the label column and
# v2 is the feature column (messages in our case)
# the remaining are not needed, hence dropped.
sms = sms.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
sms = sms.rename(columns={"v1": "label", "v2": "message"})
# Apart from the message itself, we'll also add the
# length of the message as a feature. Thus the dataset
# will contain two features: message and message length.
# .apply() is a DataFrame/Series method that takes
# a function as its argument and applies it to each
# element.
sms["length"] = sms["message"].apply(len)
# We cannot use the text of the messages directly in our
# ML algorithm, as it only understands numbers.
# So we need to turn each message into numeric weights.
# To do this we'll need to find the meaning in each message
# and weight the words accordingly.
# Here Natural Language Processing comes into play.
# We are using the "nltk" (Natural Language Toolkit) library
# for this purpose.
# First let us take a copy of all the messages in a separate Series.
texts = sms["message"].copy()
# Now we need to remove all punctuation from the messages.
# To do this we'll create a translation table that maps every
# punctuation character to None (None => null in Python).
# This is done with str.maketrans and string.punctuation.
punctuationToNone = str.maketrans('', '', string.punctuation)
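# A quick sanity check of the translation table (illustrative only):
#   "Win $$$ now!!!".translate(punctuationToNone) -> "Win  now"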
# Now, for each message, we'll strip the punctuation
# using the .translate() method.
for i in range(len(texts)):
    texts[i] = texts[i].translate(punctuationToNone)
# Now we have to remove the stopwords from the messages.
# Stopwords are the most common words in a language,
# the ones that don't carry much information on their own,
# like {the, as, is, a, etc.}
# This is done using stopwords from nltk.corpus.
for i in range(len(texts)):
    txt = [word for word in texts[i].split() if word.lower() not in stopwords.words("english")]
    texts[i] = " ".join(txt)
# Now we'll stem the words in the texts.
# Stemming reduces inflected forms of a word, like {fly, flying},
# to a common root.
# This is done using SnowballStemmer from the nltk library.
stemmer = SnowballStemmer("english")
for i in range(len(texts)):
    texts[i] = " ".join(map(stemmer.stem, texts[i].split()))
# Now we'll use the TfidfVectorizer to transform the
# texts into feature vectors. This way the data will be
# more useful to the ML algorithm.
vectorizer = TfidfVectorizer(stop_words="english")
# We'll not only obtain the feature vectors but also fit
# the vectorizer object, so that we can use the same
# object to vectorize texts in the future and get the
# same number of features as now.
# This is important because the ML model we'll generate
# will learn to predict with a fixed number of features.
# If the number of features is different, the model will fail.
features = vectorizer.fit_transform(texts)
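# For reference, with sklearn's defaults (smooth_idf=True) the idf of a
# term t over n documents is idf(t) = ln((1 + n) / (1 + df(t))) + 1, the
# tf-idf weight is tf(t, d) * idf(t), and each row vector is then
# L2-normalised. A word that appears in nearly every message thus gets
# a weight near the minimum, while rarer words are weighted higher.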
# Remember that we computed the length of each message earlier.
# This is also an important feature and needs to be added.
# To do this we'll concatenate a length column onto "features".
lengths = sms["length"].values
features = np.hstack((features.todense(), lengths[:, None]))
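# Note: lengths[:, None] turns the 1-D array of shape (n,) into a
# column of shape (n, 1), so np.hstack can append it to the dense
# tf-idf matrix as one extra feature column.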
# Train test split
f_train, f_test, l_train, l_test = train_test_split(
    features, sms["label"],
    test_size=0.2,
    random_state=int(time.time())
)
# Now our feature matrix is complete.
# We only need to train the machine to classify messages.
# We'll use a Naive Bayes classifier to classify our messages.
model = MultinomialNB(alpha=0.2)
# In Multinomial Naive Bayes, the alpha parameter is what
# is known as a hyperparameter; i.e. a parameter that controls
# the form of the model itself. In most cases, the best way to
# determine optimal values for hyperparameters is through a grid
# search over possible parameter values, using cross validation
# to evaluate the performance of the model on your data at each value.
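# A minimal sketch of such a grid search (illustrative; the alpha grid
# below is an arbitrary choice, not part of the original script):
#
#   from sklearn.model_selection import GridSearchCV
#   search = GridSearchCV(MultinomialNB(), {"alpha": [0.01, 0.1, 0.2, 0.5, 1.0]}, cv=5)
#   search.fit(f_train, l_train)
#   print(search.best_params_, search.best_score_)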
# Now we fit the model using the features we have processed
# and labels that were provided
model.fit(f_train, l_train)
# Test and evaluate the accuracy score
predicted = model.predict(f_test)
accuracyScore = accuracy_score(l_test, predicted)
print(accuracyScore)
# The model has been created.
# We now need to save this model so that it can be
# used in the future without having to go through the
# training process.
# We'll use pickle to save the model and the vectorizer
# The vectorizer will be used to process the future texts
# The model will be used to classify future texts
pickle.dump(model, open("model.sav", "wb"))
pickle.dump(vectorizer, open("vectorizer.sav", "wb"))
# pickle saves the object in a raw binary format in a file.
# It can then be loaded into any Python script and used as
# the same object it is in this script.
"""
# Let us predict some texts
# Loading up the model and the vectorizer using pickle
model = pickle.load(open("model.sav", "rb"))
vectorizer = pickle.load(open("vectorizer.sav", "rb"))
# Reading a text
message = str(input("Enter the message: "))
# Creating a DataFrame to store the message and its length
sms = pd.DataFrame({"message": [message]})
sms["length"] = sms["message"].apply(len)
# Create a working copy of the message
texts = sms["message"].copy()
# Removing punctuation (the same translation table as above,
# rebuilt here so this block also works as a standalone script)
punctuationToNone = str.maketrans('', '', string.punctuation)
for i in range(len(texts)):
    texts[i] = texts[i].translate(punctuationToNone)
# Removing stopwords
for i in range(len(texts)):
    txt = [word for word in texts[i].split() if word.lower() not in stopwords.words("english")]
    texts[i] = " ".join(txt)
# Stemming the words
stemmer = SnowballStemmer("english")
for i in range(len(texts)):
    texts[i] = " ".join(stemmer.stem(word) for word in texts[i].split())
# Using the loaded vectorizer to transform the text
features = vectorizer.transform(texts)
# Adding length to the features
lengths = sms["length"].values
features = np.hstack((features.todense(), lengths[:, None]))
# Classifying the sms
label = model.predict(features)
# Displaying the classification of the message
print(label)
# Done
"""