SMS Spam Detection ML (Explained)
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 23:27:45 2018
@author: spand
"""
import numpy as np
import pandas as pd
import pickle, string, time
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
# Loading the CSV file with latin-1 encoding.
# latin-1 (ISO-8859-1) is a single-byte encoding; this dataset
# is not valid UTF-8, so the encoding must be given explicitly.
# pd.read_csv() returns a DataFrame object.
sms = pd.read_csv("bin/spam.csv", encoding="latin-1")
# The CSV file has five columns:
# v1, v2, Unnamed: 2, Unnamed: 3 and Unnamed: 4.
# v1 is the label column and v2 is the feature column
# (the messages, in our case).
# The remaining columns are not needed, hence dropped.
sms = sms.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
sms = sms.rename(columns={"v1": "label", "v2": "message"})
# Apart from the message itself we'll also add the
# length of the message as a feature. The dataset will
# then carry two features: the message and its length.
# .apply() is a DataFrame/Series method that takes
# another function as its argument and applies it to
# each item.
sms["length"] = sms["message"].apply(len)
# We cannot feed the raw text of the messages to the ML
# algorithm directly, as it only works with numbers.
# So each message must be converted into numeric weights
# that reflect the importance of the words it contains.
# This is where Natural Language Processing comes into play.
# We are using the "nltk" (Natural Language Toolkit) library
# for this purpose.
# First, take a copy of all the messages in a separate Series.
texts = sms["message"].copy()
# Next we remove all punctuation from the messages.
# To do this we'll create a translation "map" that sends every
# punctuation character to None (None => null in Python).
# This is done with str.maketrans and string.punctuation.
punctuationToNone = str.maketrans('', '', string.punctuation)
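# A quick sanity check of the translation table (illustrative only):
# every punctuation character is mapped to None, so the comma and
# the exclamation marks disappear.
assert "Hello, world!!!".translate(punctuationToNone) == "Hello world"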
# Now, for each message, strip the punctuation using
# the .translate() method.
for i in range(len(texts)):
    texts[i] = texts[i].translate(punctuationToNone)
# Next we remove the stopwords from the messages.
# Stopwords are the most common words in a language,
# ones that convey little information on their own,
# like {the, as, is, a, ...}.
# This is done using stopwords from nltk.corpus.
for i in range(len(texts)):
    txt = [word for word in texts[i].split() if word.lower() not in stopwords.words("english")]
    texts[i] = " ".join(txt)
# Now we stem the words in the texts.
# Stemming reduces inflected forms of a word to a common
# stem, so that e.g. "fly" and "flying" are treated as the
# same token. This is done with SnowballStemmer from nltk.
stemmer = SnowballStemmer("english")
for i in range(len(texts)):
    words = map(stemmer.stem, texts[i].split())
    texts[i] = " ".join(words)
# Now we use a TfidfVectorizer to transform the texts into
# feature vectors, a form the ML algorithm can work with.
# Note: the first positional argument of TfidfVectorizer is
# "input", not "stop_words", so the stop-word list must be
# passed by keyword (we already stripped stopwords above;
# this also catches any that slipped through).
vectorizer = TfidfVectorizer(stop_words="english")
# We not only obtain the feature vectors but also fit the
# vectorizer object, so that the same object can be used to
# evaluate texts in the future and produce the same number
# of features as now. This is important because the ML model
# we'll generate will learn to predict with a set number of
# features; if the number of features differs, the model will fail.
features = vectorizer.fit_transform(texts)
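# Illustration (hypothetical message): a fitted vectorizer always
# emits the same number of columns, the size of its learned
# vocabulary, no matter what text it is given.
demo_vec = vectorizer.transform(["free prize claim now"])
assert demo_vec.shape[1] == features.shape[1]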
# Remember we computed the length of each message earlier.
# That is also an important feature and needs to be added.
# To do this we concatenate a column onto "features".
# (.todense() converts the sparse TF-IDF matrix to a dense
# one so it can be stacked with the length column.)
lengths = sms["length"].values
features = np.hstack((features.todense(), lengths[:, None]))
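# Toy shape check of the same trick: a (2, 3) matrix hstacked with a
# (2,) vector reshaped to (2, 1) gives a (2, 4) matrix.
demo_a = np.ones((2, 3))
demo_b = np.array([7, 9])
assert np.hstack((demo_a, demo_b[:, None])).shape == (2, 4)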
# Train/test split: hold out 20% of the data for evaluation.
f_train, f_test, l_train, l_test = train_test_split(
    features, sms['label'],
    test_size=0.2,
    random_state=int(time.time())
)
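# Note: seeding with time.time() gives a different split (and thus a
# slightly different accuracy) on every run; a fixed seed, e.g.
# random_state=42, would make the experiment reproducible.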
# Our list of features is now complete.
# We only need to train the machine to classify messages.
# We'll use a Naive Bayes classifier for this.
model = MultinomialNB(alpha=0.2)
# In Multinomial Naive Bayes, the alpha parameter is what
# is known as a hyperparameter, i.e. a parameter that controls
# the form of the model itself. In most cases, the best way to
# determine optimal values for hyperparameters is through a grid
# search over possible parameter values, using cross-validation
# to evaluate the performance of the model on your data at each value.
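# A sketch of such a grid search, kept as a non-executed string in
# the same style as the demo block at the end of this file (the
# alpha grid below is an assumption, not a tuned choice):
"""
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(
    MultinomialNB(),
    param_grid={"alpha": [0.01, 0.1, 0.2, 0.5, 1.0]},
    scoring="accuracy",
    cv=5
)
grid.fit(f_train, l_train)
print(grid.best_params_)
"""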
# Now we fit the model using the features we have processed
# and the labels that were provided.
model.fit(f_train, l_train)
# Test and evaluate the accuracy score.
predicted = model.predict(f_test)
accuracyScore = accuracy_score(l_test, predicted)
print(accuracyScore)
# The model has been created.
# We now save it so that it can be used in the future
# without having to repeat the training process.
# We'll use pickle to save both the model and the vectorizer:
# the vectorizer will be used to process future texts and
# the model to classify them.
pickle.dump(model, open("model.sav", "wb"))
pickle.dump(vectorizer, open("vectorizer.sav", "wb"))
# pickle serializes the object to a raw binary file.
# It can then be loaded into any Python script and used as
# the same object as in this script.
""" | |
# Let us predict some texts | |
# Loading up the model and the vectorizer using pickle | |
model = pickle.load(open("model.sav", "rb")) | |
vectorizer = pickle.load(open("vectorizer.sav", "rb")) | |
# Reading a text | |
message = str(input("Enter the message: ")) | |
# Creating a DataFrame to store the message and it's length | |
sms = pd.DataFrame({"message": [message]}) | |
sms["length"] = sms["message"].apply(len) | |
# Create a copy DataFrame of the message | |
texts = sms["message"].copy() | |
# Removing punctuations | |
for i in range(len(texts)): | |
texts[i] = texts[i].translate(punctuationToNone) | |
# Removing stopwords | |
for i in range(len(texts)): | |
txt = [word for word in texts[i].split() if word.lower() not in stopwords.words("english")] | |
texts[i] = " ".join(txt) | |
# Removing stemmers | |
for i in range(len(texts)): | |
txt = "" | |
for c in texts[i].split(): | |
stemmer = SnowballStemmer("english") | |
txt += (stemmer.stem(c)) + " " | |
texts[i] = txt | |
# Using the loaded vectorizer to transform the text | |
features = vectorizer.transform(texts) | |
# Adding length to the features | |
lengths = sms["length"].values | |
features = np.hstack((features.todense(), lengths[:, None])) | |
# Classifying the sms | |
label = model.predict(features) | |
# Displaying the classification of the message | |
print(label) | |
# Done | |
""" |