ishwor2048/Real_Spam_Detecting_Algorithm_High_Accuracy.py

## Real_Spam_Detecting_Algorithm_High_Accuracy.py
# -*- coding: utf-8 -*-
"""
Created on Tue May 28 15:04:39 2019

@author: Ishwor Bhusal
"""

# Importing basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing the scikit-learn tools used by this script.
# The tf-idf classes are spelled TfidfTransformer / TfidfVectorizer; the
# earlier "Tfidata..." spelling does not exist in scikit-learn and stopped the
# script with an ImportError.
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Loading the tab-separated dataset and previewing the first rows.
# Results are printed explicitly because a bare expression displays nothing
# when the file is run as a script.
data = pd.read_csv('TextFiles/smsspamcollection.tsv', sep='\t')
print(data.head())

# Checking the total count of the null values in each column
print(data.isnull().sum())

# Counting how many messages carry each label
print(data['label'].value_counts())

# The message text is the feature, the label column is the target
X_data = data['message']
y_data = data['label']

# Holding out 25% of the rows as the test set; the fixed random_state keeps
# the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=508)

# Step 1: bag-of-words counts. fit_transform learns the vocabulary and builds
# the count matrix in one call, so a separate fit() + transform() pass before
# it is redundant.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Checking the shape of the count matrix
print(X_train_counts.shape)

# Step 2: re-weighting the raw counts with tf-idf and checking the shape
tfi_trans = TfidfTransformer()
X_train_tfidata = tfi_trans.fit_transform(X_train_counts)
print(X_train_tfidata.shape)

# Steps 1 and 2 in one object: TfidfVectorizer goes from raw text straight to
# tf-idf features (this replaces the matrix computed above)
vectorizer = TfidfVectorizer()
X_train_tfidata = vectorizer.fit_transform(X_train)

# Fitting a linear support vector classifier on the vectorized training data
clf = LinearSVC()
clf.fit(X_train_tfidata, y_train)

# Chaining vectorizer and classifier in a Pipeline so raw text can be passed
# directly to fit() and predict()
text_clf = Pipeline([('tfidata', TfidfVectorizer()), ('clf', LinearSVC())])

# Fitting the pipeline on the training data
text_clf.fit(X_train, y_train)

# Predicting labels for the held-out test messages
predictions = text_clf.predict(X_test)

# Printing the confusion matrix to see the performance of the model
print(confusion_matrix(y_test, predictions))
# Printing the classification report
print(classification_report(y_test, predictions))

# Printing the overall accuracy on the test set
print(metrics.accuracy_score(y_test, predictions))

# Trying the fitted pipeline on hand-written examples to see which label
# each one receives (presumably spam or ham, per the dataset's label column)
sample_messages = [
    "Congratulations! for your achievement what you have got!",
    "Mr. Customer, you are eligible to get 50% discount on your purchase of your clothings in your next visit.",
    "Mom, come home as soon as possible, someone wants to see you soon!",
    "Dad, come home as soon as possible, someone wants to see you soon!",
    "Brother, come home as soon as possible, someone wants to see you soon!",
    "Dear Customer, come home as soon as possible, someone wants to see you soon!",
    "Dear Mr. Bhusal, come home as soon as possible, someone wants to see you soon!",
    "Mrs. Kala, you are one of those people who have been choose for next round of interviews, congratulations!",
]
for message in sample_messages:
    print(text_clf.predict([message]))
	# -- coding: utf-8 --
	"""
	Created on Tue May 28 15:04:39 2019

	@author: Ishwor Bhusal
	"""

	# Importing basic packages
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt

	# Importing the scikit-learn tools used by this script.
	# The tf-idf classes are spelled TfidfTransformer / TfidfVectorizer; the
	# earlier "Tfidata..." spelling does not exist in scikit-learn and stopped the
	# script with an ImportError.
	from sklearn import metrics
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.feature_extraction.text import TfidfTransformer
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics import classification_report, confusion_matrix
	from sklearn.model_selection import train_test_split
	from sklearn.pipeline import Pipeline
	from sklearn.svm import LinearSVC

	# Loading the tab-separated dataset and previewing the first rows.
	# Results are printed explicitly because a bare expression displays nothing
	# when the file is run as a script.
	data = pd.read_csv('TextFiles/smsspamcollection.tsv', sep='\t')
	print(data.head())

	# Checking the total count of the null values in each column
	print(data.isnull().sum())

	# Counting how many messages carry each label
	print(data['label'].value_counts())

	# The message text is the feature, the label column is the target
	X_data = data['message']
	y_data = data['label']

	# Holding out 25% of the rows as the test set; the fixed random_state keeps
	# the split reproducible between runs
	X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=508)

	# Step 1: bag-of-words counts. fit_transform learns the vocabulary and builds
	# the count matrix in one call, so a separate fit() + transform() pass before
	# it is redundant.
	count_vect = CountVectorizer()
	X_train_counts = count_vect.fit_transform(X_train)

	# Checking the shape of the count matrix
	print(X_train_counts.shape)

	# Step 2: re-weighting the raw counts with tf-idf and checking the shape
	tfi_trans = TfidfTransformer()
	X_train_tfidata = tfi_trans.fit_transform(X_train_counts)
	print(X_train_tfidata.shape)

	# Steps 1 and 2 in one object: TfidfVectorizer goes from raw text straight to
	# tf-idf features (this replaces the matrix computed above)
	vectorizer = TfidfVectorizer()
	X_train_tfidata = vectorizer.fit_transform(X_train)

	# Fitting a linear support vector classifier on the vectorized training data
	clf = LinearSVC()
	clf.fit(X_train_tfidata, y_train)

	# Chaining vectorizer and classifier in a Pipeline so raw text can be passed
	# directly to fit() and predict()
	text_clf = Pipeline([('tfidata', TfidfVectorizer()), ('clf', LinearSVC())])

	# Fitting the pipeline on the training data
	text_clf.fit(X_train, y_train)

	# Predicting labels for the held-out test messages
	predictions = text_clf.predict(X_test)

	# Printing the confusion matrix to see the performance of the model
	print(confusion_matrix(y_test, predictions))
	# Printing the classification report
	print(classification_report(y_test, predictions))

	# Printing the overall accuracy on the test set
	print(metrics.accuracy_score(y_test, predictions))

	# Trying the fitted pipeline on hand-written examples to see which label
	# each one receives (presumably spam or ham, per the dataset's label column)
	sample_messages = [
		"Congratulations! for your achievement what you have got!",
		"Mr. Customer, you are eligible to get 50% discount on your purchase of your clothings in your next visit.",
		"Mom, come home as soon as possible, someone wants to see you soon!",
		"Dad, come home as soon as possible, someone wants to see you soon!",
		"Brother, come home as soon as possible, someone wants to see you soon!",
		"Dear Customer, come home as soon as possible, someone wants to see you soon!",
		"Dear Mr. Bhusal, come home as soon as possible, someone wants to see you soon!",
		"Mrs. Kala, you are one of those people who have been choose for next round of interviews, congratulations!",
	]
	for message in sample_messages:
		print(text_clf.predict([message]))