Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save ishwor2048/d9cce529299201d03ba47ab34e38f0cf to your computer and use it in GitHub Desktop.
Save ishwor2048/d9cce529299201d03ba47ab34e38f0cf to your computer and use it in GitHub Desktop.
Spam filtering model in scikit-learn based on TF-IDF and CountVectorizer
# -*- coding: utf-8 -*-
"""
Created on Tue May 28 15:04:39 2019
@author: Ishwor Bhusal
"""
# Importing basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the tab-separated SMS dataset; the code below reads its 'label' and
# 'message' columns.
data = pd.read_csv('TextFiles/smsspamcollection.tsv', sep='\t')
# Inspection results are printed so they are visible when the file is run as
# a script (a bare expression's value is discarded outside an interactive shell).
print(data.head())
# Count of missing values per column
print(data.isnull().sum())
# Number of rows for each distinct label value
print(data['label'].value_counts())
# Importing train test split model
from sklearn.model_selection import train_test_split
# Features are the raw message text, the target is the label column
X_data = data['message']
y_data = data['label']
# Hold out 25% of the rows for testing; random_state is fixed so the split is
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=508)
# Running count vectorizer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
# Fit the vectorizer on the training messages and build the count matrix in a
# single pass. A separate fit() followed by transform() would only repeat the
# work that fit_transform() already does.
X_train_counts = count_vect.fit_transform(X_train)
# Checking the shape of the count matrix
print(X_train_counts.shape)
# Importing the TF-IDF transformer, which re-weights an existing count matrix.
# NOTE: the scikit-learn class names are TfidfTransformer / TfidfVectorizer;
# the previous "Tfidata*" spellings do not exist and raised ImportError.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
# Fitting the transformer on the training counts and looking at the shape of the result
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)
# Importing the TF-IDF vectorizer, which goes from raw text to TF-IDF features
# in one step (replacing the CountVectorizer + TfidfTransformer pair above).
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# Fitting the TF-IDF vectorizer directly on the raw training messages
X_train_tfidf = vectorizer.fit_transform(X_train)
from sklearn.svm import LinearSVC
clf = LinearSVC()
# Fitting the linear support vector classifier on the vectorized training data
clf.fit(X_train_tfidf, y_train)
# Importing Pipeline module
from sklearn.pipeline import Pipeline
# Chain vectorizer and classifier so raw text can be passed straight to
# fit()/predict(). The step name string is kept as it was.
text_clf = Pipeline([('tfidata', TfidfVectorizer()), ('clf', LinearSVC())])
# Fitting the pipeline on the raw training messages
text_clf.fit(X_train, y_train)
# Running the fitted pipeline on the held-out test messages
predictions = text_clf.predict(X_test)
# Importing confusion matrix and classification report from scikit-learn
from sklearn.metrics import confusion_matrix, classification_report
# Preparing and printing the confusion matrix to see the performance of the model
print(confusion_matrix(y_test, predictions))
# Looking at the classification report
print(classification_report(y_test, predictions))
# Importing metrics and comparing the test labels with the predictions
from sklearn import metrics
# Printed like the reports above; a bare expression's value would be discarded
# when the file is run as a script.
print(metrics.accuracy_score(y_test, predictions))
# Spot-checking the model on hand-written example messages
sample_messages = [
    "Congratulations! for your achievement what you have got!",
    "Mr. Customer, you are eligible to get 50% discount on your purchase of your clothings in your next visit.",
    "Mom, come home as soon as possible, someone wants to see you soon!",
    "Dad, come home as soon as possible, someone wants to see you soon!",
    "Brother, come home as soon as possible, someone wants to see you soon!",
    "Dear Customer, come home as soon as possible, someone wants to see you soon!",
    "Dear Mr. Bhusal, come home as soon as possible, someone wants to see you soon!",
    "Mrs. Kala, you are one of those people who have been choose for next round of interviews, congratulations!",
]
# One batched predict call yields one predicted label per message, in order
for message, predicted_label in zip(sample_messages, text_clf.predict(sample_messages)):
    print(f"{predicted_label}: {message}")
# The sentences above were used to check whether the model labels a message as spam or legit (ham)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment