Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save saimadhu-polamuri/b80b5ea7b0c51c1f77fc89ef7a9b5d4e to your computer and use it in GitHub Desktop.
Save saimadhu-polamuri/b80b5ea7b0c51c1f77fc89ef7a9b5d4e to your computer and use it in GitHub Desktop.
## dependencies
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize as st
from nltk.stem import WordNetLemmatizer as wordnet
import re
## reading the file
# Only two columns are loaded: v1 (used later as the target) and v2 (the text).
df = pd.read_csv('spam.csv', encoding='ISO-8859-1', usecols=['v1', 'v2'])

## cleaning the messages
corpus = []  # one cleaned, space-joined string per row of df['v2']
wordnet = wordnet()  # lemmatizer instance; NOTE: this rebinds the imported class alias
length = len(df['v2'])  # total number of rows
# Build the stopword set once, outside the loop: the original evaluated
# stopwords.words('english') again for every word of every message.
stop_words = set(stopwords.words('english'))
non_letters = re.compile(r'[^a-zA-Z]')  # anything that is not an ASCII letter
for message in df['v2']:
    # replace non-letters with spaces, lowercase, then split into words
    words = non_letters.sub(' ', message).lower().split()
    # drop stopwords and lemmatize what is left
    lemmas = [wordnet.lemmatize(word) for word in words if word not in stop_words]
    corpus.append(' '.join(lemmas))  # back from list to a single string
## bag-of-words features
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)  # keep at most 2500 feature columns
word_counts = cv.fit_transform(corpus)  # fit on the corpus and count terms per message
x = word_counts.toarray()  # independent variables as a dense array
## dependent variable: v1 is categorical, so it is encoded as integers
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['v1'])  # encoded labels, one per row
## now splitting the data into train and test sets (20% held out for testing)
from sklearn.model_selection import train_test_split
# random_state pins the shuffle, so the split -- and every score computed
# from it below -- is the same on each run instead of changing randomly.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
## training the model
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()  # using naive bayes classification algorithm
model.fit(x_train, y_train)  # fitting the model on the training split

## predicting the values
y_pred = model.predict(x_test)

## score of the model on the held-out test split
# The original left model.score(...) as a bare expression, so its result was
# thrown away when run as a script; keep it in a name and print it.
accuracy = model.score(x_test, y_test)
print('accuracy:', accuracy)

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)  # rows: true labels, columns: predicted labels
print(cm)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment