Created
January 18, 2021 14:55
-
-
Save saimadhu-polamuri/b80b5ea7b0c51c1f77fc89ef7a9b5d4e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## dependencies
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer as wordnet
from nltk.tokenize import sent_tokenize as st
## reading the file
# Only the two columns the script uses ('v1' and 'v2') are loaded.
df = pd.read_csv('spam.csv', encoding='ISO-8859-1', usecols=['v1', 'v2'])

corpus = []  # one cleaned, space-joined string per row of df['v2']
wordnet = wordnet()  # NOTE: rebinds the imported class alias to a lemmatizer instance
length = len(df['v2'])  # total number of rows

# Loop invariants, built once. The original evaluated
# stopwords.words('english') for every single word; a set gives the same
# membership answers without repeating that work.
stop_words = set(stopwords.words('english'))
non_letters = re.compile(r'[^a-zA-Z]')

for message in df['v2']:
    rev = non_letters.sub(' ', message)  # replace every non-letter character with a space
    rev = rev.lower()  # text to lowercase
    rev = rev.split()  # whitespace-split into a list of words
    # drop stop words, lemmatize what remains
    rev = [wordnet.lemmatize(word) for word in rev if word not in stop_words]
    rev = ' '.join(rev)  # from list back to a single string
    corpus.append(rev)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Bag-of-words counts over the cleaned corpus, capped at 2500 feature columns.
cv = CountVectorizer(max_features=2500)
x = cv.fit_transform(corpus).toarray()  # vectorizer output converted to an array

## y is a categorical variable so will encode it
# df['v1'] is the dependent variable; LabelEncoder maps its categories to integer codes.
le = LabelEncoder()
y = le.fit_transform(df['v1'])
## now splitting the model into train and test set
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and therefore the reported numbers) reproducible from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

## training the model
model = MultinomialNB()  # using naive bayes classification algorithm
model.fit(x_train, y_train)  # fitting the model

## predicting the values
y_pred = model.predict(x_test)

# score of the model
# The original evaluated model.score(...) as a bare expression, which only
# shows a value in a notebook; keep the result and print it instead.
accuracy = model.score(x_test, y_test)
print(f'Test accuracy: {accuracy:.4f}')

# Confusion matrix of true test labels versus predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment