## saimadhu-polamuri/dataaspirant-bag-of-words-implementation.py

## dataaspirant-bag-of-words-implementation.py

## dependencies
import pandas as pd
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize as st
from nltk.stem import WordNetLemmatizer as wordnet
import re

## reading the file
# Only the label column (v1) and the message text column (v2) are loaded.
df = pd.read_csv('spam.csv',encoding = 'ISO-8859-1',usecols=['v1','v2'])

## cleaning the messages
# stopwords.words('english') returns a list, so calling it for every word
# rebuilds that list and scans it linearly each time. Build a set once instead.
stop_words = set(stopwords.words('english'))
# The lemmatizer instance gets its own name so that the imported class alias
# `wordnet` is not overwritten by an instance of itself.
lemmatizer = wordnet()
# Anything that is not an ASCII letter is replaced by a space.
non_letters = re.compile(r'[^a-zA-Z]')

corpus = [] # one cleaned, space-joined string per message, in row order
for message in df['v2']:
    # letters only -> lowercase -> list of words
    words = non_letters.sub(' ', message).lower().split()
    # drop stop words, lemmatize what is left
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    corpus.append(' '.join(words))

## building the bag-of-words matrix (x) and the encoded target (y)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# max_features=2500 caps the number of feature columns at 2500
cv = CountVectorizer(max_features=2500)
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray() # dense array of the vectorizer output

# v1 is the categorical dependent variable, so it is label-encoded
le = LabelEncoder()
y = le.fit_transform(df['v1'])

## now splitting the data into train and test sets
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# test_size=0.2 holds out 20% of the rows for testing.
# NOTE(review): no random_state is passed, so the split (and therefore the
# metrics below) can differ between runs -- pass one if reproducibility matters.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

## training the model
model = MultinomialNB() # using naive bayes classification algorithm
model.fit(x_train,y_train) # fitting the model

## predicting the values
y_pred = model.predict(x_test)

## evaluating the model
# The score is kept in a variable and printed: as a bare expression statement
# its value would be discarded when this file is run as a script.
accuracy = model.score(x_test,y_test)
cm = confusion_matrix(y_test,y_pred)
print('accuracy:', accuracy)
print('confusion matrix:')
print(cm)

	## dependencies
	import pandas as pd
	import nltk
	import numpy as np
	from nltk.corpus import stopwords
	from nltk.tokenize import sent_tokenize as st
	from nltk.stem import WordNetLemmatizer as wordnet
	import re

	## reading the file
	# Only the label column (v1) and the message text column (v2) are loaded.
	df = pd.read_csv('spam.csv',encoding = 'ISO-8859-1',usecols=['v1','v2'])

	## cleaning the messages
	# stopwords.words('english') returns a list, so calling it for every word
	# rebuilds that list and scans it linearly each time. Build a set once instead.
	stop_words = set(stopwords.words('english'))
	# The lemmatizer instance gets its own name so that the imported class alias
	# `wordnet` is not overwritten by an instance of itself.
	lemmatizer = wordnet()
	# Anything that is not an ASCII letter is replaced by a space.
	non_letters = re.compile(r'[^a-zA-Z]')

	corpus = [] # one cleaned, space-joined string per message, in row order
	for message in df['v2']:
		# The loop body must be nested one level below the `for` line.
		# letters only -> lowercase -> list of words
		words = non_letters.sub(' ', message).lower().split()
		# drop stop words, lemmatize what is left
		words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
		corpus.append(' '.join(words))

	## building the bag-of-words matrix (x) and the encoded target (y)
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.preprocessing import LabelEncoder

	# max_features=2500 caps the number of feature columns at 2500
	cv = CountVectorizer(max_features=2500)
	term_counts = cv.fit_transform(corpus)
	x = term_counts.toarray() # dense array of the vectorizer output

	# v1 is the categorical dependent variable, so it is label-encoded
	le = LabelEncoder()
	y = le.fit_transform(df['v1'])

	## now splitting the data into train and test sets
	from sklearn.metrics import confusion_matrix
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import MultinomialNB

	# test_size=0.2 holds out 20% of the rows for testing.
	# NOTE(review): no random_state is passed, so the split (and therefore the
	# metrics below) can differ between runs -- pass one if reproducibility matters.
	x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

	## training the model
	model = MultinomialNB() # using naive bayes classification algorithm
	model.fit(x_train,y_train) # fitting the model

	## predicting the values
	y_pred = model.predict(x_test)

	## evaluating the model
	# The score is kept in a variable and printed: as a bare expression statement
	# its value would be discarded when this file is run as a script.
	accuracy = model.score(x_test,y_test)
	cm = confusion_matrix(y_test,y_pred)
	print('accuracy:', accuracy)
	print('confusion matrix:')
	print(cm)