Skip to content

Instantly share code, notes, and snippets.

@AzharuddinKazi
Created February 18, 2017 18:03
Show Gist options
  • Save AzharuddinKazi/6718448cc3da6fe6781f90e31ad108fb to your computer and use it in GitHub Desktop.
Save AzharuddinKazi/6718448cc3da6fe6781f90e31ad108fb to your computer and use it in GitHub Desktop.
Text classification using a Naive Bayes classifier in Python
import os
import pandas as pd
import re
import numpy as np
from sklearn.metrics import confusion_matrix
import random
import nltk
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.naive_bayes import MultinomialNB
# Load the labelled corpus into a pandas data frame. Change the file name
# below to point at your own CSV; the script uses its "Domain" column as the
# class label and its "Description" column as the text to classify.
data_domain = pd.read_csv("Data_Domains_Textclassification.csv")
print(data_domain.head())  # preview the first rows
print(data_domain.columns)  # list the column names

# Keep the label as a string column; it is attached to the feature matrix
# once pre-processing is done.
data_domain['target'] = data_domain['Domain'].astype(str)

# Replace every space-prefixed run of digits with a single space so that
# stand-alone numbers do not end up as features. Every row is processed,
# including the first one, and the result is a new Series, so the
# "Description" column of data_domain is left untouched.
data_desc = data_domain["Description"].apply(
    lambda text: re.sub(r" \d+", " ", text)
)
# Build a binary bag-of-words representation of the cleaned descriptions.
from sklearn.feature_extraction.text import CountVectorizer

cnt_vct = CountVectorizer(
    analyzer='word',
    binary=True,  # record presence/absence of a term, not its count
    token_pattern=r'\b\w\w+\b',  # tokens of two or more word characters
    lowercase=True,
    ngram_range=(1, 1),  # unigrams only
    stop_words='english',
    encoding='ISO-8859-1',
    min_df=3,  # ignore terms that occur in fewer than 3 documents
)

# Document-term matrix: one row per description, one column per term.
data_mat = cnt_vct.fit_transform(data_desc)
# toarray() gives a plain ndarray (todense() returns the discouraged
# np.matrix type); pandas accepts either.
dense = data_mat.toarray()

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# on releases that predate get_feature_names_out().
try:
    feature_names = cnt_vct.get_feature_names_out()
except AttributeError:
    feature_names = cnt_vct.get_feature_names()

# Store the features in their own data frame and attach the class label.
# NOTE(review): if the vocabulary itself contains the term "target", that
# feature column is overwritten by the label assigned below.
featureset = pd.DataFrame(dense, columns=feature_names)
featureset["target"] = data_domain['target']
classlabel = "target"
# Seed NumPy's global generator BEFORE any random step so that both the
# shuffle and the train/test split below are the same on every run.
np.random.seed(0)

# Shuffle the rows to avoid any bias from the original ordering.
featureset = featureset.sample(frac=1)
print(featureset.shape)

# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# model_selection and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Split the data into training = 70% and test = 30% sets. Splitting the data
# frame directly (instead of a NumPy copy of it) keeps the column names and
# the numeric dtype of the feature columns.
dataset_train, dataset_test = train_test_split(
    featureset, train_size=0.7, test_size=0.3
)
# Renumber the rows from 0 so each split has a clean positional index.
dataset_train = dataset_train.reset_index(drop=True)
dataset_test = dataset_test.reset_index(drop=True)

# Column position of the label within the split frames.
Targetindex = dataset_train.columns.get_loc("target")

X = dataset_train.drop("target", axis=1)  # training features (all but the label)
y = dataset_train["target"]  # training labels
X1 = dataset_test.drop("target", axis=1)  # test features (all but the label)
y1 = dataset_test["target"]  # test labels
# Fit a multinomial Naive Bayes model on the training features and predict
# the label of every training and test row.
clf = MultinomialNB().fit(X, y)
trainpred = clf.predict(X)
testpred = clf.predict(X1)

# Class distribution of each split, to help interpret the matrices below.
print(y.value_counts())
print(y1.value_counts())

# Confusion matrices (rows = true label, columns = predicted label),
# printed for the training set first and then for the test set.
train_confMat = confusion_matrix(y, trainpred)
test_confMat = confusion_matrix(y1, testpred)
print(train_confMat)
print(test_confMat)

# Evaluate the predictions. Recall is reported for class '3' and precision
# for class '1', as before; restricting `labels` to one class with
# average='macro' yields that single class's score and, unlike the default
# average='binary', also works when the target has more than two classes.
train_acc = accuracy_score(y, trainpred)
acc = accuracy_score(y1, testpred)
rec = recall_score(y1, testpred, labels=['3'], average='macro')
prec = precision_score(y1, testpred, labels=['1'], average='macro')

# Single-argument print calls behave the same on Python 2 and Python 3.
print("Train accuracy: {0:.4f}".format(train_acc))
print("Test accuracy: {0:.4f}".format(acc))
print("Test recall (class '3'): {0:.4f}".format(rec))
print("Test precision (class '1'): {0:.4f}".format(prec))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment