Created
February 18, 2017 18:03
-
-
Save AzharuddinKazi/6718448cc3da6fe6781f90e31ad108fb to your computer and use it in GitHub Desktop.
Text classification using a Naive Bayes classifier in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import random
import re

import nltk
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Naive Bayes text classification of free-text descriptions into domains.
#
# Pipeline: load the CSV, strip numbers from the descriptions, build a binary
# bag-of-words matrix, split it 70/30 into train/test, fit MultinomialNB and
# report confusion matrices plus accuracy / recall / precision.

# One seed shared by every randomised step so repeated runs give the same split.
SEED = 0

# change the file name
data_domain = pd.read_csv("Data_Domains_Textclassification.csv")
print(data_domain.head())  # quick look at the first rows

# Keep the labels as strings; they are attached to the feature table once
# pre-processing is done.
data_domain['target'] = data_domain['Domain'].astype(str)

# Replace every space-prefixed run of digits with a single space so that only
# words remain. The result is built as a new Series covering every row
# (row 0 included) instead of writing back into data_domain element by element.
digit_run = re.compile(r" \d+")
data_desc = data_domain["Description"].map(lambda text: digit_run.sub(" ", text))

# create the count vectorizer: binary presence/absence of lower-cased word
# tokens of length >= 2, English stop words removed, terms seen in fewer than
# 3 documents dropped
cnt_vct = CountVectorizer(analyzer='word', binary=True, token_pattern=r'\b\w\w+\b',
                          lowercase=True, ngram_range=(1, 1), stop_words='english',
                          encoding='ISO-8859-1', min_df=3)

# creating the document term matrix
data_mat = cnt_vct.fit_transform(data_desc)
dense = data_mat.toarray()

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; support both.
try:
    feature_names = cnt_vct.get_feature_names_out()
except AttributeError:
    feature_names = cnt_vct.get_feature_names()

# extracting the features and storing them in a separate data frame
featureset = pd.DataFrame(dense, columns=feature_names)
classlabel = "target"
# NOTE: if "target" is itself a vocabulary term, this assignment replaces that
# word's column with the labels. `.values` avoids any index re-alignment.
featureset[classlabel] = data_domain['target'].values

# Seed *before* shuffling so the shuffle itself is reproducible too.
np.random.seed(SEED)
# shuffling the data to avoid bias from the original row order
featureset = featureset.sample(frac=1, random_state=SEED)
Targetindex = featureset.columns.get_loc(classlabel)

# split data into training = 70%, and test = 30% sets; splitting the
# DataFrame directly keeps the feature columns numeric
dataset_train, dataset_test = train_test_split(
    featureset, train_size=0.7, test_size=0.3, random_state=SEED)
dataset_train = dataset_train.reset_index(drop=True)
dataset_test = dataset_test.reset_index(drop=True)

X = dataset_train.drop(classlabel, axis=1)   # train features
y = dataset_train[classlabel]                # train labels
X1 = dataset_test.drop(classlabel, axis=1)   # test features
y1 = dataset_test[classlabel]                # test labels

# building the naive bayes model with the data
clf = MultinomialNB().fit(X, y)
trainpred = clf.predict(X)
testpred = clf.predict(X1)

# creating the confusion matrix for both train and test
train_confMat = confusion_matrix(y, trainpred)
test_confMat = confusion_matrix(y1, testpred)
print(train_confMat)
print(test_confMat)

# evaluate the error metrics on the test predictions.
# Restricting `labels` to one class with average='macro' yields that class's
# score and works for binary and multiclass targets alike, whereas pos_label
# with the default binary averaging raises on multiclass targets.
acc = accuracy_score(y1, testpred)
rec = recall_score(y1, testpred, labels=['3'], average='macro')
prec = precision_score(y1, testpred, labels=['1'], average='macro')
print("Test accuracy: %s" % acc)
print("Test recall (class '3'): %s" % rec)
print("Test precision (class '1'): %s" % prec)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment