Last active
May 29, 2021 19:54
-
-
Save vivek1240/41df815005190650a705f1f60ea80175 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import re | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from nltk.corpus import stopwords | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.pipeline import Pipeline | |
from sklearn.metrics import classification_report, confusion_matrix | |
from sklearn.model_selection import train_test_split | |
from sklearn.feature_selection import SelectKBest, chi2 | |
from sqlite3 import Error | |
from sklearn.ensemble import RandomForestClassifier | |
import pickle | |
import nltk | |
import copy | |
%matplotlib inline | |
# Work on a deep copy so the label column added below never touches the
# original `email_data` object (which is defined outside this section).
df = copy.deepcopy(email_data)

# TF-IDF configuration: unigrams only, English stop words removed, terms that
# occur in more than 94% of the documents dropped, sublinear term-frequency
# scaling and L1 normalisation of each document vector.
vectorizer = TfidfVectorizer(
    min_df=1,
    max_df=0.94,
    stop_words="english",
    sublinear_tf=True,
    norm='l1',
    ngram_range=(1, 1),
)

# Exploratory only: vectorise the whole corpus to inspect the feature-matrix
# shape. The pipeline below fits the same vectorizer again on the training
# split, so this matrix is not what the classifier is trained on.
final_features = vectorizer.fit_transform(df['cleaned_body']).toarray()
final_features.shape

# Encode the request type as an integer class label.
REQUEST_TYPE_LABELS = {
    'Refunds': 0,
    'Cancellation': 1,
    'Others': 2,
    'Amendment': 3,
    'Website Error': 4,
}
df['label_num'] = df['Request_Type'].map(REQUEST_TYPE_LABELS)
# df['label_num'] = df['Request_Type'].factorize()[0]  # class embedding

# `Series.map` yields NaN for every value missing from the mapping. Fail here
# with a clear message instead of letting NaN labels reach the estimator.
unmapped_rows = df['label_num'].isna()
if unmapped_rows.any():
    unknown_types = sorted(
        df.loc[unmapped_rows, 'Request_Type'].astype(str).unique()
    )
    raise ValueError(
        "Request_Type contains values without a class label: "
        + ", ".join(unknown_types)
    )

# Split the dataset into training and testing sets (75% / 25%).
X = df['cleaned_body']
Y = df['label_num']
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=0, test_size=0.25
)

# Instead of running the steps one at a time, chain them in a pipeline:
# TF-IDF vectorisation -> chi-squared selection of the 1000 best features ->
# random forest classifier.
pipeline = Pipeline([
    ('vect', vectorizer),
    ('chi', SelectKBest(chi2, k=1000)),
    # Seeded like the train/test split so that the saved model and the
    # reported metrics are reproducible between runs.
    ('clf', RandomForestClassifier(random_state=0)),
])

# Fit the model and save it in a pickle for later use.
model = pipeline.fit(X_train, y_train)
with open('RandomForest.pickle', 'wb') as f:
    pickle.dump(model, f)

# Predict once and reuse the predictions for both reports.
ytest = np.array(y_test)
y_pred = model.predict(X_test)

# Confusion matrix and classification report (precision, recall, F1-score).
print(classification_report(ytest, y_pred))
print(confusion_matrix(ytest, y_pred))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment