Skip to content

Instantly share code, notes, and snippets.

@vivek1240
Last active May 29, 2021 19:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vivek1240/41df815005190650a705f1f60ea80175 to your computer and use it in GitHub Desktop.
Save vivek1240/41df815005190650a705f1f60ea80175 to your computer and use it in GitHub Desktop.
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sqlite3 import Error
from sklearn.ensemble import RandomForestClassifier
import pickle
import nltk
import copy
%matplotlib inline
# Work on a deep copy so later column additions never touch `email_data`.
df = copy.deepcopy(email_data)

# TF-IDF configuration: drop English stop words, ignore terms that appear in
# more than 94% of the documents, use sublinear term-frequency scaling,
# L1-normalise each row and extract unigrams only.
vectorizer = TfidfVectorizer(
    min_df=1,
    max_df=.94,
    stop_words="english",
    sublinear_tf=True,
    norm='l1',
    ngram_range=(1, 1),
)

# Fit on the full cleaned corpus and densify the result to inspect its size.
final_features = (
    vectorizer
    .fit_transform(df['cleaned_body'])
    .toarray()
)
# Bare expression: presumably meant to display the shape in a notebook cell.
final_features.shape
# Build the model inputs: the cleaned e-mail text is the feature column and
# the request category, encoded as an integer, is the target.
X = df['cleaned_body']

# Fixed category -> class id table. A Request_Type value that is not listed
# here is turned into NaN by Series.map.
# (Alternative: df['Request_Type'].factorize()[0] would assign ids automatically.)
label_ids = {
    'Refunds': 0,
    'Cancellation': 1,
    'Others': 2,
    'Amendment': 3,
    'Website Error': 4,
}
df['label_num'] = df['Request_Type'].map(label_ids)
Y = df['label_num']

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=0,
)
# Chain vectorization, chi-squared feature selection (top 1000 features) and
# the random-forest classifier into one estimator, so a single fit/predict
# call runs them all in order instead of one step at a time.
pipeline = Pipeline(
    steps=[
        ('vect', vectorizer),
        ('chi', SelectKBest(score_func=chi2, k=1000)),
        ('clf', RandomForestClassifier()),
    ]
)
# Train the whole pipeline on the training split, then persist the fitted
# model to disk so it can be reloaded later without retraining.
model = pipeline.fit(X_train, y_train)
with open('RandomForest.pickle', 'wb') as f:
    # The dump has to live inside the `with` body: that is where `f` is open,
    # and the context manager closes the file once the block exits.
    pickle.dump(model, f)
# Evaluate on the held-out split: per-class precision / recall / F1-score,
# followed by the confusion matrix.
ytest = np.array(y_test)
# Predict once and reuse the result for both reports instead of running the
# full pipeline over X_test twice.
y_pred = model.predict(X_test)
print(classification_report(ytest, y_pred))
print(confusion_matrix(ytest, y_pred))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment