# vivek1240/support_email_classification_medium_1.py

## support_email_classification_medium_1.py
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sqlite3 import Error
from sklearn.ensemble import RandomForestClassifier
import pickle
import nltk
import copy
%matplotlib inline

# Work on a deep copy so the original `email_data` frame is left untouched
# (a `label_num` column is added to `df` further down).
df = copy.deepcopy(email_data)

# TF-IDF vectoriser: unigrams only, English stop words removed, terms that occur
# in more than 94% of the documents dropped, sublinear (1 + log tf) scaling of
# term frequencies, and L1-normalised rows.
vectorizer = TfidfVectorizer(
    min_df=1,
    max_df=.94,
    stop_words="english",
    sublinear_tf=True,
    norm='l1',
    ngram_range=(1, 1),
)

# NOTE: .toarray() materialises the full dense (documents x vocabulary) matrix,
# which can be memory-heavy on a large corpus.
final_features = vectorizer.fit_transform(df['cleaned_body']).toarray()

# A bare `final_features.shape` expression is only echoed when it is the last
# line of a notebook cell; print it so the shape is reported wherever this runs.
print(final_features.shape)


# Split the dataset into a training set and a testing set.
X = df['cleaned_body']

# Fixed class embedding: every request type gets a stable integer id.
# (`Request_Type.factorize()` would also produce ids, but they would depend on
# the order in which the categories first appear in the data.)
label_ids = {'Refunds': 0, 'Cancellation': 1, 'Others': 2, 'Amendment': 3, 'Website Error': 4}
df['label_num'] = df['Request_Type'].map(label_ids)

# Series.map leaves NaN for every value that is missing from the mapping (typos,
# new categories, empty cells). Fail here and name the offending values instead
# of handing NaN labels to the classifier.
unmapped = df.loc[df['label_num'].isna(), 'Request_Type'].unique()
if len(unmapped):
    raise ValueError(
        "Request_Type values without a label id: {}".format(list(unmapped))
    )

Y = df['label_num']

# 75% / 25% split; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified -- consider stratify=Y if the
# request types are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0, test_size=0.25)

# Instead of doing these steps one at a time, use a pipeline to complete them
# all at once: vectorise -> select features -> classify.
# NOTE: Pipeline.fit refits `vectorizer` in place on X_train, so afterwards its
# vocabulary no longer matches the columns of `final_features`.
pipeline = Pipeline([
    ('vect', vectorizer),
    # Keep the 1000 terms with the highest chi-squared score against the labels.
    # NOTE(review): assumes the training vocabulary has at least 1000 terms --
    # confirm how the installed scikit-learn handles a smaller vocabulary.
    ('chi', SelectKBest(chi2, k=1000)),
    # Seeded like the train/test split so the saved model and the reported
    # metrics are reproducible from run to run.
    ('clf', RandomForestClassifier(random_state=0)),
])

# Fit the model and save it in a pickle for later use.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
model = pipeline.fit(X_train, y_train)
with open('RandomForest.pickle', 'wb') as f:
    pickle.dump(model, f)

# Evaluate on the held-out split. Predict once and reuse the result for both
# reports instead of running inference twice.
ytest = np.array(y_test)
y_pred = model.predict(X_test)

# Classification report (precision, recall, F1-score) and confusion matrix;
# rows of the matrix are true labels, columns are predicted labels.
print(classification_report(ytest, y_pred))
print(confusion_matrix(ytest, y_pred))
	import pandas as pd
	import re
	import numpy as np
	import matplotlib.pyplot as plt
	from nltk.corpus import stopwords
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.pipeline import Pipeline
	from sklearn.metrics import classification_report, confusion_matrix
	from sklearn.model_selection import train_test_split
	from sklearn.feature_selection import SelectKBest, chi2
	from sqlite3 import Error
	from sklearn.ensemble import RandomForestClassifier
	import pickle
	import nltk
	import copy
	%matplotlib inline

	# Work on a deep copy so the original `email_data` frame is left untouched
	# (a `label_num` column is added to `df` further down).
	df = copy.deepcopy(email_data)

	# TF-IDF vectoriser: unigrams only, English stop words removed, terms that occur
	# in more than 94% of the documents dropped, sublinear (1 + log tf) scaling of
	# term frequencies, and L1-normalised rows.
	vectorizer = TfidfVectorizer(
		min_df=1,
		max_df=.94,
		stop_words="english",
		sublinear_tf=True,
		norm='l1',
		ngram_range=(1, 1),
	)

	# NOTE: .toarray() materialises the full dense (documents x vocabulary) matrix,
	# which can be memory-heavy on a large corpus.
	final_features = vectorizer.fit_transform(df['cleaned_body']).toarray()

	# A bare `final_features.shape` expression is only echoed when it is the last
	# line of a notebook cell; print it so the shape is reported wherever this runs.
	print(final_features.shape)


	# Split the dataset into a training set and a testing set.
	X = df['cleaned_body']

	# Fixed class embedding: every request type gets a stable integer id.
	# (`Request_Type.factorize()` would also produce ids, but they would depend on
	# the order in which the categories first appear in the data.)
	label_ids = {'Refunds': 0, 'Cancellation': 1, 'Others': 2, 'Amendment': 3, 'Website Error': 4}
	df['label_num'] = df['Request_Type'].map(label_ids)

	# Series.map leaves NaN for every value that is missing from the mapping (typos,
	# new categories, empty cells). Fail here and name the offending values instead
	# of handing NaN labels to the classifier.
	unmapped = df.loc[df['label_num'].isna(), 'Request_Type'].unique()
	if len(unmapped):
		raise ValueError(
			"Request_Type values without a label id: {}".format(list(unmapped))
		)

	Y = df['label_num']

	# 75% / 25% split; random_state pins the shuffle so the split is repeatable.
	# NOTE(review): the split is not stratified -- consider stratify=Y if the
	# request types are imbalanced.
	X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0, test_size=0.25)

	# Instead of doing these steps one at a time, use a pipeline to complete them
	# all at once: vectorise -> select features -> classify.
	# NOTE: Pipeline.fit refits `vectorizer` in place on X_train, so afterwards its
	# vocabulary no longer matches the columns of `final_features`.
	pipeline = Pipeline([
		('vect', vectorizer),
		# Keep the 1000 terms with the highest chi-squared score against the labels.
		# NOTE(review): assumes the training vocabulary has at least 1000 terms --
		# confirm how the installed scikit-learn handles a smaller vocabulary.
		('chi', SelectKBest(chi2, k=1000)),
		# Seeded like the train/test split so the saved model and the reported
		# metrics are reproducible from run to run.
		('clf', RandomForestClassifier(random_state=0)),
	])

	# Fit the model and save it in a pickle for later use.
	# NOTE: only unpickle this file from a trusted location; loading a pickle can
	# execute arbitrary code.
	model = pipeline.fit(X_train, y_train)
	with open('RandomForest.pickle', 'wb') as f:
		# The dump must sit inside the `with` body so the file is open when written.
		pickle.dump(model, f)

	# Evaluate on the held-out split. Predict once and reuse the result for both
	# reports instead of running inference twice.
	ytest = np.array(y_test)
	y_pred = model.predict(X_test)

	# Classification report (precision, recall, F1-score) and confusion matrix;
	# rows of the matrix are true labels, columns are predicted labels.
	print(classification_report(ytest, y_pred))
	print(confusion_matrix(ytest, y_pred))