# MickeyPvX/NLTK_Simple_BagOfWords.py

## NLTK_Simple_BagOfWords.py
import pandas as pd
import re
import os
import pydotplus
import numpy as np
import datetime as dt

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from os.path import join
from sklearn.model_selection import cross_val_score

"""Script to generate a simple, 'Bag of Words,' model to classify text strings into N classes using a Random Forest classifier
and a Decision Tree classifier for model visualization

Returns:
    vocab (dict): Vocabulary of the training data
    test_accuracy (float): Model accuracy against test data set
    train_cv_accuracy (float): Cross-validation accuracy (default 10-fold)
    <filename>.txt (.txt file): Feature Importances text file output (default is top 25)
    <filename>.pdf (.pdf file): Decision tree graphic (default is max_leaf_nodes=25)
    cm (numpy.array): Confusion matrix for model performance on test data
    <filename>.xlsx (.xlsx file): Excel workbook with class predictions and probabilities of unlabeled data
"""

def gen_corpus(df, column, extra_pattern=None):
    """Creates list of cleaned text strings to be modeled from a pandas DataFrame column

    Each string has ``extra_pattern`` matches removed (when a pattern is given),
    every character other than a-z/A-Z replaced by a space, is lower-cased,
    stripped of English stop words, Porter-stemmed and re-joined with single spaces.

    Args:
        df (pandas.DataFrame): DataFrame that contains text strings to clean
        column (str): Column name that contains text strings
        extra_pattern (str or compiled regex): Optional extra pattern to remove from text

    Returns:
        list: One cleaned string per row of ``df``, in row order
    """
    ps = PorterStemmer()
    # Build the stop-word set once instead of once per word.
    stop_words = set(stopwords.words('english'))
    non_letters = re.compile(r'[^a-zA-Z]')

    corpus = []
    # Iterating the column directly also works when index labels are not unique.
    for line in df[column]:
        # Only this step is optional; previously `line` was left unassigned
        # (UnboundLocalError) whenever no extra pattern was supplied.
        if extra_pattern:
            line = re.sub(extra_pattern, '', line)

        words = non_letters.sub(' ', line).lower().split()
        corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

    return corpus


# Get raw text
bannerpath = '<YOUR DIRECTORY PATH HERE>'

# Both inputs are read as .csv files here; change to whatever input method you like
dfTrain = pd.read_csv(join(bannerpath, '<YOUR TRAINING DATA FILE HERE>'))
dfFuture = pd.read_csv(join(bannerpath, '<YOUR UNLABELED DATA FILE HERE>'))

# Training rows only: discard exact duplicates, then rows with missing values
dfTrain.drop_duplicates(inplace=True)
dfTrain.dropna(inplace=True)

# Cleaning the text (the pattern helps remove HTML/XML tags from text strings);
# the unlabeled corpus is built first, then the training corpus
cleantags = re.compile('<.*?>')
corpus_future, corpus_train = (
    gen_corpus(frame, '<COLUMN NAME CONTAINING TARGET TEXT>', cleantags)
    for frame in (dfFuture, dfTrain)
)

# Creating Bag of Words Model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Class labels are read from the last column of the training frame
y = dfTrain.iloc[:, -1].values

# Vectorizer limited to 5000 features; the count matrix is converted to a dense array
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus_train).toarray()
vocab = cv.vocabulary_

# Splitting the dataset into the Training set and Test set (10% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

# Fitting Classifier to the Training Set
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier

# The graphviz folder needs to be in PATH; append it when it is not there yet
vizpath = '{}<PATH TO GRAPHVIZ FOLDER>'.format(os.pathsep)
if os.environ['PATH'].find(vizpath) < 0:
    os.environ['PATH'] = os.environ['PATH'] + vizpath
del vizpath

# Single tree capped at 25 leaf nodes, kept for visualization purposes
tree = DecisionTreeClassifier(max_leaf_nodes=25)
# Random forest: the actual model used for classifications
classifier = RandomForestClassifier(
    n_estimators=501,
    criterion='gini',
    n_jobs=-1,
    random_state=0,
    verbose=1,
)

tree.fit(X_train, y_train)
classifier.fit(X_train, y_train)

# Accuracy on the held-out test set, then the mean of a 10-fold
# cross-validation run over the full labeled data
test_accuracy = classifier.score(X_test, y_test)
train_cv_accuracy = cross_val_score(
    classifier, X, y, cv=10
).mean()

# Rank the vocabulary terms by their importance in the random forest
importances = classifier.feature_importances_
# The loop variable is deliberately not named `tree`: that name belongs to the
# visualization tree exported below, and a Python 2 list comprehension would
# overwrite it with the last forest estimator.
std = np.std([estimator.feature_importances_ for estimator in classifier.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Invert the vocabulary once (column index -> term) instead of rebuilding and
# scanning the key/value lists for every reported feature.
index_to_term = {col: term for term, col in vocab.items()}

# (rank, term, importance) for the top 25 features; slicing keeps this from
# raising IndexError when the vocabulary holds fewer than 25 terms.
feature_map = [(rank, index_to_term[col], importances[col])
               for rank, col in enumerate(indices[:25], start=1)]

# Tab-separated feature importance report
with open(os.path.join(bannerpath, '<YOUR FEATURE MAP LOG FILE NAME>.txt'), 'w') as log:
    log.write('{:<}\t{:<10}\t{:>}\n'.format('Rank','Feature','Importance'))
    for rank, feature, importance in feature_map:
        log.write('{:<3}\t{:10}\t{:>0.4f}\n'.format(str(rank), feature, importance))

# Feature names ordered by their column index in X, which is the order
# export_graphviz needs in order to label the split nodes
features = sorted(vocab, key=vocab.get)
dot_data = export_graphviz(tree, out_file=None,
                           filled=True, rounded=True,
                           special_characters=True,
                           feature_names=features)

# Render the decision tree graphic as a PDF
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf(join(bannerpath, '<YOUR DECISION TREE GRAPHIC FILE NAME>.pdf'))

# Predicting Test Set results
y_pred = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)

# Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Get unlabeled data
X_future = cv.transform(corpus_future).toarray()

# One probability column per class plus the predicted class; both frames reuse
# dfFuture's index so the joins below line up row by row.
y_future_prob = pd.DataFrame(classifier.predict_proba(X_future),
                             columns=classifier.classes_, index=dfFuture.index)
y_future_pred = pd.DataFrame(classifier.predict(X_future),
                             columns=['ClassPred'], index=dfFuture.index)
dfOutput = dfFuture.join(y_future_prob).join(y_future_pred)

# join() supplies the path separator that plain string concatenation omitted.
# The context manager saves and closes the workbook; ExcelWriter.save() no
# longer exists in pandas 2.x.
with pd.ExcelWriter(join(bannerpath, '<YOUR OUTPUT FILE NAME>.xlsx')) as writer:
    dfOutput.to_excel(writer, sheet_name='<WORKSHEET NAME>', index=False)
	import pandas as pd
	import re
	import os
	import pydotplus
	import numpy as np
	import datetime as dt

	from nltk.corpus import stopwords
	from nltk.stem.porter import PorterStemmer
	from os.path import join
	from sklearn.model_selection import cross_val_score

	"""Script to generate a simple, 'Bag of Words,' model to classify text strings into N classes using a Random Forest classifier
	and a Decision Tree classifier for model visualization

	Returns:
	vocab (dict): Vocabulary of the training data
	test_accuracy (float): Model accuracy against test data set
	train_cv_accuracy (float): Cross-validation accuracy (default 10-fold)
	<filename>.txt (.txt file): Feature Importances text file output (default is top 25)
	<filename>.pdf (.pdf file): Decision tree graphic (default is max_leaf_nodes=25)
	cm (numpy.array): Confusion matrix for model performance on test data
	<filename>.xlsx (.xlsx file): Excel workbook with class predictions and probabilities of unlabeled data
	"""

	def gen_corpus(df, column, extra_pattern=None):
		"""Creates list of cleaned text strings to be modeled from a pandas DataFrame column

		Each string has ``extra_pattern`` matches removed (when a pattern is given),
		every character other than a-z/A-Z replaced by a space, is lower-cased,
		stripped of English stop words, Porter-stemmed and re-joined with single spaces.

		Args:
			df (pandas.DataFrame): DataFrame that contains text strings to clean
			column (str): Column name that contains text strings
			extra_pattern (str or compiled regex): Optional extra pattern to remove from text

		Returns:
			list: One cleaned string per row of ``df``, in row order
		"""
		ps = PorterStemmer()
		# Build the stop-word set once instead of once per word.
		stop_words = set(stopwords.words('english'))
		non_letters = re.compile(r'[^a-zA-Z]')

		corpus = []
		# Iterating the column directly also works when index labels are not unique.
		for line in df[column]:
			# Only this step is optional; `line` must stay defined when no
			# extra pattern is supplied.
			if extra_pattern:
				line = re.sub(extra_pattern, '', line)

			words = non_letters.sub(' ', line).lower().split()
			corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

		return corpus


	# Get raw text
	bannerpath = '<YOUR DIRECTORY PATH HERE>'

	# Both inputs are read as .csv files here; change to whatever input method you like
	dfTrain = pd.read_csv(join(bannerpath, '<YOUR TRAINING DATA FILE HERE>'))
	dfFuture = pd.read_csv(join(bannerpath, '<YOUR UNLABELED DATA FILE HERE>'))

	# Training rows only: discard exact duplicates, then rows with missing values
	dfTrain.drop_duplicates(inplace=True)
	dfTrain.dropna(inplace=True)

	# Cleaning the text (the pattern helps remove HTML/XML tags from text strings);
	# the unlabeled corpus is built first, then the training corpus
	cleantags = re.compile('<.*?>')
	corpus_future, corpus_train = (
		gen_corpus(frame, '<COLUMN NAME CONTAINING TARGET TEXT>', cleantags)
		for frame in (dfFuture, dfTrain)
	)

	# Creating Bag of Words Model
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.model_selection import train_test_split

	# Class labels are read from the last column of the training frame
	y = dfTrain.iloc[:, -1].values

	# Vectorizer limited to 5000 features; the count matrix is converted to a dense array
	cv = CountVectorizer(max_features=5000)
	X = cv.fit_transform(corpus_train).toarray()
	vocab = cv.vocabulary_

	# Splitting the dataset into the Training set and Test set (10% held out, fixed seed)
	X_train, X_test, y_train, y_test = train_test_split(
		X,
		y,
		test_size=0.1,
		random_state=0,
	)

	# Fitting Classifier to the Training Set
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.tree import DecisionTreeClassifier, export_graphviz

	# Need to make sure the graphviz folder is in PATH
	vizpath = '{}<PATH TO GRAPHVIZ FOLDER>'.format(os.pathsep)
	if vizpath not in os.environ['PATH']:
		# Body re-indented: the flattened paste left this `if` without a block.
		os.environ['PATH'] += vizpath

	del vizpath

	# Single tree capped at 25 leaf nodes, kept for visualization purposes
	tree = DecisionTreeClassifier(max_leaf_nodes=25)
	# Random forest: the actual model used for classifications
	classifier = RandomForestClassifier(n_estimators=501,
										criterion='gini',
										n_jobs=-1,
										random_state=0,
										verbose=1)

	tree.fit(X_train, y_train)

	classifier.fit(X_train, y_train)
	# Accuracy on the held-out test set, then the mean of a 10-fold
	# cross-validation run over the full labeled data
	test_accuracy = classifier.score(X_test, y_test)
	train_cv_accuracy = cross_val_score(classifier, X, y, cv=10).mean()

	# Rank the vocabulary terms by their importance in the random forest
	importances = classifier.feature_importances_
	# The loop variable is deliberately not named `tree`: that name belongs to the
	# visualization tree exported below, and a Python 2 list comprehension would
	# overwrite it with the last forest estimator.
	std = np.std([estimator.feature_importances_ for estimator in classifier.estimators_], axis=0)
	indices = np.argsort(importances)[::-1]

	# Invert the vocabulary once (column index -> term) instead of rebuilding and
	# scanning the key/value lists for every reported feature.
	index_to_term = {col: term for term, col in vocab.items()}

	# (rank, term, importance) for the top 25 features; slicing keeps this from
	# raising IndexError when the vocabulary holds fewer than 25 terms.
	feature_map = [(rank, index_to_term[col], importances[col])
				   for rank, col in enumerate(indices[:25], start=1)]

	# Tab-separated feature importance report (block bodies re-indented)
	with open(os.path.join(bannerpath, '<YOUR FEATURE MAP LOG FILE NAME>.txt'), 'w') as log:
		log.write('{:<}\t{:<10}\t{:>}\n'.format('Rank','Feature','Importance'))
		for rank, feature, importance in feature_map:
			log.write('{:<3}\t{:10}\t{:>0.4f}\n'.format(str(rank), feature, importance))

	# Feature names ordered by their column index in X, which is the order
	# export_graphviz needs in order to label the split nodes
	features = sorted(vocab, key=vocab.get)
	dot_data = export_graphviz(tree, out_file=None,
							   filled=True, rounded=True,
							   special_characters=True,
							   feature_names=features)

	# Render the decision tree graphic as a PDF
	graph = pydotplus.graph_from_dot_data(dot_data)
	graph.write_pdf(join(bannerpath, '<YOUR DECISION TREE GRAPHIC FILE NAME>.pdf'))

	# Predicting Test Set results
	y_pred = classifier.predict(X_test)
	y_prob = classifier.predict_proba(X_test)

	# Making the confusion matrix
	from sklearn.metrics import confusion_matrix
	cm = confusion_matrix(y_test, y_pred)

	# Get unlabeled data
	X_future = cv.transform(corpus_future).toarray()

	# One probability column per class plus the predicted class; both frames reuse
	# dfFuture's index so the joins below line up row by row.
	y_future_prob = pd.DataFrame(classifier.predict_proba(X_future),
								 columns=classifier.classes_, index=dfFuture.index)
	y_future_pred = pd.DataFrame(classifier.predict(X_future),
								 columns=['ClassPred'], index=dfFuture.index)
	dfOutput = dfFuture.join(y_future_prob).join(y_future_pred)

	# join() supplies the path separator that plain string concatenation omitted.
	# The context manager saves and closes the workbook; ExcelWriter.save() no
	# longer exists in pandas 2.x.
	with pd.ExcelWriter(join(bannerpath, '<YOUR OUTPUT FILE NAME>.xlsx')) as writer:
		dfOutput.to_excel(writer, sheet_name='<WORKSHEET NAME>', index=False)