@MickeyPvX
Created September 12, 2018 13:22
Builds a simple bag-of-words model using the nltk library
import pandas as pd
import re
import os
import pydotplus
import numpy as np
import datetime as dt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from os.path import join
from sklearn.model_selection import cross_val_score
"""Script to generate a simple, 'Bag of Words,' model to classify text strings into N classes using a Random Forest classifier
and a Decision Tree classifier for model visualization
Returns:
vocab (dict): Vocabulary of the training data
test_accuracy (float): Model accuracy against test data set
train_cv_accuracy (float): Cross-validation accuracy (default 10-fold)
<filename>.txt (.txt file): Feature Importances text file output (default is top 25)
<filename>.pdf (.pdf file): Decision tree graphic (default is max_nodes=25)
cm (numpy.array): Confusion matrix for model performance on test data
<filename>.xlsx (.xlsx file): Excel workbook with class predictions and probabilities of unlabeled data
"""
def gen_corpus(df, column, extra_pattern=None):
    """Creates a list of cleaned text strings to be modeled from a pandas DataFrame column

    Args:
        df (pandas.DataFrame): DataFrame that contains the text strings to clean
        column (str): Column name that contains the text strings
        extra_pattern (re.Pattern): Optional extra pattern to remove from the text

    Returns:
        list: Cleaned, stemmed, stopword-free text strings
    """
    corpus = []
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))  # build the stopword set once instead of per word
    for i in df.index:
        line = df[column][i]
        if extra_pattern:
            line = re.sub(extra_pattern, '', line)
        line = re.sub('[^a-zA-Z]', ' ', line)  # keep letters only
        line = line.lower().split()
        line = [ps.stem(word) for word in line if word not in stop_words]
        line = ' '.join(line)
        corpus.append(line)
    return corpus
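# Illustrative sketch (hypothetical data, not part of the original script): a tiny DataFrame run
# through gen_corpus with the same HTML-stripping pattern used below. The output is approximate,
# since it depends on the installed stopword list and the Porter stemmer.
#   demo = pd.DataFrame({'text': ['<p>The cats were running quickly!</p>']})
#   gen_corpus(demo, 'text', re.compile('<.*?>'))   # -> roughly ['cat run quickli']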
# Get raw text
bannerpath = '<YOUR DIRECTORY PATH HERE>'
dfTrain = pd.read_csv(join(bannerpath, '<YOUR TRAINING DATA FILE HERE>')) # I'm using .csv files, change to whatever input method you like
dfFuture = pd.read_csv(join(bannerpath, '<YOUR UNLABELED DATA FILE HERE>'))
dfTrain.drop_duplicates(inplace=True)
dfTrain.dropna(inplace=True)
# Cleaning the text
cleantags = re.compile('<.*?>') # To help remove HTML/XML tags from text strings
corpus_future = gen_corpus(dfFuture, '<COLUMN NAME CONTAINING TARGET TEXT>', cleantags)
corpus_train = gen_corpus(dfTrain, '<COLUMN NAME CONTAINING TARGET TEXT>', cleantags)
# Creating Bag of Words Model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5000)
X = cv.fit_transform(corpus_train).toarray()
vocab = cv.vocabulary_
y = dfTrain.iloc[:, -1].values
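# Sketch of the bag-of-words representation (hypothetical mini-corpus, for intuition only):
# each row of X counts how often each vocabulary term appears in one document.
#   CountVectorizer().fit_transform(['cat run', 'cat cat sleep']).toarray()
# would give a vocabulary like {'cat': 0, 'run': 1, 'sleep': 2} and counts [[1, 1, 0], [2, 0, 1]].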
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
# Fitting Classifier to the Training Set
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Need to make sure the graphviz folder is in PATH
vizpath = '{}<PATH TO GRAPHVIZ FOLDER>'.format(os.pathsep)
if vizpath not in os.environ['PATH']:
    os.environ['PATH'] += vizpath
del vizpath
tree = DecisionTreeClassifier(max_leaf_nodes=25) # For visualization purposes
classifier = RandomForestClassifier(n_estimators=501,
                                    criterion='gini',
                                    n_jobs=-1,
                                    random_state=0,
                                    verbose=1)  # Actual model used for classifications
tree.fit(X_train, y_train)
classifier.fit(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
train_cv_accuracy = cross_val_score(classifier, X, y, cv=10).mean()
importances = classifier.feature_importances_
std = np.std([est.feature_importances_ for est in classifier.estimators_], axis=0)  # spread of importances across the forest's trees
indices = np.argsort(importances)[::-1]  # feature indices sorted by descending importance
# Reverse-lookup each top feature's column index in the vocabulary dict to recover the word itself
feature_map = [(f + 1, list(vocab.keys())[list(vocab.values()).index(indices[f])], importances[indices[f]]) for f in range(25)]
with open(os.path.join(bannerpath, '<YOUR FEATURE MAP LOG FILE NAME>.txt'), 'w') as log:
    log.write('{:<}\t{:<10}\t{:>}\n'.format('Rank', 'Feature', 'Importance'))
    for rank, feature, importance in feature_map:
        log.write('{:<3}\t{:10}\t{:>0.4f}\n'.format(str(rank), feature, importance))
features = sorted(vocab.keys())  # CountVectorizer assigns column indices in sorted term order, so this lines up with feature indices
dot_data = export_graphviz(tree, out_file=None,
                           filled=True, rounded=True,
                           special_characters=True,
                           feature_names=features)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf(join(bannerpath, '<YOUR DECISION TREE GRAPHIC FILE NAME>.pdf'))
# Predicting Test Set results
y_pred = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)
# Making the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
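# In scikit-learn's confusion_matrix, rows correspond to true labels and columns to predicted
# labels (both in sorted label order), so cm[i, j] counts test samples of class i predicted as
# class j; the diagonal holds the correctly classified counts.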
# Get unlabeled data
X_future = cv.transform(corpus_future).toarray()
y_future_prob = pd.DataFrame(classifier.predict_proba(X_future), columns=classifier.classes_)
y_future_pred = pd.DataFrame(classifier.predict(X_future), columns=['ClassPred'])
dfOutput = dfFuture.join(y_future_prob).join(y_future_pred)
writer = pd.ExcelWriter(join(bannerpath, '<YOUR OUTPUT FILE NAME>.xlsx'))
dfOutput.to_excel(writer, '<WORKSHEET NAME>', index=False)
writer.save()