Created
September 12, 2018 13:22
-
-
Save MickeyPvX/34dd5cd1c102f4bcb057c0d23a8c4c96 to your computer and use it in GitHub Desktop.
Builds a simple bag-of-words model using the nltk library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import re | |
import os | |
import pydotplus | |
import numpy as np | |
import datetime as dt | |
from nltk.corpus import stopwords | |
from nltk.stem.porter import PorterStemmer | |
from os.path import join | |
from sklearn.model_selection import cross_val_score | |
"""Script to generate a simple, 'Bag of Words,' model to classify text strings into N classes using a Random Forest classifier | |
and a Decision Tree classifier for model visualization | |
Returns: | |
vocab (dict): Vocabulary of the training data | |
test_accuracy (float): Model accuracy against test data set | |
train_cv_accuracy (float): Cross-validation accuracy (default 10-fold) | |
<filename>.txt (.txt file): Feature Importances text file output (default is top 25) | |
<filename>.pdf (.pdf file): Decision tree graphic (default is max_nodes=25) | |
cm (numpy.array): Confusion matrix for model performance on test data | |
<filename>.xlsx (.xlsx file): Excel workbook with class predictions and probabilities of unlabeled data | |
""" | |
def gen_corpus(df, column, extra_pattern=None, *, stemmer=None, stop_words=None):
    """Creates list of cleaned text strings to be modeled from a pandas DataFrame column

    For every row the text is optionally stripped of ``extra_pattern`` matches,
    reduced to the letters a-z/A-Z, lower-cased, filtered against the stop-word
    list and stemmed. Exactly one output string is produced per row, in row
    order, so the result stays aligned with ``df``.

    Args:
        df (pandas.DataFrame): DataFrame that contains text strings to clean
        column (str): Column name that contains text strings
        extra_pattern (str or compiled pattern): Optional extra pattern to remove from text
        stemmer: Optional object exposing ``stem(word)``; defaults to a new ``PorterStemmer()``
        stop_words (iterable of str): Optional words to drop before stemming;
            defaults to ``stopwords.words('english')``

    Returns:
        list of str: Cleaned, space-joined tokens, one entry per row of ``df``
    """
    if stemmer is None:
        stemmer = PorterStemmer()
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once instead of once per word.
    stop_words = frozenset(stop_words)
    non_letters = re.compile('[^a-zA-Z]')

    corpus = []
    for raw in df[column]:
        # Missing values become empty documents so the output keeps one entry per row.
        text = '' if pd.isna(raw) else str(raw)
        # Only the extra-pattern removal is optional; the rest always runs.
        if extra_pattern:
            text = re.sub(extra_pattern, '', text)
        words = non_letters.sub(' ', text).lower().split()
        corpus.append(' '.join(stemmer.stem(word) for word in words if word not in stop_words))
    return corpus
# --- Load the raw text -------------------------------------------------------
# CSV files are read here; swap in whatever input method suits your data.
bannerpath = '<YOUR DIRECTORY PATH HERE>'
dfTrain = pd.read_csv(join(bannerpath, '<YOUR TRAINING DATA FILE HERE>'))
dfFuture = pd.read_csv(join(bannerpath, '<YOUR UNLABELED DATA FILE HERE>'))

# Discard repeated rows, then rows with missing values, from the training data only.
dfTrain = dfTrain.drop_duplicates().dropna()

# --- Clean the text ----------------------------------------------------------
# Non-greedy match of anything between angle brackets, used to strip HTML/XML tags.
cleantags = re.compile('<.*?>')
corpus_future = gen_corpus(dfFuture, '<COLUMN NAME CONTAINING TARGET TEXT>', cleantags)
corpus_train = gen_corpus(dfTrain, '<COLUMN NAME CONTAINING TARGET TEXT>', cleantags)
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Bag-of-words features: a dense token-count matrix capped at 5000 vocabulary terms.
cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus_train).toarray()
vocab = cv.vocabulary_

# Labels are taken from the last column of the training frame.
y = dfTrain.iloc[:, -1].values

# Hold out 10% of the rows as a test set; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.1,
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# The graphviz folder has to be reachable through PATH; append it when absent.
vizpath = '{}<PATH TO GRAPHVIZ FOLDER>'.format(os.pathsep)
if vizpath not in os.environ['PATH']:
    os.environ['PATH'] = os.environ['PATH'] + vizpath
del vizpath

# Small single tree, fitted only so that it can be drawn later.
tree = DecisionTreeClassifier(max_leaf_nodes=25).fit(X_train, y_train)

# Random forest: the model actually used for classification.
classifier = RandomForestClassifier(
    n_estimators=501,
    criterion='gini',
    n_jobs=-1,
    random_state=0,
    verbose=1,
).fit(X_train, y_train)

# Accuracy on the held-out split, and mean 10-fold cross-validation accuracy on all data.
test_accuracy = classifier.score(X_test, y_test)
cv_scores = cross_val_score(classifier, X, y, cv=10)
train_cv_accuracy = cv_scores.mean()
# Rank the vocabulary columns by the forest's feature importance, highest first.
importances = classifier.feature_importances_
std = np.std([est.feature_importances_ for est in classifier.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Invert the vocabulary once (column index -> term) rather than rebuilding
# key/value lists for every ranked feature.
index_to_term = {col: term for term, col in vocab.items()}

# Report at most 25 features; a small vocabulary may contain fewer.
top_n = min(25, len(indices))
feature_map = [(rank, index_to_term[col], importances[col])
               for rank, col in enumerate(indices[:top_n], start=1)]

# Tab-separated log: rank, term, importance to four decimals.
with open(join(bannerpath, '<YOUR FEATURE MAP LOG FILE NAME>.txt'), 'w') as log:
    log.write('{:<}\t{:<10}\t{:>}\n'.format('Rank','Feature','Importance'))
    for rank, feature, importance in feature_map:
        log.write('{:<3}\t{:10}\t{:>0.4f}\n'.format(str(rank), feature, importance))
# Draw the visualization tree and save it as a PDF.
# NOTE(review): the alphabetically sorted terms are used as column names; this
# assumes CountVectorizer numbers its columns in sorted-term order - confirm.
features = sorted(vocab.keys())
dot_data = export_graphviz(
    tree,
    out_file=None,  # None: the DOT source is returned as a string
    feature_names=features,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
pdf_path = join(bannerpath, '<YOUR DECISION TREE GRAPHIC FILE NAME>.pdf')
graph.write_pdf(pdf_path)
from sklearn.metrics import confusion_matrix

# Evaluate on the held-out split: predicted labels and per-class probabilities.
y_pred = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)

# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
# Vectorize the unlabeled text with the vocabulary already fitted on the training data.
X_future = cv.transform(corpus_future).toarray()

# One probability column per class plus the predicted class. Both frames reuse
# dfFuture's index so the index-based joins below stay row-aligned.
y_future_prob = pd.DataFrame(classifier.predict_proba(X_future),
                             columns=classifier.classes_,
                             index=dfFuture.index)
y_future_pred = pd.DataFrame(classifier.predict(X_future),
                             columns=['ClassPred'],
                             index=dfFuture.index)
dfOutput = dfFuture.join(y_future_prob).join(y_future_pred)

# join() supplies the path separator that plain string concatenation dropped.
# The context manager saves and closes the workbook (ExcelWriter.save() no
# longer exists in pandas 2.x), and sheet_name is passed by keyword.
with pd.ExcelWriter(join(bannerpath, '<YOUR OUTPUT FILE NAME>.xlsx')) as writer:
    dfOutput.to_excel(writer, sheet_name='<WORKSHEET NAME>', index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment