@feliperyan
Created February 13, 2018 23:39
Using scikit-learn to do text categorisation and to save a model and a one-hot encoder
import pickle
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from time import time

def get_words_for_title(raw_title):
    clean = raw_title.replace('_', ' ')
    clean = clean.lower()
    clean = re.compile('[a-z]{3,}').findall(clean)
    clean = ' '.join(clean)
    return clean
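# Quick illustration of the helper (example input is made up, not from the
# original data): underscores become spaces, everything is lower-cased, and
# only alphabetic runs of three or more letters survive.
# e.g. get_words_for_title('My_Quarterly_Report v2') -> 'quarterly report'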
# ISO-8859-1 encoding to get around Excel's export quirks
df = pd.read_csv('train_dataset.csv', encoding='ISO-8859-1')
# Use the helper function above to do some basic text hygiene, then add the result as a column to the Pandas dataframe
clean_titles = [get_words_for_title(i) for i in df['Title'].astype(str)]
df['Cleaned_Title'] = clean_titles
# Converting blanks to NaN so I can drop titles that ended up empty after removing all the rubbish
df = df.where(cond=(df.astype(str) != ''))
df.dropna(inplace=True)
# Use a Python set as the collection object, since sets only keep unique values, which is what we want.
words = set()
for title in df['Cleaned_Title']:
    for w in title.split(' '):
        words.add(w)
print('Unique words: ' + str(len(words)))
# CountVectorizer is a helper from sklearn that "hot-encodes" the data: it creates a
# "vectorizer" object which I can apply to the raw text to get a hot-encoded version of it.
# I "pickle" it, which means I save it as a file so I can re-use it when I need to run predictions.
# 2018 https://stackoverflow.com/questions/48226506/how-to-get-one-hot-encoding-of-specific-words-in-a-text-in-pandas
cv = CountVectorizer(vocabulary=list(words))
with open('feature_extractor', 'wb') as f:
    pickle.dump(cv, f)
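# Quick illustration (not part of the original gist): because the vocabulary
# is fixed, the vectorizer needs no fitting and can hot-encode new text
# straight away; counts land in the columns of the matching words, e.g.:
#   cv.transform(['quarterly report']).toarray()  # 1 x len(words) row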
# This is where I build the actual dataframe to train on; notice the use of the vectorizer.
# r is the result of transforming the Cleaned_Title column to hot-encode it.
# Then I join the hot-encoded columns back onto the original dataframe.
r = pd.DataFrame.sparse.from_spmatrix(cv.fit_transform(df['Cleaned_Title']),
                                      index=df.index,
                                      columns=cv.get_feature_names_out())
df2 = df.join(r)
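# Sanity check (not in the original gist): the joined frame should be the
# original columns plus one column per vocabulary word.
print('df2 shape: ' + str(df2.shape))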
# Easy-peasy once we prepare the data.
clf = RandomForestClassifier(n_estimators=100)
X = df2.iloc[:, 5:]  # everything after the original columns, i.e. the hot-encoded word features
y = df2.iloc[:, 3]   # the target category column
# Break it down into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Let's time it
t0 = time()
clf.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)
# Let's time running the predictions across the test dataset
t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0
print("test time: %0.3fs" % test_time)
# Print score
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
# Save the model so we can use it in the future without training it again
# and so we can share it.
with open('forest_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)
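# A minimal sketch of reusing the pickled artefacts later, e.g. from a separate
# prediction script ('Reset_Password_Request' is a made-up example title):
with open('feature_extractor', 'rb') as f:
    loaded_cv = pickle.load(f)
with open('forest_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)
new_features = loaded_cv.transform([get_words_for_title('Reset_Password_Request')])
print(loaded_clf.predict(new_features))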