Created
February 13, 2018 23:39
-
-
Save feliperyan/5036f395a80d38cf9e6999abf041d179 to your computer and use it in GitHub Desktop.
Using scikit-learn to do text categorisation and to save a model and a one-hot encoder
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pickle | |
import pandas as pd | |
import re | |
import numpy as np | |
from sklearn.feature_extraction.text import CountVectorizer | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn import metrics | |
from sklearn.model_selection import train_test_split | |
from time import time | |
# Compiled once at import time: a "word" is a run of three or more ASCII
# lowercase letters.
_WORD_PATTERN = re.compile(r'[a-z]{3,}')


def get_words_for_title(raw_title):
    """Normalise a raw title into a space-separated string of lowercase words.

    Underscores are replaced by spaces, the text is lowercased, and only runs
    of three or more ASCII letters (a-z) are kept. Digits, punctuation,
    non-ASCII characters and shorter tokens are discarded.

    Args:
        raw_title: The title text as a ``str``.

    Returns:
        The kept words joined by single spaces, or ``''`` when no run of
        letters qualifies.
    """
    lowered = raw_title.replace('_', ' ').lower()
    return ' '.join(_WORD_PATTERN.findall(lowered))
# The CSV is decoded as ISO-8859-1 (the original author's workaround for an
# Excel-produced file).
df = pd.read_csv('train_dataset.csv', encoding="ISO-8859-1")

# Run every title (coerced to str) through get_words_for_title and store the
# result next to the raw data in a new 'Cleaned_Title' column.
clean_titles = list(map(get_words_for_title, df['Title'].astype(str)))
df['Cleaned_Title'] = clean_titles

# Cells whose string form is empty become NaN, then every row holding a NaN in
# any column is dropped. This removes the titles that cleaned down to nothing.
df = df.where(cond=(df.astype(str) != '')).dropna()
# Collect the vocabulary. A set keeps exactly one copy of each word seen
# across all cleaned titles.
words = set()
for title in df['Cleaned_Title']:
    words.update(title.split(' '))
print('Unique words:' + str(len(words)))
# CountVectorizer with a fixed vocabulary turns each title into a row of
# per-word counts, one column per vocabulary word. (These are counts, not a
# strict one-hot encoding, because binary=True is not set.)
# The vocabulary is sorted because set iteration order is arbitrary and can
# differ between interpreter runs; sorting keeps the column order reproducible.
# Reference (2018): https://stackoverflow.com/questions/48226506/how-to-get-one-hot-encoding-of-specific-words-in-a-text-in-pandas
cv = CountVectorizer(vocabulary=sorted(words))

# Pickle the vectorizer so it can be reloaded when running predictions.
# The with-block closes the file even if pickling raises.
with open('feature_extractor', 'wb') as f:
    pickle.dump(cv, f)
# Build the training features: vectorize the cleaned titles into a sparse
# matrix of word counts, wrap it in a sparse DataFrame that shares df's index,
# and join it onto df so each row carries its word-count columns.
word_counts = cv.fit_transform(df['Cleaned_Title'])

# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); use whichever this version provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# pd.SparseDataFrame was removed in pandas 1.0; the replacement is the
# DataFrame.sparse accessor. Fall back to the old class on old pandas.
if hasattr(pd.DataFrame, 'sparse'):
    r = pd.DataFrame.sparse.from_spmatrix(
        word_counts, index=df.index, columns=feature_names
    )
else:
    r = pd.SparseDataFrame(
        word_counts, df.index, feature_names, default_fill_value=0
    )
df2 = df.join(r)
# Features and labels are picked by column position in the joined frame:
# columns 5 onward become X and column 3 becomes y.
# NOTE(review): presumably columns 5+ are exactly the joined word-count
# columns and column 3 is the category label -- this depends on the CSV
# layout, confirm against train_dataset.csv.
y = df2.iloc[:, 3]
X = df2.iloc[:, 5:]

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33
)

# Random forest with 100 trees.
clf = RandomForestClassifier(n_estimators=100)
# Fit the classifier on the training split and report the wall-clock time.
t0 = time()
clf.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % (train_time,))

# Predict labels for the held-out split and report how long that took.
t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0
print("test time: %0.3fs" % (test_time,))

# Accuracy: fraction of held-out rows whose predicted label matches y_test.
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print("accuracy: %0.3f" % (score,))
# Persist the trained classifier so it can be reused or shared without
# retraining. The with-block closes the file even if pickling raises.
with open('forest_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment