feliperyan/make_model.py

## make_model.py
import pickle
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from time import time

def get_words_for_title(raw_title):
    """Normalise a raw title into a string of space-separated lowercase words.

    Underscores are replaced by spaces and the text is lowercased. Only runs
    of three or more letters ``a``-``z`` survive; digits, punctuation and
    shorter tokens are dropped. The surviving words are joined with single
    spaces, so an input with no such run yields an empty string.
    """
    lowered = raw_title.replace('_', ' ').lower()
    tokens = re.findall('[a-z]{3,}', lowered)
    return ' '.join(tokens)

# Decode the CSV as ISO-8859-1 to work around the encoding of the
# Excel-exported file.
df = pd.read_csv('train_dataset.csv', encoding="ISO-8859-1")

# Run every title (cast to str first) through get_words_for_title and keep the
# normalised text in a new 'Cleaned_Title' column.
df['Cleaned_Title'] = list(map(get_words_for_title, df['Title'].astype(str)))

# Mask every cell whose string form is empty (for example a title that cleaned
# down to nothing) to NaN, then discard each row that contains a NaN in any
# column.
has_content = df.astype(str) != ''
df = df.where(cond=has_content)
df.dropna(inplace=True)

# Gather the distinct words across all remaining cleaned titles; a set keeps
# each word only once.
words = set()
for cleaned in df['Cleaned_Title']:
    words.update(cleaned.split(' '))

print('Unique words:' + str(len(words)))

# CountVectorizer turns each cleaned title into a vector of per-word counts
# over a fixed vocabulary (a bag-of-words encoding).
# The vocabulary is sorted so the feature columns come out in the same order on
# every run; iterating a set of strings directly gives no such guarantee.
# The vectorizer is pickled to 'feature_extractor' so the same encoding can be
# re-applied later when running predictions.
# 2018 https://stackoverflow.com/questions/48226506/how-to-get-one-hot-encoding-of-specific-words-in-a-text-in-pandas
cv = CountVectorizer(vocabulary=sorted(words))
# 'with' closes the file even if pickling raises.
with open('feature_extractor', 'wb') as f:
    pickle.dump(cv, f)

# Encode the cleaned titles with the vectorizer and attach the resulting
# per-word columns to the original rows (aligned on df's index); df2 is the
# frame used for training.
# pd.SparseDataFrame was removed in pandas 1.0 and
# CountVectorizer.get_feature_names was removed in scikit-learn 1.2, so their
# replacements are used here, falling back to the old accessor on scikit-learn
# releases that predate get_feature_names_out.
encoded = cv.fit_transform(df['Cleaned_Title'])
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
r = pd.DataFrame.sparse.from_spmatrix(encoded, index=df.index, columns=feature_names)
df2 = df.join(r)

# Features are every column from position 5 onward and the label is the column
# at position 3.
# NOTE(review): presumably position 5 onward is exactly the word columns added
# by the join; this depends on the CSV's column layout -- confirm.
clf = RandomForestClassifier(n_estimators=100)
X = df2.iloc[:, 5:]
y = df2.iloc[:, 3]

# Hold out 33% of the rows for testing; the fixed random_state keeps the split
# itself repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Time the training run.
t0 = time()
clf.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)

# Time the predictions over the held-out rows.
t0 = time()
pred = clf.predict(X_test)
test_time = time() - t0
print("test time:  %0.3fs" % test_time)

# Report accuracy on the held-out rows.
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

# Persist the trained model so it can be reused or shared without retraining.
# 'with' closes the file even if pickling raises.
with open('forest_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)
	import pickle
	import pandas as pd
	import re
	import numpy as np
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.ensemble import RandomForestClassifier
	from sklearn import metrics
	from sklearn.model_selection import train_test_split
	from time import time

	def get_words_for_title(raw_title):
		"""Normalise a raw title into a string of space-separated lowercase words.

		Underscores are replaced by spaces and the text is lowercased. Only
		runs of three or more letters ``a``-``z`` survive; digits, punctuation
		and shorter tokens are dropped. The surviving words are joined with
		single spaces, so an input with no such run yields an empty string.
		"""
		clean = raw_title.replace('_', ' ')
		clean = clean.lower()
		clean = re.compile('[a-z]{3,}').findall(clean)
		clean = ' '.join(clean)

		return clean

	# Decode the CSV as ISO-8859-1 to work around the encoding of the
	# Excel-exported file.
	df = pd.read_csv('train_dataset.csv', encoding = "ISO-8859-1")

	# Run every title (cast to str first) through get_words_for_title and keep
	# the normalised text in a new 'Cleaned_Title' column.
	clean_titles = [get_words_for_title(i) for i in df['Title'].astype(str)]
	df['Cleaned_Title'] = clean_titles

	# Mask every cell whose string form is empty (for example a title that
	# cleaned down to nothing) to NaN, then discard each row that contains a
	# NaN in any column.
	df = df.where(cond=(df.astype(str) != ''))
	df.dropna(inplace=True)

	# Gather the distinct words across all remaining cleaned titles; a set
	# keeps each word only once.
	words = set()
	for title in df['Cleaned_Title']:
		for w in title.split(' '):
			words.add(w)

	print ('Unique words:' + str(len(words)))

	# CountVectorizer turns each cleaned title into a vector of per-word counts
	# over a fixed vocabulary (a bag-of-words encoding).
	# The vocabulary is sorted so the feature columns come out in the same order
	# on every run; iterating a set of strings directly gives no such guarantee.
	# The vectorizer is pickled to 'feature_extractor' so the same encoding can
	# be re-applied later when running predictions.
	# 2018 https://stackoverflow.com/questions/48226506/how-to-get-one-hot-encoding-of-specific-words-in-a-text-in-pandas
	cv = CountVectorizer(vocabulary=sorted(words))
	# 'with' closes the file even if pickling raises.
	with open('feature_extractor', 'wb') as f:
		pickle.dump(cv, f)

	# Encode the cleaned titles with the vectorizer and attach the resulting
	# per-word columns to the original rows (aligned on df's index); df2 is the
	# frame used for training.
	# pd.SparseDataFrame was removed in pandas 1.0 and
	# CountVectorizer.get_feature_names was removed in scikit-learn 1.2, so
	# their replacements are used here, falling back to the old accessor on
	# scikit-learn releases that predate get_feature_names_out.
	encoded = cv.fit_transform(df['Cleaned_Title'])
	if hasattr(cv, 'get_feature_names_out'):
		feature_names = cv.get_feature_names_out()
	else:
		feature_names = cv.get_feature_names()
	r = pd.DataFrame.sparse.from_spmatrix(encoded, index=df.index, columns=feature_names)
	df2 = df.join(r)

	# Features are every column from position 5 onward and the label is the
	# column at position 3.
	# NOTE(review): presumably position 5 onward is exactly the word columns
	# added by the join; this depends on the CSV's column layout -- confirm.
	clf = RandomForestClassifier(n_estimators=100)
	X = df2.iloc[:, 5:]
	y = df2.iloc[:, 3]

	# Hold out 33% of the rows for testing; the fixed random_state keeps the
	# split itself repeatable.
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

	# Time the training run.
	t0 = time()
	clf.fit(X_train, y_train)
	train_time = time() - t0
	print("train time: %0.3fs" % train_time)

	# Time the predictions over the held-out rows.
	t0 = time()
	pred = clf.predict(X_test)
	test_time = time() - t0
	print("test time: %0.3fs" % test_time)

	# Report accuracy on the held-out rows.
	score = metrics.accuracy_score(y_test, pred)
	print("accuracy: %0.3f" % score)

	# Persist the trained model so it can be reused or shared without
	# retraining. 'with' closes the file even if pickling raises.
	with open('forest_classifier.pkl', 'wb') as f:
		pickle.dump(clf, f)