Skip to content

Instantly share code, notes, and snippets.

@feliperyan
Created February 14, 2018 00:08
Show Gist options
  • Save feliperyan/851cc0e43beaa6fa959ef2d4a86eb2cb to your computer and use it in GitHub Desktop.
Save feliperyan/851cc0e43beaa6fa959ef2d4a86eb2cb to your computer and use it in GitHub Desktop.
Jupyter notebook showing how to load a pickled model and vectoriser and apply them to new data. It produces two CSV files: one with the predicted data, and one with the rows whose titles contain no words the model has seen before.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"import pandas as pd\n",
"import re\n",
"import numpy as np\n",
"\n",
"# Load the trained classifier and the fitted feature extractor from disk.\n",
"# NOTE: pickle.load can execute arbitrary code, so only unpickle files that\n",
"# come from a trusted source.\n",
"# The with-blocks close each file even if unpickling raises.\n",
"with open('forest_classifier.pkl', 'rb') as f:\n",
"    forest = pickle.load(f)\n",
"with open('feature_extractor.pkl', 'rb') as f:\n",
"    vectorizer = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def get_words_for_title(raw_title):\n",
"    \"\"\"Reduce a raw title to its lower-case words, joined by single spaces.\n",
"\n",
"    Underscores are treated as spaces, the text is lower-cased, and only\n",
"    runs of three or more letters a-z are kept; everything else (digits,\n",
"    punctuation, shorter runs) is discarded.\n",
"    \"\"\"\n",
"    lowered = raw_title.replace('_', ' ').lower()\n",
"    return ' '.join(re.findall('[a-z]{3,}', lowered))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" SFDC LeadID Title Job Level Job Function \\\n",
"74 00Q3000001ApXeHEAV 0WNER Executive Management Uncategorized \n",
"85 00Q3000001Eu91OEAR aaa Uncategorized Uncategorized \n",
"86 00Q3000001AmRxYEAV aaa Uncategorized Uncategorized \n",
"87 00Q30000017p9xuEAA aaa Uncategorized Uncategorized \n",
"89 00Q3000001CDloKEAT abc Uncategorized Uncategorized \n",
"\n",
" Cleaned_Title \n",
"74 wner \n",
"85 aaa \n",
"86 aaa \n",
"87 aaa \n",
"89 abc \n"
]
}
],
"source": [
"# Load the new, uncategorised data; expect the free-text titles to be dirty.\n",
"df = pd.read_csv('job_funtion_uncat.csv')\n",
"\n",
"# Titles are cast to str first so missing values do not break the cleaner.\n",
"df['Cleaned_Title'] = df['Title'].astype(str).map(get_words_for_title)\n",
"\n",
"# Mask empty strings to NaN, then drop every row that has a NaN in any column.\n",
"df = df.where(cond=(df.astype(str) != '')).dropna()\n",
"\n",
"print(df.head())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" SFDC LeadID Title Job Level Job Function \\\n",
"91 00Q30000015IF2REAW acc Uncategorized Uncategorized \n",
"94 00Q30000018HdN0EAK Account Exec Manager Level Uncategorized \n",
"96 00Q30000010Fql7EAC Account Manager Manager Level Uncategorized \n",
"97 00Q3000000zVD5MEAW Account Manager Manager Level Uncategorized \n",
"98 00Q30000011lOJXEA2 Account Manager Staff Level Uncategorized \n",
"\n",
" Cleaned_Title \n",
"91 acc \n",
"94 account exec \n",
"96 account manager \n",
"97 account manager \n",
"98 account manager \n"
]
}
],
"source": [
"# Lots of nonsensical titles remain; they get cut off by the vectorizer, which\n",
"# saw good data, pre-cleaned by humans, when the model was trained initially.\n",
"\n",
"X = vectorizer.transform(df['Cleaned_Title'])\n",
"\n",
"# Count the non-zero features of every row while X is still sparse, instead\n",
"# of building a dense copy of the whole bag-of-words matrix with todense().\n",
"matches_per_row = np.asarray((X != 0).sum(axis=1)).ravel()\n",
"\n",
"# Positions of the rows where nothing matched the bag of words.\n",
"to_drop = np.where(matches_per_row == 0)[0]\n",
"bad = df.iloc[to_drop]\n",
"\n",
"# Index with the 1-D position array itself: nesting it in another list\n",
"# (df.index[[to_drop]]) requests multi-dimensional indexing, which newer\n",
"# NumPy/pandas releases no longer accept.\n",
"# NOTE: df is rebound here, so re-running this cell on its own finds nothing\n",
"# left to drop and leaves 'bad' empty; re-run the loading cell first.\n",
"df = df.drop(df.index[to_drop])\n",
"\n",
"# Should have gotten rid of those rubbish titles.\n",
"print(df.head())\n",
"\n",
"# Vectorize the remaining rows again so X lines up with df row for row.\n",
"X = vectorizer.transform(df['Cleaned_Title'])\n",
"\n",
"y = forest.predict(X)\n",
"df['Predicted'] = y"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Save the predictions and the rows the vectorizer could not match.\n",
"for frame, csv_path in ((df, 'new_pred.csv'), (bad, 'cannot_categorise.csv')):\n",
"    frame.to_csv(csv_path)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment