feliperyan/load_modle_from_pickle_v2.ipynb

## load_modle_from_pickle_v2.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import re\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# SECURITY: pickle.load can execute arbitrary code while unpickling,\n",
    "# so only point these paths at model files you created or trust.\n",
    "# The with-blocks close each file even if unpickling raises.\n",
    "with open('forest_classifier.pkl', 'rb') as f:\n",
    "    forest = pickle.load(f)\n",
    "with open('feature_extractor.pkl', 'rb') as f:\n",
    "    vectorizer = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_words_for_title(raw_title):\n",
    "    \"\"\"Reduce a raw title string to its lowercase words.\n",
    "\n",
    "    Underscores are treated as spaces, the text is lower-cased, and every\n",
    "    run of three or more letters a-z is kept. The surviving words are\n",
    "    returned joined by single spaces (an empty string if none survive).\n",
    "    \"\"\"\n",
    "    lowered = raw_title.replace('_', ' ').lower()\n",
    "    words = re.findall('[a-z]{3,}', lowered)\n",
    "    return ' '.join(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           SFDC LeadID  Title             Job Level   Job Function  \\\n",
      "74  00Q3000001ApXeHEAV  0WNER  Executive Management  Uncategorized   \n",
      "85  00Q3000001Eu91OEAR    aaa         Uncategorized  Uncategorized   \n",
      "86  00Q3000001AmRxYEAV    aaa         Uncategorized  Uncategorized   \n",
      "87  00Q30000017p9xuEAA    aaa         Uncategorized  Uncategorized   \n",
      "89  00Q3000001CDloKEAT    abc         Uncategorized  Uncategorized   \n",
      "\n",
      "   Cleaned_Title  \n",
      "74          wner  \n",
      "85           aaa  \n",
      "86           aaa  \n",
      "87           aaa  \n",
      "89           abc  \n"
     ]
    }
   ],
   "source": [
    "# Load the new, uncategorised leads. Expect dirty data in here.\n",
    "df = pd.read_csv('job_funtion_uncat.csv')\n",
    "\n",
    "# Normalise every title (cast to str first so non-string cells are accepted).\n",
    "clean_titles = list(map(get_words_for_title, df['Title'].astype(str)))\n",
    "df['Cleaned_Title'] = clean_titles\n",
    "\n",
    "# Blank cells become NaN, then any row holding a NaN is discarded.\n",
    "is_filled = df.astype(str) != ''\n",
    "df = df.where(cond=is_filled).dropna()\n",
    "\n",
    "print(df.iloc[:5, :])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           SFDC LeadID            Title      Job Level   Job Function  \\\n",
      "91  00Q30000015IF2REAW              acc  Uncategorized  Uncategorized   \n",
      "94  00Q30000018HdN0EAK     Account Exec  Manager Level  Uncategorized   \n",
      "96  00Q30000010Fql7EAC  Account Manager  Manager Level  Uncategorized   \n",
      "97  00Q3000000zVD5MEAW  Account Manager  Manager Level  Uncategorized   \n",
      "98  00Q30000011lOJXEA2  Account Manager    Staff Level  Uncategorized   \n",
      "\n",
      "      Cleaned_Title  \n",
      "91              acc  \n",
      "94     account exec  \n",
      "96  account manager  \n",
      "97  account manager  \n",
      "98  account manager  \n"
     ]
    }
   ],
   "source": [
    "# Titles that match nothing in the vectorizer's bag of words produce an\n",
    "# all-zero feature row; set those aside instead of predicting on them.\n",
    "X = vectorizer.transform(df['Cleaned_Title'])\n",
    "\n",
    "# Count non-zero features per row on the transform result itself rather than\n",
    "# densifying it with todense() first.\n",
    "# NOTE(review): assumes X is a SciPy sparse matrix, as the previous\n",
    "# todense() call implied - confirm against the pickled vectorizer.\n",
    "nonzero_per_row = np.asarray((X != 0).sum(axis=1)).ravel()\n",
    "to_drop = np.flatnonzero(nonzero_per_row == 0)\n",
    "bad = df.iloc[to_drop].copy()\n",
    "\n",
    "# to_drop holds row positions, so map them to index labels before dropping.\n",
    "# Index with the array directly: the former df.index[[to_drop]] wrapped it in\n",
    "# an extra list, which current NumPy treats as a 2-D indexer.\n",
    "df = df.drop(df.index[to_drop])\n",
    "\n",
    "# Should have gotten rid of those rubbish titles.\n",
    "print(df.iloc[:5, :])\n",
    "\n",
    "# Re-vectorize the surviving rows and attach the model's prediction.\n",
    "X = vectorizer.transform(df['Cleaned_Title'])\n",
    "y = forest.predict(X)\n",
    "df['Predicted'] = y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the contents of df and of bad to their own CSV files.\n",
    "# index=True is the to_csv default, spelled out: row labels are written too.\n",
    "df.to_csv('new_pred.csv', index=True)\n",
    "bad.to_csv('cannot_categorise.csv', index=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pickle\n",
	"import re\n",
	"\n",
	"import numpy as np\n",
	"import pandas as pd\n",
	"\n",
	"# SECURITY: pickle.load can execute arbitrary code while unpickling,\n",
	"# so only point these paths at model files you created or trust.\n",
	"# The with-blocks close each file even if unpickling raises.\n",
	"with open('forest_classifier.pkl', 'rb') as f:\n",
	"    forest = pickle.load(f)\n",
	"with open('feature_extractor.pkl', 'rb') as f:\n",
	"    vectorizer = pickle.load(f)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"def get_words_for_title(raw_title):\n",
	"    \"\"\"Reduce a raw title string to its lowercase words.\n",
	"\n",
	"    Underscores are treated as spaces, the text is lower-cased, and every\n",
	"    run of three or more letters a-z is kept. The surviving words are\n",
	"    returned joined by single spaces (an empty string if none survive).\n",
	"    \"\"\"\n",
	"    lowered = raw_title.replace('_', ' ').lower()\n",
	"    words = re.findall('[a-z]{3,}', lowered)\n",
	"    return ' '.join(words)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" SFDC LeadID Title Job Level Job Function \\\n",
	"74 00Q3000001ApXeHEAV 0WNER Executive Management Uncategorized \n",
	"85 00Q3000001Eu91OEAR aaa Uncategorized Uncategorized \n",
	"86 00Q3000001AmRxYEAV aaa Uncategorized Uncategorized \n",
	"87 00Q30000017p9xuEAA aaa Uncategorized Uncategorized \n",
	"89 00Q3000001CDloKEAT abc Uncategorized Uncategorized \n",
	"\n",
	" Cleaned_Title \n",
	"74 wner \n",
	"85 aaa \n",
	"86 aaa \n",
	"87 aaa \n",
	"89 abc \n"
	]
	}
	],
	"source": [
	"# Load the new, uncategorised leads. Expect dirty data in here.\n",
	"df = pd.read_csv('job_funtion_uncat.csv')\n",
	"\n",
	"# Normalise every title (cast to str first so non-string cells are accepted).\n",
	"clean_titles = list(map(get_words_for_title, df['Title'].astype(str)))\n",
	"df['Cleaned_Title'] = clean_titles\n",
	"\n",
	"# Blank cells become NaN, then any row holding a NaN is discarded.\n",
	"is_filled = df.astype(str) != ''\n",
	"df = df.where(cond=is_filled).dropna()\n",
	"\n",
	"print(df.iloc[:5, :])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" SFDC LeadID Title Job Level Job Function \\\n",
	"91 00Q30000015IF2REAW acc Uncategorized Uncategorized \n",
	"94 00Q30000018HdN0EAK Account Exec Manager Level Uncategorized \n",
	"96 00Q30000010Fql7EAC Account Manager Manager Level Uncategorized \n",
	"97 00Q3000000zVD5MEAW Account Manager Manager Level Uncategorized \n",
	"98 00Q30000011lOJXEA2 Account Manager Staff Level Uncategorized \n",
	"\n",
	" Cleaned_Title \n",
	"91 acc \n",
	"94 account exec \n",
	"96 account manager \n",
	"97 account manager \n",
	"98 account manager \n"
	]
	}
	],
	"source": [
	"# Titles that match nothing in the vectorizer's bag of words produce an\n",
	"# all-zero feature row; set those aside instead of predicting on them.\n",
	"X = vectorizer.transform(df['Cleaned_Title'])\n",
	"\n",
	"# Count non-zero features per row on the transform result itself rather than\n",
	"# densifying it with todense() first.\n",
	"# NOTE(review): assumes X is a SciPy sparse matrix, as the previous\n",
	"# todense() call implied - confirm against the pickled vectorizer.\n",
	"nonzero_per_row = np.asarray((X != 0).sum(axis=1)).ravel()\n",
	"to_drop = np.flatnonzero(nonzero_per_row == 0)\n",
	"bad = df.iloc[to_drop].copy()\n",
	"\n",
	"# to_drop holds row positions, so map them to index labels before dropping.\n",
	"# Index with the array directly: the former df.index[[to_drop]] wrapped it in\n",
	"# an extra list, which current NumPy treats as a 2-D indexer.\n",
	"df = df.drop(df.index[to_drop])\n",
	"\n",
	"# Should have gotten rid of those rubbish titles.\n",
	"print(df.iloc[:5, :])\n",
	"\n",
	"# Re-vectorize the surviving rows and attach the model's prediction.\n",
	"X = vectorizer.transform(df['Cleaned_Title'])\n",
	"y = forest.predict(X)\n",
	"df['Predicted'] = y"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Write the contents of df and of bad to their own CSV files.\n",
	"# index=True is the to_csv default, spelled out: row labels are written too.\n",
	"df.to_csv('new_pred.csv', index=True)\n",
	"bad.to_csv('cannot_categorise.csv', index=True)\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}