ldrewniak/01-model-training.ipynb

## 01-model-training.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import mlflow\n",
    "import os\n",
    "import uuid \n",
    "import shutil\n",
    "\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense, Activation\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import precision_recall_fscore_support as score\n",
    "\n",
    "from mlflow.keras import save_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load data\n",
    "train_labels, train_values = [pd.read_csv(f'../data/{f}.csv') for f in ['train_labels', 'train_values']]\n",
    "tt_data = train_values.join(train_labels.set_index('building_id'), on='building_id')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop unnesesary columns from train data\n",
    "tt_x = tt_data.drop(columns=['building_id','damage_grade']).copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# label encode all columns with stings\n",
    "for col in tt_x.columns:\n",
    "    if tt_x[col].dtype == 'object':\n",
    "        tt_x[col] = LabelEncoder().fit_transform(tt_x[col])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def one_hot_encode(data):\n",
    "    examples = len(data)\n",
    "    set_len = len(set(data))\n",
    "    arr = np.zeros((examples, set_len))\n",
    "    arr[np.arange(examples), data.apply(lambda x: x-1)] = 1\n",
    "    return arr\n",
    "\n",
    "# one-hot encode column to predict\n",
    "tt_y = one_hot_encode(tt_data.damage_grade)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# split into test/train in 0.25 ratio\n",
    "train_x, test_x, train_y, test_y = train_test_split(tt_x, tt_y, test_size=0.25, random_state=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_learning(layer_input, layer_hidden, layer_output, batch_size, nb_epoch, loss, optimizer, metrics):\n",
    "    # save passed params\n",
    "    params = locals()\n",
    "    \n",
    "    # set path to main directory instead run direcotry\n",
    "    absolute_path = os.path.abspath(\"../\")\n",
    "    mlflow.set_tracking_uri(f'file:{absolute_path}/mlruns')\n",
    "    mlflow.set_experiment('modeling-earthquake-damage')\n",
    "    \n",
    "    with mlflow.start_run(run_name='sample_run') as run:\n",
    "        # log parameters\n",
    "        for param in params:\n",
    "            mlflow.log_param(param, params[param])\n",
    "        \n",
    "        #create temp folder\n",
    "        experiment_temp_path = f'.tmp/{uuid.uuid1().hex}'\n",
    "        \n",
    "        # create model\n",
    "        model = Sequential()\n",
    "        model.add(Dense(layer_input[0], input_dim=layer_input[1], activation=layer_input[2]))\n",
    "        for neurons, activation in layer_hidden:\n",
    "            model.add(Dense(neurons, activation=activation))\n",
    "        model.add(Dense(layer_output[0], activation=layer_output[1]))\n",
    "        model.compile(loss=loss, optimizer=optimizer, metrics=metrics)\n",
    "\n",
    "        # fit model\n",
    "        fit_result = model.fit(train_x, train_y,\n",
    "                  batch_size=batch_size,\n",
    "                  epochs=nb_epoch,\n",
    "                  validation_data=(test_x, test_y),\n",
    "                  shuffle=True)\n",
    "        # predict\n",
    "        predictions = model.predict(test_x)\n",
    "\n",
    "        precision, recall, fscore, support = score(np.argmax(test_y, axis=1), np.argmax(predictions, axis=1), average='micro')\n",
    "        \n",
    "        # save results\n",
    "        for key in fit_result.history.keys():\n",
    "            for epoch,value in enumerate(fit_result.history[key], 1):\n",
    "                mlflow.log_metric(key, value, epoch)\n",
    "        \n",
    "        mlflow.log_metric(\"micro_precision\", precision)\n",
    "        mlflow.log_metric(\"micro_recall\", recall)\n",
    "        mlflow.log_metric(\"micro_fscore\", fscore)\n",
    "        \n",
    "        # save model\n",
    "        save_model(model, f'{experiment_temp_path}/model')\n",
    "        shutil.make_archive(f'{experiment_temp_path}/model', 'zip', f'{experiment_temp_path}/model')\n",
    "        \n",
    "        # move model to artifact\n",
    "        mlflow.log_artifact(f'{experiment_temp_path}/model.zip')\n",
    "        \n",
    "        #remove temp files as they were already moved\n",
    "        shutil.rmtree('.tmp')\n",
    "        \n",
    "# some parameters        \n",
    "params = {\n",
    "    'layer_input': (76, 38, 'sigmoid'),\n",
    "    'layer_hidden': [(76, 'sigmoid')],\n",
    "    'layer_output': (3, 'softmax'),\n",
    "    'batch_size': 100,\n",
    "    'nb_epoch': 10,\n",
    "    'loss': 'categorical_crossentropy',\n",
    "    'optimizer': 'adamax', \n",
    "    'metrics': ['accuracy']\n",
    "}\n",
    "# run the code\n",
    "run_learning(**params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    'layer_input': (76, 38, 'sigmoid'),\n",
    "    'layer_hidden': [(76, 'sigmoid')],\n",
    "    'layer_output': (3, 'softmax'),\n",
    "    'batch_size': 1000,\n",
    "    'nb_epoch': 10,\n",
    "    'loss': 'categorical_crossentropy',\n",
    "    'optimizer': 'adamax', \n",
    "    'metrics': ['accuracy']\n",
    "}\n",
    "run_learning(**params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    'layer_input': (152, 38, 'sigmoid'),\n",
    "    'layer_hidden': [(152, 'sigmoid')],\n",
    "    'layer_output': (3, 'softmax'),\n",
    "    'batch_size': 100,\n",
    "    'nb_epoch': 10,\n",
    "    'loss': 'categorical_crossentropy',\n",
    "    'optimizer': 'adamax', \n",
    "    'metrics': ['accuracy']\n",
    "}\n",
    "run_learning(**params)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = {\n",
    "    'layer_input': (152, 38, 'sigmoid'),\n",
    "    'layer_hidden': [(152, 'sigmoid')],\n",
    "    'layer_output': (3, 'softmax'),\n",
    "    'batch_size': 100,\n",
    "    'nb_epoch': 10,\n",
    "    'loss': 'categorical_crossentropy',\n",
    "    'optimizer': 'adamax', \n",
    "    'metrics': ['accuracy']\n",
    "}\n",
    "run_learning(**params)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"import mlflow\n",
	"import os\n",
	"import uuid \n",
	"import shutil\n",
	"\n",
	"from keras.models import Sequential\n",
	"from keras.layers import Dense, Activation\n",
	"from sklearn.preprocessing import LabelEncoder\n",
	"from sklearn.model_selection import train_test_split\n",
	"from sklearn.metrics import precision_recall_fscore_support as score\n",
	"\n",
	"from mlflow.keras import save_model"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# load data\n",
	"train_labels, train_values = [pd.read_csv(f'../data/{f}.csv') for f in ['train_labels', 'train_values']]\n",
	"tt_data = train_values.join(train_labels.set_index('building_id'), on='building_id')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# drop unnesesary columns from train data\n",
	"tt_x = tt_data.drop(columns=['building_id','damage_grade']).copy()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# label encode all columns with stings\n",
	"for col in tt_x.columns:\n",
	" if tt_x[col].dtype == 'object':\n",
	" tt_x[col] = LabelEncoder().fit_transform(tt_x[col])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def one_hot_encode(data):\n",
	" examples = len(data)\n",
	" set_len = len(set(data))\n",
	" arr = np.zeros((examples, set_len))\n",
	" arr[np.arange(examples), data.apply(lambda x: x-1)] = 1\n",
	" return arr\n",
	"\n",
	"# one-hot encode column to predict\n",
	"tt_y = one_hot_encode(tt_data.damage_grade)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# split into test/train in 0.25 ratio\n",
	"train_x, test_x, train_y, test_y = train_test_split(tt_x, tt_y, test_size=0.25, random_state=5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def run_learning(layer_input, layer_hidden, layer_output, batch_size, nb_epoch, loss, optimizer, metrics):\n",
	" # save passed params\n",
	" params = locals()\n",
	" \n",
	" # set path to main directory instead run direcotry\n",
	" absolute_path = os.path.abspath(\"../\")\n",
	" mlflow.set_tracking_uri(f'file:{absolute_path}/mlruns')\n",
	" mlflow.set_experiment('modeling-earthquake-damage')\n",
	" \n",
	" with mlflow.start_run(run_name='sample_run') as run:\n",
	" # log parameters\n",
	" for param in params:\n",
	" mlflow.log_param(param, params[param])\n",
	" \n",
	" #create temp folder\n",
	" experiment_temp_path = f'.tmp/{uuid.uuid1().hex}'\n",
	" \n",
	" # create model\n",
	" model = Sequential()\n",
	" model.add(Dense(layer_input[0], input_dim=layer_input[1], activation=layer_input[2]))\n",
	" for neurons, activation in layer_hidden:\n",
	" model.add(Dense(neurons, activation=activation))\n",
	" model.add(Dense(layer_output[0], activation=layer_output[1]))\n",
	" model.compile(loss=loss, optimizer=optimizer, metrics=metrics)\n",
	"\n",
	" # fit model\n",
	" fit_result = model.fit(train_x, train_y,\n",
	" batch_size=batch_size,\n",
	" epochs=nb_epoch,\n",
	" validation_data=(test_x, test_y),\n",
	" shuffle=True)\n",
	" # predict\n",
	" predictions = model.predict(test_x)\n",
	"\n",
	" precision, recall, fscore, support = score(np.argmax(test_y, axis=1), np.argmax(predictions, axis=1), average='micro')\n",
	" \n",
	" # save results\n",
	" for key in fit_result.history.keys():\n",
	" for epoch,value in enumerate(fit_result.history[key], 1):\n",
	" mlflow.log_metric(key, value, epoch)\n",
	" \n",
	" mlflow.log_metric(\"micro_precision\", precision)\n",
	" mlflow.log_metric(\"micro_recall\", recall)\n",
	" mlflow.log_metric(\"micro_fscore\", fscore)\n",
	" \n",
	" # save model\n",
	" save_model(model, f'{experiment_temp_path}/model')\n",
	" shutil.make_archive(f'{experiment_temp_path}/model', 'zip', f'{experiment_temp_path}/model')\n",
	" \n",
	" # move model to artifact\n",
	" mlflow.log_artifact(f'{experiment_temp_path}/model.zip')\n",
	" \n",
	" #remove temp files as they were already moved\n",
	" shutil.rmtree('.tmp')\n",
	" \n",
	"# some parameters \n",
	"params = {\n",
	" 'layer_input': (76, 38, 'sigmoid'),\n",
	" 'layer_hidden': [(76, 'sigmoid')],\n",
	" 'layer_output': (3, 'softmax'),\n",
	" 'batch_size': 100,\n",
	" 'nb_epoch': 10,\n",
	" 'loss': 'categorical_crossentropy',\n",
	" 'optimizer': 'adamax', \n",
	" 'metrics': ['accuracy']\n",
	"}\n",
	"# run the code\n",
	"run_learning(**params)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"params = {\n",
	" 'layer_input': (76, 38, 'sigmoid'),\n",
	" 'layer_hidden': [(76, 'sigmoid')],\n",
	" 'layer_output': (3, 'softmax'),\n",
	" 'batch_size': 1000,\n",
	" 'nb_epoch': 10,\n",
	" 'loss': 'categorical_crossentropy',\n",
	" 'optimizer': 'adamax', \n",
	" 'metrics': ['accuracy']\n",
	"}\n",
	"run_learning(**params)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"params = {\n",
	" 'layer_input': (152, 38, 'sigmoid'),\n",
	" 'layer_hidden': [(152, 'sigmoid')],\n",
	" 'layer_output': (3, 'softmax'),\n",
	" 'batch_size': 100,\n",
	" 'nb_epoch': 10,\n",
	" 'loss': 'categorical_crossentropy',\n",
	" 'optimizer': 'adamax', \n",
	" 'metrics': ['accuracy']\n",
	"}\n",
	"run_learning(**params)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"params = {\n",
	" 'layer_input': (152, 38, 'sigmoid'),\n",
	" 'layer_hidden': [(152, 'sigmoid')],\n",
	" 'layer_output': (3, 'softmax'),\n",
	" 'batch_size': 100,\n",
	" 'nb_epoch': 10,\n",
	" 'loss': 'categorical_crossentropy',\n",
	" 'optimizer': 'adamax', \n",
	" 'metrics': ['accuracy']\n",
	"}\n",
	"run_learning(**params)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.7"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}