Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Hsankesara/a5ba2e47dfa1452ab30ad2f51c5ac441 to your computer and use it in GitHub Desktop.
Save Hsankesara/a5ba2e47dfa1452ab30ad2f51c5ac441 to your computer and use it in GitHub Desktop.
Sustainable Industry: Rinse Over Run competition. Ranked 58th out of 1200+ participants. Link: https://www.drivendata.org/competitions/56/predict-cleaning-time-series/
Sustainable Industry: Rinse Over Run competition. Ranked 58th out of 1200+ participants. Link: https://www.drivendata.org/competitions/56/predict-cleaning-time-series/
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Sustainable Industry: Rinse Over Run #0.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [
"N_uO6NYwUu6z"
]
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"metadata": {
"id": "y9xOU8iunHZf",
"colab_type": "code",
"outputId": "d8026208-e0e3-4ec5-a590-bdcd108552b7",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from matplotlib import pyplot as plt\n",
"import seaborn as sns\n",
"from tqdm import tqdm\n",
"import gc\n",
"tqdm.pandas()\n",
"gc.collect()"
],
"execution_count": 2,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0"
]
},
"metadata": {
"tags": []
},
"execution_count": 2
}
]
},
{
"metadata": {
"id": "N_uO6NYwUu6z",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"## Getting Data"
]
},
{
"metadata": {
"id": "uAudXj1GUx77",
"colab_type": "code",
"outputId": "62f59daf-daa7-4a53-eb91-2a4a2aee6f1c",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 798
}
},
"cell_type": "code",
"source": [
"!wget https://s3.amazonaws.com/drivendata/data/56/public/train_values.zip\n",
"!wget https://s3.amazonaws.com/drivendata/data/56/public/test_values.zip\n",
"!wget https://s3.amazonaws.com/drivendata/data/56/public/submission_format.csv\n",
"!wget https://s3.amazonaws.com/drivendata/data/56/public/train_labels.csv"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"--2019-01-30 13:53:09-- https://s3.amazonaws.com/drivendata/data/56/public/train_values.zip\n",
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.99.141\n",
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.99.141|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 334639180 (319M) [application/zip]\n",
"Saving to: ‘train_values.zip’\n",
"\n",
"train_values.zip 100%[===================>] 319.14M 89.9MB/s in 3.6s \n",
"\n",
"2019-01-30 13:53:17 (89.4 MB/s) - ‘train_values.zip’ saved [334639180/334639180]\n",
"\n",
"--2019-01-30 13:53:19-- https://s3.amazonaws.com/drivendata/data/56/public/test_values.zip\n",
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.160.61\n",
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.160.61|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 106633113 (102M) [application/zip]\n",
"Saving to: ‘test_values.zip’\n",
"\n",
"test_values.zip 100%[===================>] 101.69M 98.9MB/s in 1.0s \n",
"\n",
"2019-01-30 13:53:21 (98.9 MB/s) - ‘test_values.zip’ saved [106633113/106633113]\n",
"\n",
"--2019-01-30 13:53:22-- https://s3.amazonaws.com/drivendata/data/56/public/submission_format.csv\n",
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.101.77\n",
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.101.77|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 29715 (29K) [text/csv]\n",
"Saving to: ‘submission_format.csv’\n",
"\n",
"submission_format.c 100%[===================>] 29.02K --.-KB/s in 0.01s \n",
"\n",
"2019-01-30 13:53:22 (2.22 MB/s) - ‘submission_format.csv’ saved [29715/29715]\n",
"\n",
"--2019-01-30 13:53:24-- https://s3.amazonaws.com/drivendata/data/56/public/train_labels.csv\n",
"Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.229.117\n",
"Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.229.117|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 122192 (119K) [text/csv]\n",
"Saving to: ‘train_labels.csv’\n",
"\n",
"train_labels.csv 100%[===================>] 119.33K --.-KB/s in 0.03s \n",
"\n",
"2019-01-30 13:53:24 (4.31 MB/s) - ‘train_labels.csv’ saved [122192/122192]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "pGmeEpy0U3mf",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# -p: do not fail when data/ already exists, so the cell can be re-run\n",
"!mkdir -p data"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "oXgGSzb5U9w-",
"colab_type": "code",
"outputId": "a5001a1a-eb2f-4ddf-e187-24406b57517b",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 85
}
},
"cell_type": "code",
"source": [
"# -o overwrites previously extracted files instead of prompting, so a re-run does not stall\n",
"!unzip -o train_values.zip -d data/\n",
"!unzip -o test_values.zip -d data/\n",
"# Move the remaining CSVs next to the extracted ones\n",
"!mv train_labels.csv data/\n",
"!mv submission_format.csv data/"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"Archive: train_values.zip\n",
" inflating: data/train_values.csv \n",
"Archive: test_values.zip\n",
" inflating: data/test_values.csv \n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "Envm7L7YVOye",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"!rm train_values.zip\n",
"!rm test_values.zip"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "Y17WyGIjTwR8",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"## Baseline Model"
]
},
{
"metadata": {
"id": "hq_JWxTUVn0V",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"%matplotlib inline\n",
"# mute warnings to keep the notebook output readable\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"from pathlib import Path\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"pd.set_option('display.max_columns', 40)\n",
"\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.ensemble import RandomForestRegressor"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "IE50pchoVyCb",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"DATA_DIR = Path('./data/')"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "tuc6cK48nML4",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"outputId": "39e0f245-a38a-47a5-eff5-31b4d0b4fb93"
},
"cell_type": "code",
"source": [
"%%time\n",
"# Sensor readings used for training; column 0 is the index and timestamps are parsed.\n",
"train_values = pd.read_csv(\n",
"    DATA_DIR / 'train_values.csv',\n",
"    index_col=0,\n",
"    parse_dates=['timestamp'],\n",
")\n",
"\n",
"# Label file for the training set, indexed by the first column of the file.\n",
"train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col=0)\n",
"\n",
"# Test-set sensor readings, loaded with the same options as train_values.\n",
"test_values = pd.read_csv(\n",
"    DATA_DIR / 'test_values.csv',\n",
"    index_col=0,\n",
"    parse_dates=['timestamp'],\n",
")"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"CPU times: user 52.9 s, sys: 5 s, total: 57.9 s\n",
"Wall time: 57.9 s\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "FKtL8Tf9qOPM",
"colab_type": "code",
"outputId": "12373c54-cdb0-47e9-9451-e1969742a0cd",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 425
}
},
"cell_type": "code",
"source": [
"train_values.head()"
],
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>process_id</th>\n",
" <th>object_id</th>\n",
" <th>phase</th>\n",
" <th>timestamp</th>\n",
" <th>pipeline</th>\n",
" <th>supply_flow</th>\n",
" <th>supply_pressure</th>\n",
" <th>return_temperature</th>\n",
" <th>return_conductivity</th>\n",
" <th>return_turbidity</th>\n",
" <th>return_flow</th>\n",
" <th>supply_pump</th>\n",
" <th>supply_pre_rinse</th>\n",
" <th>supply_caustic</th>\n",
" <th>return_caustic</th>\n",
" <th>supply_acid</th>\n",
" <th>return_acid</th>\n",
" <th>supply_clean_water</th>\n",
" <th>return_recovery_water</th>\n",
" <th>return_drain</th>\n",
" <th>object_low_level</th>\n",
" <th>tank_level_pre_rinse</th>\n",
" <th>tank_level_caustic</th>\n",
" <th>tank_level_acid</th>\n",
" <th>tank_level_clean_water</th>\n",
" <th>tank_temperature_pre_rinse</th>\n",
" <th>tank_temperature_caustic</th>\n",
" <th>tank_temperature_acid</th>\n",
" <th>tank_concentration_caustic</th>\n",
" <th>tank_concentration_acid</th>\n",
" <th>tank_lsh_caustic</th>\n",
" <th>tank_lsh_acid</th>\n",
" <th>tank_lsh_clean_water</th>\n",
" <th>tank_lsh_pre_rinse</th>\n",
" <th>target_time_period</th>\n",
" </tr>\n",
" <tr>\n",
" <th>row_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>20001</td>\n",
" <td>405</td>\n",
" <td>pre_rinse</td>\n",
" <td>2018-04-15 04:20:47</td>\n",
" <td>L4</td>\n",
" <td>8550.348</td>\n",
" <td>0.615451</td>\n",
" <td>18.044704</td>\n",
" <td>4.990765</td>\n",
" <td>0.177228</td>\n",
" <td>15776.9100</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>55.499672</td>\n",
" <td>41.555992</td>\n",
" <td>44.026875</td>\n",
" <td>49.474102</td>\n",
" <td>32.385708</td>\n",
" <td>83.036750</td>\n",
" <td>73.03241</td>\n",
" <td>45.394646</td>\n",
" <td>44.340126</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20001</td>\n",
" <td>405</td>\n",
" <td>pre_rinse</td>\n",
" <td>2018-04-15 04:20:49</td>\n",
" <td>L4</td>\n",
" <td>11364.294</td>\n",
" <td>0.654297</td>\n",
" <td>18.229168</td>\n",
" <td>3.749680</td>\n",
" <td>0.122975</td>\n",
" <td>13241.4640</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>55.487920</td>\n",
" <td>41.624170</td>\n",
" <td>44.045685</td>\n",
" <td>49.457645</td>\n",
" <td>32.385708</td>\n",
" <td>83.015045</td>\n",
" <td>73.03241</td>\n",
" <td>45.394447</td>\n",
" <td>44.339380</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>20001</td>\n",
" <td>405</td>\n",
" <td>pre_rinse</td>\n",
" <td>2018-04-15 04:20:51</td>\n",
" <td>L4</td>\n",
" <td>12174.479</td>\n",
" <td>0.699870</td>\n",
" <td>18.395544</td>\n",
" <td>2.783954</td>\n",
" <td>0.387008</td>\n",
" <td>10698.7850</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>55.476166</td>\n",
" <td>41.638275</td>\n",
" <td>44.045685</td>\n",
" <td>49.462350</td>\n",
" <td>32.385708</td>\n",
" <td>83.015045</td>\n",
" <td>73.03241</td>\n",
" <td>45.396280</td>\n",
" <td>44.336735</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>20001</td>\n",
" <td>405</td>\n",
" <td>pre_rinse</td>\n",
" <td>2018-04-15 04:20:53</td>\n",
" <td>L4</td>\n",
" <td>13436.776</td>\n",
" <td>0.761502</td>\n",
" <td>18.583622</td>\n",
" <td>1.769353</td>\n",
" <td>0.213397</td>\n",
" <td>8007.8125</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>55.471466</td>\n",
" <td>41.647675</td>\n",
" <td>44.048030</td>\n",
" <td>49.462350</td>\n",
" <td>32.385708</td>\n",
" <td>83.036750</td>\n",
" <td>73.03241</td>\n",
" <td>45.401875</td>\n",
" <td>44.333110</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20001</td>\n",
" <td>405</td>\n",
" <td>pre_rinse</td>\n",
" <td>2018-04-15 04:20:55</td>\n",
" <td>L4</td>\n",
" <td>13776.766</td>\n",
" <td>0.837240</td>\n",
" <td>18.627026</td>\n",
" <td>0.904020</td>\n",
" <td>0.148293</td>\n",
" <td>6004.0510</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>55.459705</td>\n",
" <td>41.654730</td>\n",
" <td>44.048030</td>\n",
" <td>49.462350</td>\n",
" <td>32.385708</td>\n",
" <td>83.015045</td>\n",
" <td>73.03241</td>\n",
" <td>45.398197</td>\n",
" <td>44.334373</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" <td>0.0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" process_id object_id phase timestamp pipeline \\\n",
"row_id \n",
"0 20001 405 pre_rinse 2018-04-15 04:20:47 L4 \n",
"1 20001 405 pre_rinse 2018-04-15 04:20:49 L4 \n",
"2 20001 405 pre_rinse 2018-04-15 04:20:51 L4 \n",
"3 20001 405 pre_rinse 2018-04-15 04:20:53 L4 \n",
"4 20001 405 pre_rinse 2018-04-15 04:20:55 L4 \n",
"\n",
" supply_flow supply_pressure return_temperature return_conductivity \\\n",
"row_id \n",
"0 8550.348 0.615451 18.044704 4.990765 \n",
"1 11364.294 0.654297 18.229168 3.749680 \n",
"2 12174.479 0.699870 18.395544 2.783954 \n",
"3 13436.776 0.761502 18.583622 1.769353 \n",
"4 13776.766 0.837240 18.627026 0.904020 \n",
"\n",
" return_turbidity return_flow supply_pump supply_pre_rinse \\\n",
"row_id \n",
"0 0.177228 15776.9100 True True \n",
"1 0.122975 13241.4640 True True \n",
"2 0.387008 10698.7850 True True \n",
"3 0.213397 8007.8125 True True \n",
"4 0.148293 6004.0510 True True \n",
"\n",
" supply_caustic return_caustic supply_acid return_acid \\\n",
"row_id \n",
"0 False False False False \n",
"1 False False False False \n",
"2 False False False False \n",
"3 False False False False \n",
"4 False False False False \n",
"\n",
" supply_clean_water return_recovery_water return_drain \\\n",
"row_id \n",
"0 False False True \n",
"1 False False True \n",
"2 False False True \n",
"3 False False True \n",
"4 False False True \n",
"\n",
" object_low_level tank_level_pre_rinse tank_level_caustic \\\n",
"row_id \n",
"0 True 55.499672 41.555992 \n",
"1 True 55.487920 41.624170 \n",
"2 True 55.476166 41.638275 \n",
"3 True 55.471466 41.647675 \n",
"4 True 55.459705 41.654730 \n",
"\n",
" tank_level_acid tank_level_clean_water tank_temperature_pre_rinse \\\n",
"row_id \n",
"0 44.026875 49.474102 32.385708 \n",
"1 44.045685 49.457645 32.385708 \n",
"2 44.045685 49.462350 32.385708 \n",
"3 44.048030 49.462350 32.385708 \n",
"4 44.048030 49.462350 32.385708 \n",
"\n",
" tank_temperature_caustic tank_temperature_acid \\\n",
"row_id \n",
"0 83.036750 73.03241 \n",
"1 83.015045 73.03241 \n",
"2 83.015045 73.03241 \n",
"3 83.036750 73.03241 \n",
"4 83.015045 73.03241 \n",
"\n",
" tank_concentration_caustic tank_concentration_acid tank_lsh_caustic \\\n",
"row_id \n",
"0 45.394646 44.340126 False \n",
"1 45.394447 44.339380 False \n",
"2 45.396280 44.336735 False \n",
"3 45.401875 44.333110 False \n",
"4 45.398197 44.334373 False \n",
"\n",
" tank_lsh_acid tank_lsh_clean_water tank_lsh_pre_rinse \\\n",
"row_id \n",
"0 0.0 False 0.0 \n",
"1 0.0 False 0.0 \n",
"2 0.0 False 0.0 \n",
"3 0.0 False 0.0 \n",
"4 0.0 False 0.0 \n",
"\n",
" target_time_period \n",
"row_id \n",
"0 False \n",
"1 False \n",
"2 False \n",
"3 False \n",
"4 False "
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"metadata": {
"id": "QOi7FY_rW_T7",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Keep only the rows that were not recorded during the final_rinse phase\n",
"train_values = train_values.loc[train_values['phase'] != 'final_rinse']"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "5HWVT2WYXE8y",
"colab_type": "code",
"outputId": "e873b014-03a6-404b-f5d3-a3403f623194",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 277
}
},
"cell_type": "code",
"source": [
"# Count distinct phases per process, then tally how many processes share each count\n",
"(\n",
"    train_values\n",
"    .groupby('process_id')['phase']\n",
"    .nunique()\n",
"    .value_counts()\n",
"    .sort_index()\n",
"    .plot.bar()\n",
")\n",
"plt.title(\"Number of Processes with $N$ Phases\");"
],
"execution_count": 8,
"outputs": [
{
"output_type": "display_data",
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEECAYAAAAmiP8hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAGHJJREFUeJzt3X+UnFWd5/F3TwIj+aFpoCFMdEB2\nnS/rYWdhWGAQwSgG0SWDHmA4Qw5KANeZgRn54XrCIozoeHBFxd2RI2aE4YerE4zD8PMYDMQ1gGB0\nBcWVryIKOkHTQsiESQiQ9P7xPA1l09VVqa50dV/er3Ny8tR9ftz73Kf7U7fuU9XVNzQ0hCSpXL/T\n6wZIknYsg16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVbnqvG6D2RcQQcFVmnt5QNh/4cGbO78Lx\n5wNfyMx/P95jtVnfF4E3AWdk5ooR64aAnwLPUw1INgBLMvOOiWjbZBIRhwAfzcy31Y/fm5l/HxH7\nAA9nZsvf44joB54E7s3MwxrKrwA2Z+Y5I7Zv2v/bU68mB0f0U8+bIuLAXjeiS/4MmD8y5BvMz8z9\nMvMPgLOBr0TEwMQ1b3LIzG83hPw04NIODnMA8Cvg9RExt6H8QOD+JvvY/4XwGXnqOR/4DNVI+AUj\nR+ONj+sR2LeAy4DTgT7g3cCFVAGwIjNPazjWJ4E/AbYBp2XmPXX5ccDfAjOBh4GTgVnAPcAy4I8y\n87faVe93IvA3VD9va4H3AldSDTRWRMRfZ+ZtY510Zt4dEQ8Dh0XE90fWOVodmfnTuv53Ax+qD3Uf\n1SuILU3O5yngCuAIYBrwfeBUYNNo5Zn5r6MdJzN/ExHTm+3T0De/AN6cmQ9HxEnAdcCczNwUEecC\n+wD/xIvX9uvAqyLiIeDt9TFOowrifuCDmfnlUbrwAOA7wBbgOODz9ZPGfwS+N1bfj+z/+jya1hsR\nZwDn1dficeCUzHx0rP7YnmvR2H9qjyP6KSYzvwL0RcQJ27nr7sCvMjOofmGWAe8B/hA4OSL+Xb3d\nPsB36lHcp4DLASJiX6oQ+rPM3BdYRfVLOHzs+5uE/O8Dfw+8MzP3A24FPt8w1TS/Vcg32IkqqH6r\nzmZ11PXvA3wSmA8EVZD89Rjn8zbgtcB+wOuAH1KF26jlLfql2bEarWooOxL4LnBI/fgIYORU1WnA\n1vo8h6h+h3fOzD8EzqEKy9EMj9z/GXhnXbZfvf+PmuwzUmP/j1pvROwBfBZYkJmvowrtC+t9trcP\n2+k/tcGgn5rOBv5HRLxiO/aZDnylXv4BsCYzf5OZT1CNun6vXvcMcH29fD1wQF3PMcA3MvPBet0V\nVKP+aVQBcEOTehcAqzLz4frxF4A316O7tkXE24G5wN11UWOdY9VxNHBPZq7NzCGqkeJlY5zPb4DX\nA+8CZmTmhfXU0mCT8lGPU4+Wm+3TqDHo/5jqlc7hDY+/0aJr+oBr6+XvAa9ust0BVEF/K3BERMyu\ny36Ymc+1qGO0/h+13sxcB7wyM39Zr1sN7Fsvb1cf0vxaaDsZ9FNQZv5f4JvAudux29bM3Dy8DDzd\nuI4qsAGeyMxt9fLwS+R+YA5wZEQ8VE8bfIvqBt1u9bGbvZweANY3tH0DVUjs3kabv1HX92OqUePb\nM3O43Y11jlXH7lRTAMPrnsnM58c4n58Bf1X/+1VEfCki5mTmt0crH6tfxtin0SqqUW0/8CxwJ3B4\nROwHPFafy1i2Zuam4WVevI4viIjfBf4D1Sug9cC3qaZ9xpqfh9b9/5J66ye4j0TE/4uIBD5GnTMd\n9OGo16JFf2gUztFPXf+d6mX+z+rHI3/J+zs8buN+w79UT1LNe6/MzN+aMqqnRsbyaxpebteBto1q\ntNbK/IaRYad1/AZ4Q8O6VwK70OR8asuB5RGx
K3AV8N+ACzJztPKfjHEcmuxzQcP6n0fELKpR7bcy\n85GIeC2jT9t0an+qewyP1I+Hp2/2pPkrMWi//xudRDUaP7K+T/FeYNHwyg76cMz+U3sc0U9Rmfk4\n1fz5h+uix4G9ImKPelS1qNm+LcyIiHfVyydQTfFsAVZQveTfF6q3/EXE/2zjeF+nGq0Nv3z/c+D2\nelTdLWPVcRvVCHmfiOijmhY4vdn5RMTiiLgQIDOfBB4ChpqVNztOvdxsn5FWA+/nxWmRh6jm4kcL\n+ueA36mnXtp1IPD9euoK4CbgHbQe0XdiD+DndcjvBvwp1Q37sfpju65Fl9v7smDQT22fAn4XoJ6f\nvopqvvQuOh8NPkQ1lfAQ1cv1M+vjP071bpkbIuJHVDfclrU6WD0iPAO4sT7mkcD7OmzbdtdRr/uv\nVFMiP6YKik+PcT43AgdFxE/q8tcDn25W3qJfmh1rpFXAoVTvJKL+/0BeDP5Gj1Nd38d48b5KK8Pz\n88P99XPg51Sv2B5o8xjt+jKwW/0OnS9TvdvpNRHxKba/D9vtP7XQ5xePSFLZHNFLUuEMekkqnEEv\nSYUz6CWpcAa9JBVuUn5ganBw45R4K1B//wzWr9/UekO1ZF92l/3ZXVOlPwcGZveNVu6IfhymT3/J\np83VIfuyu+zP7prq/WnQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgo3KT8ZK0md\n+tzHv9HrJrTlL5bMn7C6HNFLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klQ4g16SCtfyA1MR\nMQO4GtgTeAXwUeAE4CDgiXqzSzPz1ohYBJwNbAOWZuaVEbFTvf/ewFZgcWY+0uXzkCQ10c4nYxcC\n38nMT0TE3sDXgXuA8zPzluGNImImcBFwCPAssCYibqj3fyozF0XE0cAlwEldPg9JUhMtgz4zlzU8\nfA3wyyabHgqsycwNABFxN3A4cBRwbb3NSuCqjlsrSdpubc/RR8Q9wJeopmYAzoqIOyPiHyNid2Au\nMNiwyzpgr8byzNwGDEXEzt1ovCSptbb/qFlmviEiDgC+CJwDPJGZ90fEEuDDVNM5jfqaHKpZ+Qv6\n+2dMmW9dHxiY3esmFMO+7C77c3KbyOvTzs3Yg4B1mfmLOtinAz/IzHX1JjcBnwOWU43eh80D7gXW\n1uUP1Ddm+zLz2bHqXL9+0/afSQ8MDMxmcHBjr5tRBPuyu+zPyW9HXJ9mTx7tTN0cCZwHEBF7ArOA\nz0fEvvX6+cCDwH3AwRExJyJmUc3PrwZuB06st10IrOrsFCRJnWhn6uYK4MqIWA3sApwJPA0si4hN\n9fLizNxcT+OsAIaAizNzQ0QsAxZExF3AFuDUHXAekqQm2nnXzWbg5FFWHTzKtsuppnAay7YCiztt\noCRpfPxkrCQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiD\nXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBXOoJekwhn0klS4lt8ZGxEzgKuBPYFXAB8FHgCuA6YB\njwOnZOaWiFgEnA1sA5Zm5pURsVO9/97AVqovEn+k+6ciSRpNOyP6hcB3MvNNwJ8CnwY+AlyemUcA\nDwOnRcRM4CLgrcB84JyI2JXqi8Wfysw3Ah8DLun6WUiSmmo5os/MZQ0PXwP8kirI/7wuuxn4AJDA\nmszcABARdwOHA0cB19bbrgSu6kbDJUntaRn0wyLiHuDVwLHAyszcUq9aB+wFzAUGG3Z5SXlmbouI\noYjYOTOfbVZXf/8Mpk+ftl0n0isDA7N73YRi2JfdZX9ObhN5fdoO+sx8Q0QcAHwR6GtY1ddkl+0t\nf8H69ZvabVZPDQzMZnBwY6+bUQT7srvsz8lvR1yfZk8eLefoI+KgiHgNQGbeT/XksDEidqk3mQes\nrf/Nbdj1
JeX1jdm+sUbzkqTuaudm7JHAeQARsScwi2qu/fh6/fHA14D7gIMjYk5EzKKan18N3A6c\nWG+7EFjVtdZLklpqJ+ivAPaIiNXArcCZwN8A76nLdgWuyczNwBJgBdUTwcX1jdllwLSIuKve9/zu\nn4YkqZl23nWzmeotkiMtGGXb5cDyEWVbgcWdNlCSND5+MlaSCmfQS1LhDHpJKpxBL0mFM+glqXAG\nvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BL\nUuEMekkqXMvvjAWIiE8AR9TbXwL8CXAQ8ES9yaWZeWtELALOBrYBSzPzyojYCbga2BvYCizOzEe6\nehaSpKZaBn1EvBnYPzMPi4jdgO8BdwLnZ+YtDdvNBC4CDgGeBdZExA3AQuCpzFwUEUdTPVGc1P1T\nkSSNpp2pm28CJ9bLTwEzgWmjbHcosCYzN2TmZuBu4HDgKOCGepuVdZkkaYK0HNFn5lbg3+qHpwO3\nUU3BnBUR5wLrgLOAucBgw67rgL0ayzNzW0QMRcTOmflsszr7+2cwffpozyWTz8DA7F43oRj2ZXfZ\nn5PbRF6ftuboASLiOKqgPxr4z8ATmXl/RCwBPgzcM2KXviaHalb+gvXrN7XbrJ4aGJjN4ODGXjej\nCPZld9mfk9+OuD7NnjzavRn7NuAC4JjM3ADc0bD6JuBzwHKq0fuwecC9wNq6/IH6xmzfWKN5SVJ3\ntZyjj4hXAZcCx2bmk3XZVyNi33qT+cCDwH3AwRExJyJmUc3FrwZu58U5/oXAqq6egSRpTO2M6E8C\ndgeuj4jhsn8AlkXEJuBpqrdMbq6ncVYAQ8DFmbkhIpYBCyLiLmALcGqXz0GSNIZ2bsYuBZaOsuqa\nUbZdTjWF01i2FVjcaQMlSePjJ2MlqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPo\nJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcAa9JBWune+MJSI+ARxRb38J\nsAa4DpgGPA6ckplbImIRcDawDViamVdGxE7A1cDewFaq75d9pNsnIkkaXcsRfUS8Gdg/Mw8DjgE+\nA3wEuDwzjwAeBk6LiJnARcBbgfnAORGxK3Ay8FRmvhH4GNUThSRpgrQzdfNN4MR6+SlgJlWQ31SX\n3UwV7ocCazJzQ2ZuBu4GDgeOAm6ot11Zl0mSJkjLoM/MrZn5b/XD04HbgJmZuaUuWwfsBcwFBht2\nfUl5Zm4DhiJi5+40X5LUSltz9AARcRxV0B8N/KRhVV+TXba3/AX9/TOYPn1au03rqYGB2b1uQjHs\ny+6yPye3ibw+7d6MfRtwAXBMZm6IiKcjYpd6imYesLb+N7dht3nAvQ3lD9Q3Zvsy89mx6lu/ftP2\nn0kPDAzMZnBwY6+bUQT7srvsz8lvR1yfZk8e7dyMfRVwKXBsZj5ZF68Ejq+Xjwe+BtwHHBwRcyJi\nFtVc/Grgdl6c418IrOrwHCRJHWhnRH8SsDtwfUQMl70H+EJEvA94FLgmM5+LiCXACmAIuLge/S8D\nFkTEXcAW4NQun4MkaQwtgz4zlwJLR1m1YJRtlwPLR5RtBRZ32kBJ0vj4yVhJKpxBL0mFM+glqXAG\nvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPoJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BL\nUuEMekkqnEEvSYUz6CWpcO18OTgRsT9wI3BZZn42Iq4GDgKeqDe5NDNvjYhFwNnANmBpZl4ZETsB\nVwN7A1uBxZn5SHdPQ5LUTMugj4iZwN8Bd4xYdX5m3jJiu4uAQ4BngTURcQOwEHgqMxdFxNHAJcBJ\nXWq/JKmFdqZutgDvANa22O5QYE1mbsjMzcDdwOHAUcAN9TYr6zJJ0gRpGf
SZ+Xwd3COdFRF3RsQ/\nRsTuwFxgsGH9OmCvxvLM3AYMRcTO42+6JKkdbc3Rj+I64InMvD8ilgAfBu4ZsU1fk32blb+gv38G\n06dP67BpE2tgYHavm1AM+7K77M/JbSKvT0dBn5mN8/U3AZ8DllON3ofNA+6lmvKZCzxQ35jty8xn\nxzr++vWbOmnWhBsYmM3g4MZeN6MI9mV32Z+T3464Ps2ePDp6e2VEfDUi9q0fzgceBO4DDo6IOREx\ni2oufjVwO3Bive1CYFUndUqSOtPOu24OAj4F7AM8FxEnUL0LZ1lEbAKepnrL5OZ6GmcFMARcnJkb\nImIZsCAi7qK6sXvqDjkTSdKoWgZ9Zn6XatQ+0ldH2XY51RROY9lWYHGH7ZMkjZOfjJWkwhn0klQ4\ng16SCmfQS1LhDHpJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPo\nJalwBr0kFc6gl6TCGfSSVDiDXpIK1/I7YwEiYn/gRuCyzPxsRLwGuA6YBjwOnJKZWyJiEXA2sA1Y\nmplXRsROwNXA3sBWqi8Sf6T7pyJJGk3LEX1EzAT+DrijofgjwOWZeQTwMHBavd1FwFupvkz8nIjY\nFTgZeCoz3wh8DLikq2cgSRpTO1M3W4B3AGsbyuYDN9XLN1OF+6HAmszckJmbgbuBw4GjgBvqbVfW\nZZKkCdJy6iYznweej4jG4pmZuaVeXgfsBcwFBhu2eUl5Zm6LiKGI2Dkzn+1C+9UDZ975wV43oS2X\nv+UTvW6CNCm0NUffQl+Xyl/Q3z+D6dOndd6iCTQwMLvXTVATL/dr83I//8luIq9Pp0H/dETsUk/R\nzKOa1llLNXofNg+4t6H8gfrGbF+r0fz69Zs6bNbEGhiYzeDgxl43Q028nK+NP5uT3464Ps2ePDp9\ne+VK4Ph6+Xjga8B9wMERMSciZlHNxa8GbgdOrLddCKzqsE5JUgdajugj4iDgU8A+wHMRcQKwCLg6\nIt4HPApck5nPRcQSYAUwBFycmRsiYhmwICLuorqxe+oOORNJ0qjauRn7Xap32Yy0YJRtlwPLR5Rt\nBRZ32D5J0jj5yVhJKpxBL0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqcQS9JhTPo\nJalwBr0kFc6gl6TCGfSSVDiDXpIKZ9BLUuEMekkqnEEvSYUz6CWpcC2/M3Y0ETEf+Arww7roB8An\ngOuAacDjwCmZuSUiFgFnA9uApZl55XgbLUlq33hG9P8nM+fX//4K+AhweWYeATwMnBYRM4GLgLdS\nfcH4ORGx63gbLUlqXzenbuYDN9XLN1OF+6HAmszckJmbgbuBw7tYpySphY6mbmqvj4ibgF2Bi4GZ\nmbmlXrcO2AuYCww27DNcPqb+/hlMnz5tHE2bOAMDs3vdBDXxcr82L/fzn+wm8vp0GvQ/oQr364F9\ngVUjjtXXZL9m5b9l/fpNHTZrYg0MzGZwcGOvm6EmXs7Xxp/NyW9HXJ9mTx4dBX1m/guwrH7404j4\nFXBwROxST9HMA9bW/+Y27DoPuLeTOiVJnelojj4iFkXEB+rlucCewD8Ax9ebHA98DbiP6glgTkTM\nopqfXz3uVkuS2tbp1M1NwJci4jhgZ+AvgO8B10bE+4BHgWsy87mIWAKsAIaAizNzQxfaLUlqU6dT\nNxuBhaOsWjDKtsuB5Z3UI0kaPz8ZK0mFM+glqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6SSqc\nQS9JhRvPnymeUk77+J29bkJLVy15S6+bIKlAjuglqXAGvSQVzqCXpMIZ9JJUOINekgpn0EtS4Qx6\nSSqcQS9JhTPoJalwE/LJ2Ii4DPhjqi8If39mrpmIeiVJEzCij4g3Aa/LzMOA04H/taPrlCS9aCJG\n9EcB/wyQmT+KiP6IeGVm/usE1C1Nej
8+49TuH7PrR4Q/+MLVO+Comgh9Q0NDO7SCiFgK3JqZN9aP\nVwOnZ+aO+FmUJI3Qi5uxfT2oU5JetiYi6NcCcxse/x7w+ATUK0liYoL+duAEgIj4I2BtZm6cgHol\nSUzAHD1ARHwcOBLYBpyZmQ/s8EolScAEBb0kqXf8ZKwkFc6gl6TCGfTjFBFzet2GqSoiXvJW24h4\ndS/aUpKI2L3XbShJRLyl120Yrwn5WzeF+ydgyv8gTKSIeBfwGWBGRNwGnNXwTqxrsT/bFhH/Bfg0\n8AvgbOB/A9MjYibwl5l5Wy/bN9VExLtHFPUBH4qIjwJk5rUT36rxM+jbEBF/2WRVHzBvIttSiCXA\ngcBTwBnA7RFxTGZuwA/Uba8PAQuA3wduAY7LzAciYk/gZsCg3z4XAU8At/Liz+IrgNf2rEVdYNC3\n51xgJaN/0GunCW5LCbZm5pP18tKI+DWwIiKOpfoLp2rflsx8DHgsIv5l+K3LmfnriHimx22bivYH\nLgT+E3BuZj5aD0Iu7nG7xsWgb887qf7q5vszc0vjioiY35MWTW13RcQtwImZuTkzb6xD6Q5gtx63\nbar5dUR8IDM/mZmHwwv3Oc6jms7RdsjMZ4ALIiKAyyPiHgq4lznlT2AiZOaDwLHAc6OsPm+CmzPl\nZeYHgU8CzzSUrQCOAKb0yKkHTgUeG1G2B/Ao1Z8FVweycizVk+XPet2e8fIDU5JUOEf0klQ4g16S\nCmfQS1LhDHpJKpxBL0mF+/+DXzH7m9adowAAAABJRU5ErkJggg==\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"metadata": {
"id": "u4sWmF5BXH8O",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Build one key per (process, phase) pair by joining process_id and phase, e.g. '20001_pre_rinse'\n",
"train_values['process_phase'] = train_values.process_id.astype(str) + '_' + train_values.phase.astype(str)\n",
"process_phases = train_values.process_phase.unique()\n",
"\n",
"# Randomly keep 80% of the (process, phase) pairs; the fixed seed makes the draw repeatable\n",
"rng = np.random.RandomState(2019)\n",
"to_keep = rng.choice(\n",
"    process_phases,\n",
"    # built-in int: the np.int alias was deprecated in NumPy 1.20 and removed in 1.24\n",
"    size=int(len(process_phases) * 0.8),\n",
"    replace=False)\n",
"\n",
"# Rows whose (process, phase) pair survived the sampling\n",
"train_limited = train_values[train_values.process_phase.isin(to_keep)]\n",
"\n",
"# Subset labels to the processes still present in train_limited\n",
"# (assumes train_labels is indexed by process_id -- TODO confirm)\n",
"train_labels = train_labels.loc[train_limited.process_id.unique()]"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "jDwA-F94XMe_",
"colab_type": "code",
"outputId": "4cfcf6f6-6201-43cc-fe67-ddc4377bfd46",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 278
}
},
"cell_type": "code",
"source": [
"# Distribution of distinct-phase counts per process within train_limited\n",
"(\n",
"    train_limited\n",
"    .groupby('process_id')['phase']\n",
"    .nunique()\n",
"    .value_counts()\n",
"    .sort_index()\n",
"    .plot.bar()\n",
")\n",
"plt.title(\"Number of Processes with $N$ Phases (Subset for Training)\");"
],
"execution_count": 10,
"outputs": [
{
"output_type": "display_data",
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEFCAYAAADt1CyEAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAHAZJREFUeJzt3X28XFV97/HPMQeUhGgOcGhitESt\nfqnFKvKiqCEalKgoXB8AuQWVR7XqrQJWX1EQCOoLa0X0IldJQUFqK5orAgUJDQ8KKJjaFuoDv4gI\n9JpADhLSKCGQh/vHWhOGwzydfSYzzOL7fr3yysya/bD22nu+s87ae2YPbdmyBTMzK9fT+l0BMzPb\nthz0ZmaFc9CbmRXOQW9mVjgHvZlZ4Rz0ZmaFc9CbmRXOQW9mVrjhflegmyRtAb4WEcfWlc0HTouI\n+V1Y/nzgvIj4k8kuq8P1/QPwGuC4iFg67rUtwK+BjaQP7LXAwoi4phd1ezKR9BfApyLiDfn5eyLi\n7yXNAe6IiLbHuaQR4AHg5oh4ZV35V4H1EXHCuOmbtv9E1ttLkt4PvCIijpS0F/A5YDap/r8DPhoR\nN7aYfz5dPv5r+6pBedNjfwLL/gqwX376AmAlsD4/3zsi1nW4nMcdX1WnabOOM4CNEfHJKvO386Q6\nELvkNZL2jIh/73dFuuAvgRdFxK+bvD4/Iv4fgKS5wOWSFBFjPavhk0BE/ASohfwU4O+AJ4RHGy8D\n7gVeLGlmRNyby/cE/k+TeRq2/0Tr3wv5w+fjwB6ShoDLgfdExBX59bcDl0p6bkQ81KM6tdpX7Y79\ntiLi/XXrugt4Z6sPshbL2Xp8TWaaNk4FbpN0SUT82ySW01CJQf9x4Iuk3sBW43sj9c/zm+DHwFnA\nscAQ8G7gk6QAWBoRx9Qt6/PA/wA2A8dExI9y+VuATwPTgDuAw4EdgR8BFwMvj4jH1SvPdyhpRw+T\neh3vAc4n9bSWSvpQRFzZaqMj4iZJdwCvlHTb+HU2WkftTSTp3cDJeVG3kHpRG5psz4PAV4F5wBTg\nNuAo4KFG5RHx342WExH3SxpuNk9d2/wXsF9E3CHpMOAiYEZEPCTpRGAO8F0e27f/AjxL0u3AAXkZ\nxwDHAyPAxyLinxo04cuAfwU2AG8Bzs1B9BKgbaehvv3zdjRdr6TjgI/kfbEKeFdE3N2qPSayL+rb\nr87HgK/nZY0Cs4Cb6+r/XUk/ye06nwbvFeC4/PwJx/9E6x4R9zNuX0XEb/Lyr2fcsd/kPbKJNu+t\nZvJ7fvx7pNl+qW3//qScOCOvfyfgxIi4uEmeNJruE6Rj4m7g66TjYk5EPCLpS8BJwMGdbkenihuj\nj4jvAEOSDpngrLsA90aESAfpxcCRwJ8Dh0t6QZ5uDvCvEfEi4EzgHABJzyeF0F9GxPOB60gHfm3Z\n/9Ek5P+Y1KN5a0TsDlwBnFs31DS/XcjX2Y4UVI9bZ7N15PXPAT4PzAdEejN+qMX2vAF4HrA78ELg\n56Rwa1jepl2aLavedXVlrwZ+CvxFfj4PGD9UdQywKW/nFtIxvn1E/DlwAilwGtkT+A/ge8Bbc9nu\nef5fNplnvPr2b7heSbsCXwYWRMQLScFX+3N9om3YSfvVHAxckh/fDywHrpN0rKTnAdT+OmljDg2O\n/wp1h7p9VQv5XI/5+eH8HPJNj19avLc6UP8eabVfxs+zOSJeQgrsZsfTE6aT9GekD9yXko7dd4yb\n5xLgzZKmVtiWlooL+ux44G8lPWMC8wwD38mP/xNYHhH3R8TvSJ/uz86vPQx8Oz/+NvCyvJ43AtdH\nxM/ya18l9XqmkAKg9iYbbwFwXUTckZ+fB+yXe0gdk3QAMBO4KRfVr7PVOl4P/CgiVkbEFlJP8awW\n23M/8GLgbcDUiPhkHkMda1LecDm5t9xsnnr1
Qf8K0l86c+ueX9+maYaAb+TH/w48p8l0LyMF/RXA\nPEnTc9nPI+LRNuto1P4N1xsRq4Fn1oXqDcDz8+MJtSHN98X4us0BnkX+SyPv5wWk4+PDwJ2Sfp6H\nb9ppdvxX2f+danj8kt6zrd5b7Wydt81+qTdM6okD/Bvwx02W3Wi6V5PaYlVEPAx8rX6GPFx4L/Dy\nSlvTQpFBn8e4fgicOIHZNkVE7UTNJuD39a+RAhvgdxGxOT+u/Yk8AswAXi3p9vyn6I9JJ+h2zstu\n9Oc0wCiwpq7ua0khsUsHdb4+r28Fqdd4QETU6l2/zlbr2IU0BFB77eGI2Nhie34D/HX+d6+kf5Q0\nI49RPqG8Vbu0mKfedaSe4QjwCHAtMFfS7sA9eVta2RSPjTnX78etJD0d+FNS724N8BPSsE+tl99M\nu/Z/wnpzwJ0u6ReSAvgM+X1YoQ0b7osG9dyVxx+3RMTaiDg1/8Uxk/Sh9C1Jf9pie6HJ8V9l/7dZ\nT71Wx2+r91Y7W+dttV8azPOH2mMaHE8tpqud8K/5bYP5VpP2V1eVOEZf8wnSn/m1PwnH75SRisut\nn6/2pnqANG64LCIeN2SUe1Ot3Efdn9s50DaTemvtbD0ZOIl13A+8qu61ZwI70GR7siXAEkk7kXol\nHwVOiohG5b9qsRyazHNS3et3SdqR1DP8cUTcmYcaGg3bVLUH6RzDnfl5bfjmj2jdW+y0/esdRuqN\nvzrSeYr3AEfUXqzQhi3bLxuqfyLpOcCcyCcmI+I+0l/A7wD+jHS8NHuvNDv+J1z3Dt4bNZN5j3Sq\n5X7pkv8mnbOrmdXl5TdVZI8eICJWkcYPT8tFq4BZknbNn95Vd+JUSW/Ljw8hDfFsAJaS/uR/PqTL\nrfLJlXb+hdTjqf2Z+FfA1blX3S2t1nElqYc8R+lqjK+STkg33B5JR0v6JEBEPADcDmxpVt5sOflx\ns3nGu4E0xFAbFrmdNL7bKOgfBZ6Wh146tSdwWx7SALgMeBPte/RV7ArclcNkZ9I47Y7Qsj0mtC8a\nrHM1sLOk2vv9ucD3lC6xJC9zb9LwwnJav1caHv9V9j+d76uGxy/p0tZuabpfuugnpCHTXfJfkUc2\nmGaUNAzWVcUGfXYm8HSAPL73NdJ46Y1U7w3eThpKuJ305/oH8/JXkc6wXyLpl6QTOxe3W1juER5H\nurTtdtI43vsq1m3C68ivvZc0JLKC9Ob8QovtuRTYS9KvcvmLgS80K2/TLs2WNd51wD6kqyTI/+/J\nY8FfbxVp/97DY+dV2qmNz9fa6y7gLlKP9dYOl9GpfyKF7h358cnAcyWdycTbsNP2u4vUm3xJ3r4f\nk/b5VyRFrstZwGERcXeb90rD479C3aFuX0l6FU304j1C6/3SFXl460JSu15LusR16wez0gnhWaSR\niK4a8h2mzMqn9MWvlRFxer/r8lQmaaj2l6OkNwOfjog98/P3Am+KiLe2WkYVpffozSz5HHBsPt9h\nfaD0/YX7Je2Wh0nfQTo5jaTtSFcLfmZbrNtBb/YUEBF3ksL+7H7X5akq0jfWTyINha0gfZHqtPzy\nacAlEbF8W6zbQzdmZoVzj97MrHAdXUcvaQ/SWfWzIuLLeTzpQuBPgHXAIRGxRtIRpHGmzcDiiDg/\nT3sBsBvpWvaj85+RZmbWA538fOs00rhe/SVW7wHGIuLwfKZ4nqRrgFNIv0PyCLBc0iXAQcCDEXGE\npNeTfujnsFbrHBtbNxDjSSMjU1mzpic/9Fc8t2V3uT27a1Dac3R0+lCj8k6GbjaQvjyysq7sIOCb\nABGxOCIuI13nvDx/tXo96RrnucDreOzbhct47HdKBt7w8ER+rsNacVt2l9uzuwa9PdsGfURsrPsN\nmJo5wAGSrpf0rfyV55k8/htdq0kX/28tz7+RsUXS9t2ovJmZtVf1t26GgIiIRZJOJv0G/Pjf7G74\nJ0SL8q1G
RqYOzCfo6OhEvmlvrbgtu8vt2V2D3J5Vg/4+4Af58VJgEennXWfWTTObdGODlbn81nxi\ndigiHmm18EEYC4O048fGOrobmbXhtuwut2d3DUp7Nvswqnp55fdJvyYIsBcQpDsT7S1pRv723VzS\nj1FdDRyapz2I9LslZmbWI51cdbMX6cfB5gCPKt256XDgS5KOJf1u+5ERsV7SQlIPfwuwKCLWSroY\nWCDpRtKJ3aO2yZaYmVlDT8pvxg7K5ZWD8ufcIHBbdpfbs7sGpT0nc3mlmZkNMAe9mVnhSr6VoNlA\nWHHcUd1fZteXCC8674JtsFTrBffozcwK56A3Myuch27MrChf+ez1/a5CR96/cH7P1uUevZlZ4Rz0\nZmaFc9CbmRXOQW9mVjgHvZlZ4Rz0ZmaFc9CbmRXOQW9mVjgHvZlZ4Rz0ZmaFc9CbmRXOQW9mVriO\nftRM0h7ApcBZEfHluvI3AFdFxFB+fgRwPLAZWBwR50vaDrgA2A3YBBwdEXd2dSvMzKyptj16SdOA\ns4FrxpU/A/g4sKpuulOA/YH5wAmSdiLdSPzBiNgX+AxwRhfrb2ZmbXQydLMBeBOwclz5J4BzgEfy\n832A5RGxNiLWAzcBc4HXAZfkaZblMjMz65G2QR8RG3NwbyXpRcBLI+I7dcUzgbG656uBWfXlEbEZ\n2CJp+8lW3MzMOlP1xiNnAR9qM83QBMu3GhmZyvDwlAlXqh9GR6f3uwrFeKq25ba4v+u28FTdP9tK\nL9tzwkEvaTawO/BNSQCzJP0AOJXUe6+ZDdxMGvKZCdyaT8wORcQjtLBmzUMTrVZfjI5OZ2xsXb+r\nUQS35ZOf9093bYv2bPbhMeGgj4jfAi+oPZd0V0S8RtIOwHmSZgAbSWPxxwPPBA4FlgIHAddNuPZm\nZlZZ26CXtBdwJjAHeFTSIcDbI+KB+ukiYr2khaRA3wIsioi1ki4GFki6kXRi96juboKZmbXSNugj\n4qekyyWbvT6n7vESYMm41zcBR1euoZmZTYq/GWtmVjgHvZlZ4Rz0ZmaFc9CbmRXOQW9mVjgHvZlZ\n4Rz0ZmaFc9CbmRXOQW9mVjgHvZlZ4Rz0ZmaFc9CbmRXOQW9mVjgHvZlZ4Rz0ZmaFc9CbmRXOQW9m\nVjgHvZlZ4Tq6ObikPYBLgbMi4suSngt8HdgOeBR4Z0TcK+kI0g3BNwOLI+J8SdsBFwC7AZuAoyPi\nzu5vivXKB6/9WL+r0JFzXvu5flfB7EmhbY9e0jTgbOCauuJPk4L8NcAlwIl5ulOA/Un3mD1B0k7A\n4cCDEbEv8BngjK5ugZmZtdTJ0M0G4E3AyrqyDwD/Nz8eA3YG9gGWR8TaiFgP3ATMBV5H+jAAWJbL\nzMysR9oGfURszMFdX/aHiNgkaQrwQeAfgZmk0K9ZDcyqL4+IzcAWSdt3qf5mZtZGR2P0jeSQvwi4\nNiKukXT4uEmGmszarHyrkZGpDA9PqVq1nhodnd7vKlgTg7JvVvS7Ah0alPYcFL1sz8pBTzoZ+6uI\nWJSfryT13mtmAzfXld+aT8wORcQjrRa8Zs1Dk6hW74yOTmdsbF2/q2FNeN90l9uzu7ZFezb78KgU\n9Pnqmkci4tS64luA8yTNADaSxuKPB54JHAosBQ4CrquyTjMzq6Zt0EvaCzgTmAM8KukQYFfgYUnX\n58l+EREfkLSQFOhbgEURsVbSxcACSTeSTuwe1fWtMDOzptoGfUT8lHS5ZFsRsQRYMq5sE3B0lcqZ\nmdnk+ZuxZmaFc9CbmRXOQW9mVjgHvZlZ4Rz0ZmaFc9CbmRXOQW9mVjgHvZlZ4Rz0ZmaFc9CbmRXO\nQW9mVjgHvZlZ4Rz0ZmaFc9CbmRXOQW9mVjgHvZlZ4Rz0ZmaFc9CbmRWuo5uDS9oDuBQ4KyK+LOm5\nwEXAFGAV8K6I2JBvGn48sBlYHBHnS9oOuADYDdgEHB0Rd3Z/U8zMrJG2PX
pJ04CzgWvqik8HzomI\necAdwDF5ulOA/Un3mD1B0k7A4cCDEbEv8BngjK5ugZmZtdTJ0M0G4E3Ayrqy+cBl+fHlpHDfB1ge\nEWsjYj1wEzAXeB1wSZ52WS4zM7MeaTt0ExEbgY2S6ounRcSG/Hg1MAuYCYzVTfOE8ojYLGmLpO0j\n4pFm6xwZmcrw8JQJbUi/jI5O73cVrIlB2Tcr+l2BDg1Kew6KXrZnR2P0bQx1qXyrNWseql6bHhod\nnc7Y2Lp+V8Oa8L7pLrdnd22L9mz24VH1qpvfS9ohP55NGtZZSeq906w8n5gdatWbNzOz7qoa9MuA\ng/Pjg4GrgFuAvSXNkLQjaSz+BuBq4NA87UHAddWra2ZmE9V26EbSXsCZwBzgUUmHAEcAF0h6H3A3\ncGFEPCppIbAU2AIsioi1ki4GFki6kXRi96htsiVmZtZQJydjf0q6yma8BQ2mXQIsGVe2CTi6Yv3M\nzGyS/M1YM7PCOejNzArnoDczK5yD3syscA56M7PCOejNzArnoDczK5yD3syscA56M7PCOejNzArn\noDczK5yD3syscA56M7PCOejNzArnoDczK5yD3syscA56M7PCOejNzArX9laCjeSbf38DGAGeDiwC\n7gW+Qrpf7G0R8f487UdJNwev3Uf2yi7U28zMOlS1R38UEBGxH3AI8CXgi8CHI2Iu8CxJB0h6HvA/\ngX2BA4EvSJoy+WqbmVmnqgb9/cDO+fEI8ADwvIhYnssuB/YH9gO+HxGPRMQYcDfw4knU18zMJqjS\n0E1EfEvSUZLuIAX9QcA5dZOsBmYBvwPGGpT/Z6vlj4xMZXh4MDr+o6PT+10Fa2JQ9s2KflegQ4PS\nnoOil+1ZdYz+ncA9EfFGSS8FLgHW1k0y1GTWZuWPs2bNQ1Wq1XOjo9MZG1vX72pYE9433eX27K5t\n0Z7NPjyqDt3MBZYCRMStwA7ALnWvzwZW5n8zG5SbmVmPVA36O4B9ACTtBqwDfilp3/z624GrgGuB\nN0vaXtKzSUH/i8lV2czMJqLS0A1wLvA1ST/Iy/gr0uWV50p6GnBLRCwDkPT3wA9Jl1e+PyI2T77a\nZmbWqaonY38PvKPBS/MaTHs2cHaV9ZiZ2eT5m7FmZoVz0JuZFc5Bb2ZWOAe9mVnhHPRmZoVz0JuZ\nFc5Bb2ZWOAe9mVnhHPRmZoVz0JuZFc5Bb2ZWOAe9mVnhHPRmZoVz0JuZFc5Bb2ZWOAe9mVnhHPRm\nZoWreitBJB0BfAzYCJwC3AZcBEwBVgHviogNebrjgc3A4og4f9K1NjOzjlXq0UvaGTgV2Bc4EHgL\ncDpwTkTMI908/BhJ00gfAvsD84ETJO3UhXqbmVmHqvbo9weWRcQ6YB3wXkm/Id0kHOBy4G+AAJZH\nxFoASTcBc/PrZmbWA1WDfg4wVdJlwAhwGjAtIjbk11cDs4CZwFjdfLXynjvms9f2Y7UT8rWFr+13\nFcysQFWDfgjYGXgbsBtwXS6rf73ZfG2NjExleHhKxaoNrtHR6f2uQlEGpT1X9LsCHRqU9hwUvWzP\nqkF/H/CjiNgI/FrSOmCjpB0iYj0wG1iZ/82sm282cHO7ha9Z81DFag22sbF1/a5CUdye3eX27K5t\n0Z7NPjyqXl55NfBaSU/LJ2Z3BJYBB+fXDwauAm4B9pY0Q9KOpPH5Gyqu08zMKqgU9BHxW2AJqXf+\nfeCvSVfhHCnpBmAn4MLcu18ILCV9ECyqnZg1M7PeqHwdfUScC5w7rnhBg+mWkD4UzMysD/zNWDOz\nwjnozcwK56A3Myucg97MrHAOejOzwjnozcwK56A3Myucg97MrHAOejOzwjnozcwK56A3Myucg97M\nrHAOejOzwjnozcwK56A3Myucg97MrHAOejOzwjnozcwKV/lWggCSdgB+BnwKuAa4CJgCrALeFREb\nJB0BHA9sBhZHxPmTq7KZmU3EZHv0Jw
MP5MenA+dExDzgDuAYSdOAU4D9gfnACZJ2muQ6zcxsAioH\nvaTdgRcDV+Si+cBl+fHlpHDfB1geEWsjYj1wEzC3cm3NzGzCJtOjPxM4se75tIjYkB+vBmYBM4Gx\numlq5WZm1iOVxuglvRv4cUT8RlKjSYaazNqs/HFGRqYyPDylStUG2ujo9H5XoSiD0p4r+l2BDg1K\new6KXrZn1ZOxbwaeL+lA4DnABuD3knbIQzSzgZX538y6+WYDN7db+Jo1D1Ws1mAbG1vX7yoUxe3Z\nXW7P7toW7dnsw6NS0EfEYbXHkk4D7gJeBRwM/EP+/yrgFuA8STOAjaTx+eOrrNPMzKrp5nX0pwJH\nSroB2Am4MPfuFwJLgWXAoohY28V1mplZG5O6jh4gIk6re7qgwetLgCWTXY+ZmVXjb8aamRXOQW9m\nVjgHvZlZ4Rz0ZmaFc9CbmRXOQW9mVjgHvZlZ4Rz0ZmaFc9CbmRXOQW9mVjgHvZlZ4Rz0ZmaFc9Cb\nmRXOQW9mVjgHvZlZ4Rz0ZmaFc9CbmRXOQW9mVrjKtxKU9DlgXl7GGcBy4CJgCrAKeFdEbJB0BOmG\n4JuBxRFx/qRrbWZmHavUo5e0H7BHRLwSeCPwReB04JyImAfcARwjaRpwCrA/MB84QdJO3ai4mZl1\npurQzQ+BQ/PjB4FppCC/LJddTgr3fYDlEbE2ItYDNwFzK9fWzMwmrNLQTURsAv6Qnx4LXAm8ISI2\n5LLVwCxgJjBWN2utvKWRkakMD0+pUrWBNjo6vd9VKMqgtOeKflegQ4PSnoOil+1ZeYweQNJbSEH/\neuBXdS8NNZmlWfnjrFnz0GSqNbDGxtb1uwpFcXt2l9uzu7ZFezb78Kh81Y2kNwAnAQdExFrg95J2\nyC/PBlbmfzPrZquVm5lZj1Q9Gfss4O+AAyPigVy8DDg4Pz4YuAq4Bdhb0gxJO5LG52+YXJXNzGwi\nqg7dHAbsAnxbUq3sSOA8Se8D7gYujIhHJS0ElgJbgEW5929mZj1S9WTsYmBxg5cWNJh2CbCkynrM\nzGzy/M1YM7PCOejNzArnoDczK5yD3syscA56M7PCOejNzArnoDczK5yD3syscA56M7PCOejNzArn\noDczK5yD3syscA56M7PCOejNzArnoDczK5yD3syscA56M7PCOejNzApX9Z6xEyLpLOAVpPvGfjgi\nlvdivWZm1oMevaTXAC+MiFcCxwL/e1uv08zMHtOLoZvXAd8DiIhfAiOSntmD9ZqZGTC0ZcuWbboC\nSYuBKyLi0vz8BuDYiFixTVdsZmZAf07GDvVhnWZmT1m9CPqVwMy6588GVvVgvWZmRm+C/mrgEABJ\nLwdWRsS6HqzXzMzowRg9gKTPAq8GNgMfjIhbt/lKzcwM6FHQm5lZ//ibsWZmhXPQm5kVzkE/SZJm\n9LsOg0rSEy61lfScftSlJJJ26XcdSiLptf2uw2T15LduCvddYOAPhF6S9Dbgi8BUSVcC/6vuSqxv\n4PbsmKQ3A18A/gs4HvgmMCxpGvCBiLiyn/UbNJLePa5oCDhZ0qcAIuIbva/V5DnoOyDpA01eGgJm\n97IuhVgI7Ak8CBwHXC3pjRGxFn+hbqJOBhYAfwz8M/CWiLhV0h8BlwMO+ok5BfgdcAWPHYvPAJ7X\ntxp1gYO+MycCy2j8Ra/telyXEmyKiAfy48WS7gOWSjqQ9Aun1rkNEXEPcI+k39YuXY6I+yQ93Oe6\nDaI9gE8CLwVOjIi7cydkUZ/rNSkO+s68lfSrmx+OiA31L0ia35caDbYbJf0zcGhErI+IS3MoXQPs\n3Oe6DZr7JP1NRHw+IubC1vMcHyEN59gERMTDwEmSBJwj6UcUcC5z4DegFyLiZ8CBwKMNXv5Ij6sz\n8CLiY8DngYfrypYC84CB7jn1wVHAPePKdgXuJv0suFUQyYGkD8vf9Ls+k+UvTJmZFc49ejOzwjno\nzc
wK56A3Myucg97MrHAOejOzwv1/wmdg9PG/WdAAAAAASUVORK5CYII=\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"tags": []
}
}
]
},
{
"metadata": {
"id": "Znz5-1WjXO0l",
"colab_type": "code",
"outputId": "84cd125d-7f35-49e4-f03f-decbd78363ef",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 266
}
},
"cell_type": "code",
"source": [
"def prep_metadata(df):\n",
" # select process_id and pipeline\n",
" meta = df[['process_id', 'pipeline']].drop_duplicates().set_index('process_id') \n",
" \n",
" # convert categorical pipeline data to dummy variables\n",
" meta = pd.get_dummies(meta)\n",
" \n",
" # pipeline L12 not in test data\n",
" if 'L12' not in meta.columns:\n",
" meta['pipeline_L12'] = 0\n",
" \n",
" # calculate number of phases for each process_object\n",
" meta['num_phases'] = df.groupby('process_id')['phase'].apply(lambda x: x.nunique())\n",
" \n",
" return meta\n",
"\n",
"# show example for first 5,000 observations\n",
"prep_metadata(train_limited.head(5000))"
],
"execution_count": 11,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>pipeline_L3</th>\n",
" <th>pipeline_L4</th>\n",
" <th>pipeline_L7</th>\n",
" <th>pipeline_L12</th>\n",
" <th>num_phases</th>\n",
" </tr>\n",
" <tr>\n",
" <th>process_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>20001</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20002</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20003</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20004</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20005</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20008</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" pipeline_L3 pipeline_L4 pipeline_L7 pipeline_L12 num_phases\n",
"process_id \n",
"20001 0 1 0 0 4\n",
"20002 1 0 0 0 2\n",
"20003 1 0 0 0 3\n",
"20004 0 0 1 0 2\n",
"20005 0 0 1 0 1\n",
"20008 0 1 0 0 3"
]
},
"metadata": {
"tags": []
},
"execution_count": 11
}
]
},
{
"metadata": {
"id": "eC4wM7FBJodj",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 238
},
"outputId": "354649cd-dce5-4ade-b57b-66a3002f1539"
},
"cell_type": "code",
"source": [
"train_values.columns"
],
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Index(['process_id', 'object_id', 'phase', 'timestamp', 'pipeline',\n",
" 'supply_flow', 'supply_pressure', 'return_temperature',\n",
" 'return_conductivity', 'return_turbidity', 'return_flow', 'supply_pump',\n",
" 'supply_pre_rinse', 'supply_caustic', 'return_caustic', 'supply_acid',\n",
" 'return_acid', 'supply_clean_water', 'return_recovery_water',\n",
" 'return_drain', 'object_low_level', 'tank_level_pre_rinse',\n",
" 'tank_level_caustic', 'tank_level_acid', 'tank_level_clean_water',\n",
" 'tank_temperature_pre_rinse', 'tank_temperature_caustic',\n",
" 'tank_temperature_acid', 'tank_concentration_caustic',\n",
" 'tank_concentration_acid', 'tank_lsh_caustic', 'tank_lsh_acid',\n",
" 'tank_lsh_clean_water', 'tank_lsh_pre_rinse', 'target_time_period',\n",
" 'process_phase'],\n",
" dtype='object')"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"metadata": {
"id": "aR3kcbRYeFp7",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"def feature_engineering(df):\n",
" df.return_temperature = np.square(df.return_temperature)\n",
" df.return_turbidity = np.log(df.return_turbidity + 1)\n",
" df['tank_level_diff12'] = df['tank_level_pre_rinse'] - df['tank_level_caustic']\n",
" df['tank_level_diff23'] = df['tank_level_caustic'] - df['tank_level_acid']\n",
" df['tank_level_diff34'] = df['tank_level_acid'] - df['tank_level_clean_water']\n",
" df = df.drop(['tank_level_pre_rinse', 'tank_level_caustic', 'tank_level_acid', 'tank_level_clean_water'], axis=1)\n",
" df['tank_temp_diff12'] = df['tank_temperature_pre_rinse'] - df['tank_temperature_caustic']\n",
" df['tank_temp_diff23'] = df['tank_temperature_caustic'] - df['tank_temperature_acid']\n",
" df = df.drop(['tank_temperature_pre_rinse', 'tank_temperature_caustic', 'tank_temperature_acid'], axis=1)\n",
" df.loc[:, df.dtypes == bool] = df.loc[:, df.dtypes == bool].astype('int')\n",
" return df"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "fy17bIjoeYRu",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"outputId": "425ff1ef-6926-473c-8931-a08d11357f9d"
},
"cell_type": "code",
"source": [
"%%time\n",
"train_limited = feature_engineering(train_limited)\n",
"test_values = feature_engineering(test_values)"
],
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"text": [
"CPU times: user 3.68 s, sys: 1.31 s, total: 4.99 s\n",
"Wall time: 4.92 s\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "IImzXuHGXT8K",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# variables we'll use to create our time series features\n",
"ts_cols = [\n",
" 'process_id',\n",
" 'supply_flow',\n",
" 'supply_pressure',\n",
" 'return_temperature',\n",
" 'return_conductivity',\n",
" 'return_turbidity',\n",
" 'return_flow',\n",
" 'tank_concentration_caustic',\n",
" 'tank_concentration_acid',\n",
" 'tank_level_diff12',\n",
" 'tank_level_diff23',\n",
" 'tank_level_diff34',\n",
" 'tank_temp_diff12',\n",
" 'tank_temp_diff23',\n",
"]\n",
"\n",
"binary_cols = [\n",
" 'process_id',\n",
" 'supply_pump',\n",
" 'supply_pre_rinse', 'supply_caustic', 'return_caustic', 'supply_acid',\n",
" 'return_acid', 'supply_clean_water', 'return_recovery_water',\n",
" 'return_drain', 'object_low_level',\n",
" 'tank_lsh_caustic', \n",
" 'tank_lsh_acid',\n",
" 'tank_lsh_clean_water',\n",
" 'tank_lsh_pre_rinse',\n",
"]"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "X3w2BFK6XXMp",
"colab_type": "code",
"outputId": "60f97c17-f0bf-40e2-cae5-b714730676c7",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 346
}
},
"cell_type": "code",
"source": [
"def prep_time_series_features(df, columns=None):\n",
" if columns is None:\n",
" columns = df.columns\n",
" ts_df = df[ts_cols].set_index('process_id')\n",
" # create features: min, max, mean, standard deviation, and mean of the last five observations\n",
" ts_features = ts_df.groupby('process_id').agg(['min', 'max', 'mean', 'std', lambda x: x.tail(5).mean()])\n",
" bn_features = df[binary_cols].set_index('process_id')\n",
" bn_features = bn_features.groupby('process_id').agg(['mean', 'std', lambda x: x.tail(5).mean()])\n",
" feature_mat = pd.concat([ts_features,bn_features ], axis=1)\n",
" return feature_mat\n",
"\n",
"# show example for first 5,000 observations\n",
"prep_time_series_features(train_limited.head(5000), columns=ts_cols)"
],
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead tr th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe thead tr:last-of-type th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr>\n",
" <th></th>\n",
" <th colspan=\"5\" halign=\"left\">supply_flow</th>\n",
" <th colspan=\"5\" halign=\"left\">supply_pressure</th>\n",
" <th colspan=\"5\" halign=\"left\">return_temperature</th>\n",
" <th colspan=\"5\" halign=\"left\">return_conductivity</th>\n",
" <th>...</th>\n",
" <th colspan=\"2\" halign=\"left\">return_recovery_water</th>\n",
" <th colspan=\"3\" halign=\"left\">return_drain</th>\n",
" <th colspan=\"3\" halign=\"left\">object_low_level</th>\n",
" <th colspan=\"3\" halign=\"left\">tank_lsh_caustic</th>\n",
" <th colspan=\"3\" halign=\"left\">tank_lsh_acid</th>\n",
" <th colspan=\"3\" halign=\"left\">tank_lsh_clean_water</th>\n",
" <th colspan=\"3\" halign=\"left\">tank_lsh_pre_rinse</th>\n",
" </tr>\n",
" <tr>\n",
" <th></th>\n",
" <th>min</th>\n",
" <th>max</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>&lt;lambda&gt;</th>\n",
" <th>min</th>\n",
" <th>max</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>&lt;lambda&gt;</th>\n",
" <th>min</th>\n",
" <th>max</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>&lt;lambda&gt;</th>\n",
" <th>min</th>\n",
" <th>max</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>&lt;lambda&gt;</th>\n",
" <th>...</th>\n",
" <th>std</th>\n",
" <th>&lt;lambda&gt;</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>&lt;lambda&gt;</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>&lt;lambda&gt;</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>&lt;lambda&gt;</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>&lt;lambda&gt;</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>&lt;lambda&gt;</th>\n",
" <th>mean</th>\n",
" <th>std</th>\n",
" <th>&lt;lambda&gt;</th>\n",
" </tr>\n",
" <tr>\n",
" <th>process_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>20001</th>\n",
" <td>21.701390</td>\n",
" <td>59396.703</td>\n",
" <td>49501.604051</td>\n",
" <td>12445.724586</td>\n",
" <td>48618.3452</td>\n",
" <td>-0.036024</td>\n",
" <td>2.223741</td>\n",
" <td>1.673456</td>\n",
" <td>0.344843</td>\n",
" <td>1.204688</td>\n",
" <td>192.901238</td>\n",
" <td>6811.263623</td>\n",
" <td>4785.290155</td>\n",
" <td>2424.995471</td>\n",
" <td>5250.902249</td>\n",
" <td>0.255486</td>\n",
" <td>57.301300</td>\n",
" <td>32.976699</td>\n",
" <td>18.421732</td>\n",
" <td>42.254814</td>\n",
" <td>...</td>\n",
" <td>0.239511</td>\n",
" <td>0</td>\n",
" <td>0.231579</td>\n",
" <td>0.421989</td>\n",
" <td>0</td>\n",
" <td>0.065263</td>\n",
" <td>0.247076</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20002</th>\n",
" <td>7.233796</td>\n",
" <td>34295.430</td>\n",
" <td>27142.963425</td>\n",
" <td>8337.197885</td>\n",
" <td>29688.2238</td>\n",
" <td>-0.034071</td>\n",
" <td>2.170790</td>\n",
" <td>1.477925</td>\n",
" <td>0.421852</td>\n",
" <td>1.552300</td>\n",
" <td>76.423316</td>\n",
" <td>5835.815584</td>\n",
" <td>3397.445078</td>\n",
" <td>2014.266483</td>\n",
" <td>5827.088734</td>\n",
" <td>0.172301</td>\n",
" <td>46.425180</td>\n",
" <td>30.834840</td>\n",
" <td>17.792754</td>\n",
" <td>42.065181</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0</td>\n",
" <td>0.264574</td>\n",
" <td>0.441601</td>\n",
" <td>0</td>\n",
" <td>0.094170</td>\n",
" <td>0.292394</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20003</th>\n",
" <td>-1244.213000</td>\n",
" <td>103096.070</td>\n",
" <td>29902.913654</td>\n",
" <td>10872.426632</td>\n",
" <td>30179.3980</td>\n",
" <td>-0.033854</td>\n",
" <td>3.855469</td>\n",
" <td>2.630566</td>\n",
" <td>0.917518</td>\n",
" <td>3.092057</td>\n",
" <td>460.491127</td>\n",
" <td>6656.335770</td>\n",
" <td>5000.020521</td>\n",
" <td>1892.554316</td>\n",
" <td>5108.878406</td>\n",
" <td>0.625647</td>\n",
" <td>46.821130</td>\n",
" <td>36.146996</td>\n",
" <td>15.031208</td>\n",
" <td>43.976728</td>\n",
" <td>...</td>\n",
" <td>0.198993</td>\n",
" <td>0</td>\n",
" <td>0.113953</td>\n",
" <td>0.317847</td>\n",
" <td>0</td>\n",
" <td>0.026744</td>\n",
" <td>0.161382</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20004</th>\n",
" <td>-43.402780</td>\n",
" <td>49537.035</td>\n",
" <td>31117.193119</td>\n",
" <td>13046.723480</td>\n",
" <td>33796.2958</td>\n",
" <td>-0.009549</td>\n",
" <td>0.482422</td>\n",
" <td>0.287865</td>\n",
" <td>0.163790</td>\n",
" <td>0.343012</td>\n",
" <td>117.816067</td>\n",
" <td>6741.592292</td>\n",
" <td>3915.156547</td>\n",
" <td>2019.189578</td>\n",
" <td>5125.022720</td>\n",
" <td>0.198415</td>\n",
" <td>46.577435</td>\n",
" <td>29.147263</td>\n",
" <td>18.143533</td>\n",
" <td>44.471240</td>\n",
" <td>...</td>\n",
" <td>0.125975</td>\n",
" <td>0</td>\n",
" <td>0.230563</td>\n",
" <td>0.421759</td>\n",
" <td>0</td>\n",
" <td>0.016086</td>\n",
" <td>0.125975</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20005</th>\n",
" <td>0.000000</td>\n",
" <td>31295.209</td>\n",
" <td>28241.716353</td>\n",
" <td>7107.794924</td>\n",
" <td>30962.0944</td>\n",
" <td>-0.023438</td>\n",
" <td>0.487196</td>\n",
" <td>0.418469</td>\n",
" <td>0.128794</td>\n",
" <td>0.468186</td>\n",
" <td>323.526197</td>\n",
" <td>5115.188500</td>\n",
" <td>3731.087905</td>\n",
" <td>1873.209470</td>\n",
" <td>5028.949060</td>\n",
" <td>0.297533</td>\n",
" <td>50.747000</td>\n",
" <td>33.776595</td>\n",
" <td>18.339520</td>\n",
" <td>42.345008</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0</td>\n",
" <td>0.248031</td>\n",
" <td>0.432723</td>\n",
" <td>0</td>\n",
" <td>0.318898</td>\n",
" <td>0.466969</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20008</th>\n",
" <td>28638.598000</td>\n",
" <td>60980.902</td>\n",
" <td>51773.149642</td>\n",
" <td>8205.815088</td>\n",
" <td>55064.3812</td>\n",
" <td>-0.092665</td>\n",
" <td>0.174913</td>\n",
" <td>0.167884</td>\n",
" <td>0.036967</td>\n",
" <td>0.135851</td>\n",
" <td>163.937807</td>\n",
" <td>6862.104422</td>\n",
" <td>6083.104591</td>\n",
" <td>1779.620290</td>\n",
" <td>869.118722</td>\n",
" <td>1.070271</td>\n",
" <td>53.216606</td>\n",
" <td>40.437314</td>\n",
" <td>11.592061</td>\n",
" <td>48.014119</td>\n",
" <td>...</td>\n",
" <td>0.237828</td>\n",
" <td>1</td>\n",
" <td>0.061381</td>\n",
" <td>0.240182</td>\n",
" <td>0</td>\n",
" <td>0.021739</td>\n",
" <td>0.145924</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>6 rows × 107 columns</p>\n",
"</div>"
],
"text/plain": [
" supply_flow \\\n",
" min max mean std <lambda> \n",
"process_id \n",
"20001 21.701390 59396.703 49501.604051 12445.724586 48618.3452 \n",
"20002 7.233796 34295.430 27142.963425 8337.197885 29688.2238 \n",
"20003 -1244.213000 103096.070 29902.913654 10872.426632 30179.3980 \n",
"20004 -43.402780 49537.035 31117.193119 13046.723480 33796.2958 \n",
"20005 0.000000 31295.209 28241.716353 7107.794924 30962.0944 \n",
"20008 28638.598000 60980.902 51773.149642 8205.815088 55064.3812 \n",
"\n",
" supply_pressure \\\n",
" min max mean std <lambda> \n",
"process_id \n",
"20001 -0.036024 2.223741 1.673456 0.344843 1.204688 \n",
"20002 -0.034071 2.170790 1.477925 0.421852 1.552300 \n",
"20003 -0.033854 3.855469 2.630566 0.917518 3.092057 \n",
"20004 -0.009549 0.482422 0.287865 0.163790 0.343012 \n",
"20005 -0.023438 0.487196 0.418469 0.128794 0.468186 \n",
"20008 -0.092665 0.174913 0.167884 0.036967 0.135851 \n",
"\n",
" return_temperature \\\n",
" min max mean std \n",
"process_id \n",
"20001 192.901238 6811.263623 4785.290155 2424.995471 \n",
"20002 76.423316 5835.815584 3397.445078 2014.266483 \n",
"20003 460.491127 6656.335770 5000.020521 1892.554316 \n",
"20004 117.816067 6741.592292 3915.156547 2019.189578 \n",
"20005 323.526197 5115.188500 3731.087905 1873.209470 \n",
"20008 163.937807 6862.104422 6083.104591 1779.620290 \n",
"\n",
" return_conductivity \\\n",
" <lambda> min max mean std \n",
"process_id \n",
"20001 5250.902249 0.255486 57.301300 32.976699 18.421732 \n",
"20002 5827.088734 0.172301 46.425180 30.834840 17.792754 \n",
"20003 5108.878406 0.625647 46.821130 36.146996 15.031208 \n",
"20004 5125.022720 0.198415 46.577435 29.147263 18.143533 \n",
"20005 5028.949060 0.297533 50.747000 33.776595 18.339520 \n",
"20008 869.118722 1.070271 53.216606 40.437314 11.592061 \n",
"\n",
" ... return_recovery_water return_drain \\\n",
" <lambda> ... std <lambda> mean \n",
"process_id ... \n",
"20001 42.254814 ... 0.239511 0 0.231579 \n",
"20002 42.065181 ... 0.000000 0 0.264574 \n",
"20003 43.976728 ... 0.198993 0 0.113953 \n",
"20004 44.471240 ... 0.125975 0 0.230563 \n",
"20005 42.345008 ... 0.000000 0 0.248031 \n",
"20008 48.014119 ... 0.237828 1 0.061381 \n",
"\n",
" object_low_level \\\n",
" std <lambda> mean std <lambda> \n",
"process_id \n",
"20001 0.421989 0 0.065263 0.247076 0 \n",
"20002 0.441601 0 0.094170 0.292394 0 \n",
"20003 0.317847 0 0.026744 0.161382 0 \n",
"20004 0.421759 0 0.016086 0.125975 0 \n",
"20005 0.432723 0 0.318898 0.466969 0 \n",
"20008 0.240182 0 0.021739 0.145924 0 \n",
"\n",
" tank_lsh_caustic tank_lsh_acid \\\n",
" mean std <lambda> mean std <lambda> \n",
"process_id \n",
"20001 0 0.0 0 0.0 0.0 0.0 \n",
"20002 0 0.0 0 0.0 0.0 0.0 \n",
"20003 0 0.0 0 0.0 0.0 0.0 \n",
"20004 0 0.0 0 0.0 0.0 0.0 \n",
"20005 0 0.0 0 0.0 0.0 0.0 \n",
"20008 0 0.0 0 0.0 0.0 0.0 \n",
"\n",
" tank_lsh_clean_water tank_lsh_pre_rinse \n",
" mean std <lambda> mean std <lambda> \n",
"process_id \n",
"20001 0 0.0 0 0.0 0.0 0.0 \n",
"20002 0 0.0 0 0.0 0.0 0.0 \n",
"20003 0 0.0 0 0.0 0.0 0.0 \n",
"20004 0 0.0 0 0.0 0.0 0.0 \n",
"20005 0 0.0 0 0.0 0.0 0.0 \n",
"20008 0 0.0 0 0.0 0.0 0.0 \n",
"\n",
"[6 rows x 107 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 16
}
]
},
{
"metadata": {
"id": "veSTzIRLXZLv",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"def create_feature_matrix(df):\n",
" metadata = prep_metadata(df)\n",
" time_series = prep_time_series_features(df)\n",
" \n",
" # join metadata and time series features into a single dataframe\n",
" feature_matrix = pd.concat([metadata, time_series], axis=1)\n",
" \n",
" return feature_matrix"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "wn-5TjyvXcnl",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"outputId": "bd7765f7-fbe2-4729-ced3-76bfa9b380c1"
},
"cell_type": "code",
"source": [
"%%time\n",
"train_features = create_feature_matrix(train_limited)\n",
"# create metadata and time series features\n",
"test_features = create_feature_matrix(test_values)"
],
"execution_count": 18,
"outputs": [
{
"output_type": "stream",
"text": [
"CPU times: user 34.3 s, sys: 1.21 s, total: 35.6 s\n",
"Wall time: 35.2 s\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "2VmDoffLXe3H",
"colab_type": "code",
"outputId": "51d774a1-10db-409b-bb98-b44aff13314c",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 293
}
},
"cell_type": "code",
"source": [
"train_features.head()"
],
"execution_count": 19,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>pipeline_L1</th>\n",
" <th>pipeline_L10</th>\n",
" <th>pipeline_L11</th>\n",
" <th>pipeline_L12</th>\n",
" <th>pipeline_L2</th>\n",
" <th>pipeline_L3</th>\n",
" <th>pipeline_L4</th>\n",
" <th>pipeline_L6</th>\n",
" <th>pipeline_L7</th>\n",
" <th>pipeline_L8</th>\n",
" <th>pipeline_L9</th>\n",
" <th>num_phases</th>\n",
" <th>(supply_flow, min)</th>\n",
" <th>(supply_flow, max)</th>\n",
" <th>(supply_flow, mean)</th>\n",
" <th>(supply_flow, std)</th>\n",
" <th>(supply_flow, &lt;lambda&gt;)</th>\n",
" <th>(supply_pressure, min)</th>\n",
" <th>(supply_pressure, max)</th>\n",
" <th>(supply_pressure, mean)</th>\n",
" <th>...</th>\n",
" <th>(return_recovery_water, std)</th>\n",
" <th>(return_recovery_water, &lt;lambda&gt;)</th>\n",
" <th>(return_drain, mean)</th>\n",
" <th>(return_drain, std)</th>\n",
" <th>(return_drain, &lt;lambda&gt;)</th>\n",
" <th>(object_low_level, mean)</th>\n",
" <th>(object_low_level, std)</th>\n",
" <th>(object_low_level, &lt;lambda&gt;)</th>\n",
" <th>(tank_lsh_caustic, mean)</th>\n",
" <th>(tank_lsh_caustic, std)</th>\n",
" <th>(tank_lsh_caustic, &lt;lambda&gt;)</th>\n",
" <th>(tank_lsh_acid, mean)</th>\n",
" <th>(tank_lsh_acid, std)</th>\n",
" <th>(tank_lsh_acid, &lt;lambda&gt;)</th>\n",
" <th>(tank_lsh_clean_water, mean)</th>\n",
" <th>(tank_lsh_clean_water, std)</th>\n",
" <th>(tank_lsh_clean_water, &lt;lambda&gt;)</th>\n",
" <th>(tank_lsh_pre_rinse, mean)</th>\n",
" <th>(tank_lsh_pre_rinse, std)</th>\n",
" <th>(tank_lsh_pre_rinse, &lt;lambda&gt;)</th>\n",
" </tr>\n",
" <tr>\n",
" <th>process_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>20001</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>4</td>\n",
" <td>21.701390</td>\n",
" <td>59396.703</td>\n",
" <td>49501.604051</td>\n",
" <td>12445.724586</td>\n",
" <td>48618.3452</td>\n",
" <td>-0.036024</td>\n",
" <td>2.223741</td>\n",
" <td>1.673456</td>\n",
" <td>...</td>\n",
" <td>0.239511</td>\n",
" <td>0.0</td>\n",
" <td>0.231579</td>\n",
" <td>0.421989</td>\n",
" <td>0</td>\n",
" <td>0.065263</td>\n",
" <td>0.247076</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20002</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>7.233796</td>\n",
" <td>34295.430</td>\n",
" <td>27142.963425</td>\n",
" <td>8337.197885</td>\n",
" <td>29688.2238</td>\n",
" <td>-0.034071</td>\n",
" <td>2.170790</td>\n",
" <td>1.477925</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.264574</td>\n",
" <td>0.441601</td>\n",
" <td>0</td>\n",
" <td>0.094170</td>\n",
" <td>0.292394</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20003</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>-1244.213000</td>\n",
" <td>103096.070</td>\n",
" <td>29902.913654</td>\n",
" <td>10872.426632</td>\n",
" <td>30179.3980</td>\n",
" <td>-0.033854</td>\n",
" <td>3.855469</td>\n",
" <td>2.630566</td>\n",
" <td>...</td>\n",
" <td>0.198993</td>\n",
" <td>0.0</td>\n",
" <td>0.113953</td>\n",
" <td>0.317847</td>\n",
" <td>0</td>\n",
" <td>0.026744</td>\n",
" <td>0.161382</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20004</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>-43.402780</td>\n",
" <td>49537.035</td>\n",
" <td>31117.193119</td>\n",
" <td>13046.723480</td>\n",
" <td>33796.2958</td>\n",
" <td>-0.009549</td>\n",
" <td>0.482422</td>\n",
" <td>0.287865</td>\n",
" <td>...</td>\n",
" <td>0.125975</td>\n",
" <td>0.0</td>\n",
" <td>0.230563</td>\n",
" <td>0.421759</td>\n",
" <td>0</td>\n",
" <td>0.016086</td>\n",
" <td>0.125975</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20005</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0.000000</td>\n",
" <td>31295.209</td>\n",
" <td>28241.716353</td>\n",
" <td>7107.794924</td>\n",
" <td>30962.0944</td>\n",
" <td>-0.023438</td>\n",
" <td>0.487196</td>\n",
" <td>0.418469</td>\n",
" <td>...</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.248031</td>\n",
" <td>0.432723</td>\n",
" <td>0</td>\n",
" <td>0.318898</td>\n",
" <td>0.466969</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 119 columns</p>\n",
"</div>"
],
"text/plain": [
" pipeline_L1 pipeline_L10 pipeline_L11 pipeline_L12 \\\n",
"process_id \n",
"20001 0 0 0 0 \n",
"20002 0 0 0 0 \n",
"20003 0 0 0 0 \n",
"20004 0 0 0 0 \n",
"20005 0 0 0 0 \n",
"\n",
" pipeline_L2 pipeline_L3 pipeline_L4 pipeline_L6 pipeline_L7 \\\n",
"process_id \n",
"20001 0 0 1 0 0 \n",
"20002 0 1 0 0 0 \n",
"20003 0 1 0 0 0 \n",
"20004 0 0 0 0 1 \n",
"20005 0 0 0 0 1 \n",
"\n",
" pipeline_L8 pipeline_L9 num_phases (supply_flow, min) \\\n",
"process_id \n",
"20001 0 0 4 21.701390 \n",
"20002 0 0 2 7.233796 \n",
"20003 0 0 3 -1244.213000 \n",
"20004 0 0 2 -43.402780 \n",
"20005 0 0 1 0.000000 \n",
"\n",
" (supply_flow, max) (supply_flow, mean) (supply_flow, std) \\\n",
"process_id \n",
"20001 59396.703 49501.604051 12445.724586 \n",
"20002 34295.430 27142.963425 8337.197885 \n",
"20003 103096.070 29902.913654 10872.426632 \n",
"20004 49537.035 31117.193119 13046.723480 \n",
"20005 31295.209 28241.716353 7107.794924 \n",
"\n",
" (supply_flow, <lambda>) (supply_pressure, min) \\\n",
"process_id \n",
"20001 48618.3452 -0.036024 \n",
"20002 29688.2238 -0.034071 \n",
"20003 30179.3980 -0.033854 \n",
"20004 33796.2958 -0.009549 \n",
"20005 30962.0944 -0.023438 \n",
"\n",
" (supply_pressure, max) (supply_pressure, mean) \\\n",
"process_id \n",
"20001 2.223741 1.673456 \n",
"20002 2.170790 1.477925 \n",
"20003 3.855469 2.630566 \n",
"20004 0.482422 0.287865 \n",
"20005 0.487196 0.418469 \n",
"\n",
" ... (return_recovery_water, std) \\\n",
"process_id ... \n",
"20001 ... 0.239511 \n",
"20002 ... 0.000000 \n",
"20003 ... 0.198993 \n",
"20004 ... 0.125975 \n",
"20005 ... 0.000000 \n",
"\n",
" (return_recovery_water, <lambda>) (return_drain, mean) \\\n",
"process_id \n",
"20001 0.0 0.231579 \n",
"20002 0.0 0.264574 \n",
"20003 0.0 0.113953 \n",
"20004 0.0 0.230563 \n",
"20005 0.0 0.248031 \n",
"\n",
" (return_drain, std) (return_drain, <lambda>) \\\n",
"process_id \n",
"20001 0.421989 0 \n",
"20002 0.441601 0 \n",
"20003 0.317847 0 \n",
"20004 0.421759 0 \n",
"20005 0.432723 0 \n",
"\n",
" (object_low_level, mean) (object_low_level, std) \\\n",
"process_id \n",
"20001 0.065263 0.247076 \n",
"20002 0.094170 0.292394 \n",
"20003 0.026744 0.161382 \n",
"20004 0.016086 0.125975 \n",
"20005 0.318898 0.466969 \n",
"\n",
" (object_low_level, <lambda>) (tank_lsh_caustic, mean) \\\n",
"process_id \n",
"20001 0.0 0.0 \n",
"20002 0.0 0.0 \n",
"20003 0.0 0.0 \n",
"20004 0.0 0.0 \n",
"20005 0.0 0.0 \n",
"\n",
" (tank_lsh_caustic, std) (tank_lsh_caustic, <lambda>) \\\n",
"process_id \n",
"20001 0.0 0.0 \n",
"20002 0.0 0.0 \n",
"20003 0.0 0.0 \n",
"20004 0.0 0.0 \n",
"20005 0.0 0.0 \n",
"\n",
" (tank_lsh_acid, mean) (tank_lsh_acid, std) \\\n",
"process_id \n",
"20001 0.0 0.0 \n",
"20002 0.0 0.0 \n",
"20003 0.0 0.0 \n",
"20004 0.0 0.0 \n",
"20005 0.0 0.0 \n",
"\n",
" (tank_lsh_acid, <lambda>) (tank_lsh_clean_water, mean) \\\n",
"process_id \n",
"20001 0.0 0.0 \n",
"20002 0.0 0.0 \n",
"20003 0.0 0.0 \n",
"20004 0.0 0.0 \n",
"20005 0.0 0.0 \n",
"\n",
" (tank_lsh_clean_water, std) (tank_lsh_clean_water, <lambda>) \\\n",
"process_id \n",
"20001 0.0 0 \n",
"20002 0.0 0 \n",
"20003 0.0 0 \n",
"20004 0.0 0 \n",
"20005 0.0 0 \n",
"\n",
" (tank_lsh_pre_rinse, mean) (tank_lsh_pre_rinse, std) \\\n",
"process_id \n",
"20001 0.0 0.0 \n",
"20002 0.0 0.0 \n",
"20003 0.0 0.0 \n",
"20004 0.0 0.0 \n",
"20005 0.0 0.0 \n",
"\n",
" (tank_lsh_pre_rinse, <lambda>) \n",
"process_id \n",
"20001 0.0 \n",
"20002 0.0 \n",
"20003 0.0 \n",
"20004 0.0 \n",
"20005 0.0 \n",
"\n",
"[5 rows x 119 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 19
}
]
},
{
"metadata": {
"id": "tUpLybrMdWW3",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"train_label_transformed = np.log(train_labels + 1)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "0-tBroG3ZBR1",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"def mean_absolute_percentage_error(y_true, y_pred, threshold=290000):\n",
"    \"\"\"Mean absolute percentage error with a floor on the denominator.\n",
"\n",
"    Computes mean(|y_true - y_pred| / max(y_true, threshold)).\n",
"\n",
"    Both inputs are flattened to 1-D before the subtraction. Without that,\n",
"    an (n,) target and an (n, 1) prediction broadcast to an (n, n) matrix\n",
"    and the result is the average over every target/prediction pair rather\n",
"    than over matching rows.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    y_true : array-like of true values.\n",
"    y_pred : array-like of predictions, same number of elements as y_true.\n",
"    threshold : smallest denominator allowed (default 290000).\n",
"\n",
"    Returns\n",
"    -------\n",
"    float\n",
"    \"\"\"\n",
"    y_true = np.asarray(y_true, dtype=float).ravel()\n",
"    y_pred = np.asarray(y_pred, dtype=float).ravel()\n",
"    return np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, threshold)))"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "ynI8h0JidrLj",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"import lightgbm as lgb\n",
"params = {'application': 'regression',\n",
" 'boosting': 'gbdt',\n",
" 'metric': 'rmse',\n",
" 'num_leaves': 80,\n",
" 'max_depth': 11,\n",
" 'learning_rate': 0.01,\n",
" 'bagging_fraction': 0.9,\n",
" 'feature_fraction': 0.8,\n",
" 'min_split_gain': 0.01,\n",
" 'min_child_samples': 150,\n",
" 'min_child_weight': 0.1,\n",
" 'verbosity': -1,\n",
" 'data_random_seed': 3,\n",
" 'early_stop': 100,\n",
" 'verbose_eval': 100,\n",
" 'num_rounds': 10000}"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "O6azR6OpHlKZ",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"from sklearn.model_selection import KFold"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "KKs9SFuKy1VB",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"train_labels = np.asarray(train_labels).reshape(-1)  # idempotent: works for a DataFrame and for an already-flattened array"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "TfAUmhUDuKSn",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"train_label_transformed = np.asarray(train_label_transformed).reshape(-1)  # idempotent: safe to re-run"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "OES_VmjlHt-R",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 2114
},
"outputId": "9efd79f7-40b2-4d2c-cbf7-9fe703d5e44d"
},
"cell_type": "code",
"source": [
"def run_cv_model(train, test, target, real_target, model_fn, params=None, eval_fn=None, label='model', n_splits=5):\n",
"    \"\"\"Run shuffled K-fold cross-validation with `model_fn` and gather its predictions.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    train : DataFrame or 2-D ndarray of training features.\n",
"    test : feature set passed to every fold model; fold predictions are averaged.\n",
"    target : 1-D array the model is fitted on.\n",
"    real_target : 1-D array used only for scoring, compared with exp(prediction).\n",
"    model_fn : callable(dev_X, dev_y, val_X, val_y, test, params) returning\n",
"        (validation predictions, test predictions, feature importances).\n",
"    params : dict handed to `model_fn`; every fold receives its own copy.\n",
"    eval_fn : optional callable(y_true, y_pred) used to score each fold.\n",
"    label : name used in the progress output.\n",
"    n_splits : number of folds (default 5).\n",
"\n",
"    Returns\n",
"    -------\n",
"    dict with keys 'label', 'train' (out-of-fold predictions, shape (n, 1)),\n",
"    'test' (fold-averaged test predictions), 'cv' (per-fold scores) and\n",
"    'importance' (DataFrame with columns feature, importance, fold).\n",
"    \"\"\"\n",
"    params = {} if params is None else params\n",
"    kf = KFold(n_splits=n_splits, random_state=42, shuffle=True)\n",
"    is_frame = isinstance(train, pd.DataFrame)\n",
"    # Plain arrays have no column labels, so fall back to positional feature ids.\n",
"    feature_names = train.columns.values if is_frame else np.arange(train.shape[1])\n",
"    pred_full_test = 0\n",
"    pred_train = np.zeros((train.shape[0], 1))\n",
"    fold_importances = []\n",
"    cv_scores = []\n",
"    for i, (dev_index, val_index) in enumerate(kf.split(train, target), start=1):\n",
"        print('Started ' + label + ' fold ' + str(i) + '/' + str(n_splits))\n",
"        if is_frame:\n",
"            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]\n",
"        else:\n",
"            dev_X, val_X = train[dev_index], train[val_index]\n",
"        dev_y, val_y = target[dev_index], target[val_index]\n",
"        val_y_real = real_target[val_index]\n",
"        pred_val_y, pred_test_y, importances = model_fn(dev_X, dev_y, val_X, val_y, test, params.copy())\n",
"        pred_full_test = pred_full_test + pred_test_y\n",
"        pred_train[val_index] = pred_val_y\n",
"        if eval_fn is not None:\n",
"            # Flatten both sides: an (n,) target against an (n, 1) prediction\n",
"            # would otherwise broadcast to (n, n) inside eval_fn and distort the score.\n",
"            cv_score = eval_fn(np.ravel(val_y_real), np.ravel(np.exp(pred_val_y)))\n",
"            cv_scores.append(cv_score)\n",
"            print(label + ' cv score {}: MAPE {} '.format(i, cv_score))\n",
"        fold_importance_df = pd.DataFrame()\n",
"        fold_importance_df['feature'] = feature_names\n",
"        fold_importance_df['importance'] = importances\n",
"        fold_importance_df['fold'] = i\n",
"        fold_importances.append(fold_importance_df)\n",
"    # Concatenate once instead of growing a DataFrame inside the loop.\n",
"    feature_importance_df = pd.concat(fold_importances, axis=0)\n",
"    print('{} cv MAPE scores : {}'.format(label, cv_scores))\n",
"    if cv_scores:\n",
"        print('{} cv mean MAPE score : {}'.format(label, np.mean(cv_scores)))\n",
"        print('{} cv std MAPE score : {}'.format(label, np.std(cv_scores)))\n",
"    pred_full_test = pred_full_test / float(n_splits)\n",
"    results = {'label': label,\n",
"               'train': pred_train, 'test': pred_full_test,\n",
"               'cv': cv_scores, 'importance': feature_importance_df}\n",
"    return results\n",
"\n",
"params = {'application': 'regression',\n",
" 'boosting': 'gbdt',\n",
" 'metric': 'mape',\n",
" 'num_leaves': 80,\n",
" 'max_depth': 11,\n",
" 'learning_rate': 0.01,\n",
" 'bagging_fraction': 0.9,\n",
" 'feature_fraction': 0.8,\n",
" 'min_split_gain': 0.01,\n",
" 'min_child_samples': 150,\n",
" 'min_child_weight': 0.1,\n",
" 'verbosity': -1,\n",
" 'data_random_seed': 3,\n",
" 'early_stop': 100,\n",
" 'verbose_eval': 100,\n",
" 'num_rounds': 10000,\n",
" 'seed': 42}\n",
"\n",
"\n",
"def runLGB(train_X, train_y, test_X, test_y, test_X2, params):\n",
"    \"\"\"Fit one LightGBM model, then predict `test_X` and `test_X2`.\n",
"\n",
"    train_X / train_y are the fitting data. test_X / test_y form the second\n",
"    entry of `valid_sets` (the fitting data is the first), so they drive the\n",
"    logged validation metric and early stopping. test_X2 is an additional\n",
"    feature set that is only predicted.\n",
"\n",
"    `params` is a LightGBM parameter dict that must also contain 'num_rounds'\n",
"    and 'verbose_eval' and may contain 'early_stop'. These control keys are\n",
"    removed from a private copy before the rest is passed to lgb.train, so\n",
"    the caller's dict is left untouched.\n",
"\n",
"    Returns (predictions for test_X, predictions for test_X2, feature\n",
"    importances); both prediction arrays are reshaped to a single column.\n",
"    \"\"\"\n",
"    print('Prep LGB')\n",
"    d_train = lgb.Dataset(train_X, label=train_y)\n",
"    d_valid = lgb.Dataset(test_X, label=test_y)\n",
"    watchlist = [d_train, d_valid]\n",
"    print('Train LGB')\n",
"    # Copy first: popping from the caller's dict would break a second call with it.\n",
"    params = dict(params)\n",
"    num_rounds = params.pop('num_rounds')\n",
"    verbose_eval = params.pop('verbose_eval')\n",
"    # A missing or falsy 'early_stop' disables early stopping and is never forwarded to LightGBM.\n",
"    early_stop = params.pop('early_stop', None) or None\n",
"    model = lgb.train(params,\n",
"                      train_set=d_train,\n",
"                      num_boost_round=num_rounds,\n",
"                      valid_sets=watchlist,\n",
"                      verbose_eval=verbose_eval,\n",
"                      early_stopping_rounds=early_stop)\n",
"    print('Predict 1/2')\n",
"    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)\n",
"    print('Predict 2/2')\n",
"    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)\n",
"    return pred_test_y.reshape(-1, 1), pred_test_y2.reshape(-1, 1), model.feature_importance()\n",
"\n",
"results = run_cv_model(train_features, test_features, train_label_transformed, train_labels, runLGB, params, mean_absolute_percentage_error)"
],
"execution_count": 83,
"outputs": [
{
"output_type": "stream",
"text": [
"Started model fold 1/5\n",
"Prep LGB\n",
"Train LGB\n",
"Training until validation scores don't improve for 100 rounds.\n",
"[100]\ttraining's mape: 0.0508964\tvalid_1's mape: 0.0516945\n",
"[200]\ttraining's mape: 0.0387995\tvalid_1's mape: 0.040767\n",
"[300]\ttraining's mape: 0.0339207\tvalid_1's mape: 0.037187\n",
"[400]\ttraining's mape: 0.0314788\tvalid_1's mape: 0.0357507\n",
"[500]\ttraining's mape: 0.0299509\tvalid_1's mape: 0.0352325\n",
"[600]\ttraining's mape: 0.0287523\tvalid_1's mape: 0.0348991\n",
"[700]\ttraining's mape: 0.0277569\tvalid_1's mape: 0.0347556\n",
"[800]\ttraining's mape: 0.0268199\tvalid_1's mape: 0.0345863\n",
"[900]\ttraining's mape: 0.0259208\tvalid_1's mape: 0.0344536\n",
"[1000]\ttraining's mape: 0.0251064\tvalid_1's mape: 0.0343589\n",
"[1100]\ttraining's mape: 0.0243731\tvalid_1's mape: 0.0343142\n",
"[1200]\ttraining's mape: 0.0236688\tvalid_1's mape: 0.0342496\n",
"[1300]\ttraining's mape: 0.0229801\tvalid_1's mape: 0.034255\n",
"Early stopping, best iteration is:\n",
"[1203]\ttraining's mape: 0.0236481\tvalid_1's mape: 0.0342474\n",
"Predict 1/2\n",
"Predict 2/2\n",
"model cv score 1: MAPE 2.1235148650583446 \n",
"Started model fold 2/5\n",
"Prep LGB\n",
"Train LGB\n",
"Training until validation scores don't improve for 100 rounds.\n",
"[100]\ttraining's mape: 0.0509111\tvalid_1's mape: 0.0535342\n",
"[200]\ttraining's mape: 0.0390031\tvalid_1's mape: 0.0418574\n",
"[300]\ttraining's mape: 0.034349\tvalid_1's mape: 0.037871\n",
"[400]\ttraining's mape: 0.0317938\tvalid_1's mape: 0.035915\n",
"[500]\ttraining's mape: 0.0302896\tvalid_1's mape: 0.0352378\n",
"[600]\ttraining's mape: 0.0289782\tvalid_1's mape: 0.0347424\n",
"[700]\ttraining's mape: 0.0279389\tvalid_1's mape: 0.0344295\n",
"[800]\ttraining's mape: 0.0269706\tvalid_1's mape: 0.0341808\n",
"[900]\ttraining's mape: 0.026141\tvalid_1's mape: 0.0340406\n",
"[1000]\ttraining's mape: 0.0253568\tvalid_1's mape: 0.0339143\n",
"[1100]\ttraining's mape: 0.0245863\tvalid_1's mape: 0.0338124\n",
"[1200]\ttraining's mape: 0.0238915\tvalid_1's mape: 0.0337311\n",
"[1300]\ttraining's mape: 0.0232652\tvalid_1's mape: 0.0336871\n",
"[1400]\ttraining's mape: 0.0226265\tvalid_1's mape: 0.0336303\n",
"[1500]\ttraining's mape: 0.0220056\tvalid_1's mape: 0.0335803\n",
"[1600]\ttraining's mape: 0.0214327\tvalid_1's mape: 0.0335356\n",
"[1700]\ttraining's mape: 0.0208814\tvalid_1's mape: 0.0334652\n",
"[1800]\ttraining's mape: 0.0203417\tvalid_1's mape: 0.0334476\n",
"[1900]\ttraining's mape: 0.0198237\tvalid_1's mape: 0.0334427\n",
"Early stopping, best iteration is:\n",
"[1867]\ttraining's mape: 0.0199848\tvalid_1's mape: 0.033425\n",
"Predict 1/2\n",
"Predict 2/2\n",
"model cv score 2: MAPE 2.07144347354752 \n",
"Started model fold 3/5\n",
"Prep LGB\n",
"Train LGB\n",
"Training until validation scores don't improve for 100 rounds.\n",
"[100]\ttraining's mape: 0.0508453\tvalid_1's mape: 0.0507927\n",
"[200]\ttraining's mape: 0.0385358\tvalid_1's mape: 0.0405767\n",
"[300]\ttraining's mape: 0.033774\tvalid_1's mape: 0.0372539\n",
"[400]\ttraining's mape: 0.0312947\tvalid_1's mape: 0.0359226\n",
"[500]\ttraining's mape: 0.0297466\tvalid_1's mape: 0.0353496\n",
"[600]\ttraining's mape: 0.0285653\tvalid_1's mape: 0.0349835\n",
"[700]\ttraining's mape: 0.0275356\tvalid_1's mape: 0.0346906\n",
"[800]\ttraining's mape: 0.0265643\tvalid_1's mape: 0.034453\n",
"[900]\ttraining's mape: 0.0256515\tvalid_1's mape: 0.0343369\n",
"[1000]\ttraining's mape: 0.0248963\tvalid_1's mape: 0.0342941\n",
"[1100]\ttraining's mape: 0.0241464\tvalid_1's mape: 0.0342504\n",
"Early stopping, best iteration is:\n",
"[1076]\ttraining's mape: 0.0243176\tvalid_1's mape: 0.0342394\n",
"Predict 1/2\n",
"Predict 2/2\n",
"model cv score 3: MAPE 1.984561224480483 \n",
"Started model fold 4/5\n",
"Prep LGB\n",
"Train LGB\n",
"Training until validation scores don't improve for 100 rounds.\n",
"[100]\ttraining's mape: 0.0508197\tvalid_1's mape: 0.0532779\n",
"[200]\ttraining's mape: 0.0389338\tvalid_1's mape: 0.0417905\n",
"[300]\ttraining's mape: 0.0340717\tvalid_1's mape: 0.0377237\n",
"[400]\ttraining's mape: 0.0316728\tvalid_1's mape: 0.0361663\n",
"[500]\ttraining's mape: 0.030061\tvalid_1's mape: 0.0354495\n",
"[600]\ttraining's mape: 0.0288324\tvalid_1's mape: 0.0350795\n",
"[700]\ttraining's mape: 0.027765\tvalid_1's mape: 0.0347992\n",
"[800]\ttraining's mape: 0.0267837\tvalid_1's mape: 0.034639\n",
"[900]\ttraining's mape: 0.0259105\tvalid_1's mape: 0.0345784\n",
"[1000]\ttraining's mape: 0.0251395\tvalid_1's mape: 0.0345314\n",
"[1100]\ttraining's mape: 0.0244009\tvalid_1's mape: 0.0344845\n",
"[1200]\ttraining's mape: 0.0236851\tvalid_1's mape: 0.0344376\n",
"[1300]\ttraining's mape: 0.023003\tvalid_1's mape: 0.0344201\n",
"Early stopping, best iteration is:\n",
"[1242]\ttraining's mape: 0.0233844\tvalid_1's mape: 0.0343925\n",
"Predict 1/2\n",
"Predict 2/2\n",
"model cv score 4: MAPE 2.0065877027372334 \n",
"Started model fold 5/5\n",
"Prep LGB\n",
"Train LGB\n",
"Training until validation scores don't improve for 100 rounds.\n",
"[100]\ttraining's mape: 0.0512045\tvalid_1's mape: 0.0505437\n",
"[200]\ttraining's mape: 0.0391113\tvalid_1's mape: 0.0395823\n",
"[300]\ttraining's mape: 0.0342194\tvalid_1's mape: 0.0357116\n",
"[400]\ttraining's mape: 0.031724\tvalid_1's mape: 0.0341973\n",
"[500]\ttraining's mape: 0.0299868\tvalid_1's mape: 0.0334058\n",
"[600]\ttraining's mape: 0.0287222\tvalid_1's mape: 0.0331236\n",
"[700]\ttraining's mape: 0.0276685\tvalid_1's mape: 0.0329757\n",
"[800]\ttraining's mape: 0.0267437\tvalid_1's mape: 0.0328425\n",
"[900]\ttraining's mape: 0.0259025\tvalid_1's mape: 0.0327061\n",
"[1000]\ttraining's mape: 0.0251412\tvalid_1's mape: 0.0326443\n",
"[1100]\ttraining's mape: 0.0243883\tvalid_1's mape: 0.0325874\n",
"[1200]\ttraining's mape: 0.0236792\tvalid_1's mape: 0.0325517\n",
"[1300]\ttraining's mape: 0.0230229\tvalid_1's mape: 0.0324836\n",
"[1400]\ttraining's mape: 0.0223587\tvalid_1's mape: 0.0324813\n",
"Early stopping, best iteration is:\n",
"[1316]\ttraining's mape: 0.0229139\tvalid_1's mape: 0.0324729\n",
"Predict 1/2\n",
"Predict 2/2\n",
"model cv score 5: MAPE 2.0231157359919614 \n",
"model cv MAPE scores : [2.1235148650583446, 2.07144347354752, 1.984561224480483, 2.0065877027372334, 2.0231157359919614]\n",
"model cv mean MAPE score : 2.0418446003631088\n",
"model cv std MAPE score : 2.0418446003631088\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "qx1fdRc4ZRMG",
"colab_type": "code",
"outputId": "c8a691c5-2ff6-49f1-f758-b2e59887c984",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"cell_type": "code",
"source": [
"print(mean_absolute_percentage_error(np.ravel(train_labels), np.ravel(np.expm1(results['train']))))  # flatten the (n, 1) predictions so they pair row-by-row; expm1 inverts log(y + 1)"
],
"execution_count": 84,
"outputs": [
{
"output_type": "stream",
"text": [
"2.0428334957762653\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "siX_iqCmXqHd",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"preds = np.expm1(results['test'])  # labels were transformed with log(y + 1), so the inverse is exp(p) - 1"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "WadT7kvCXsxv",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv', index_col=0)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "t7DNfDKJXuzt",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# confirm everything is in the right order\n",
"assert np.all(test_features.index == submission_format.index)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "R52Mqb7bXxJr",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"my_submission = pd.DataFrame(data=preds,\n",
" columns=submission_format.columns,\n",
" index=submission_format.index)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "GFM6mmeAXzaw",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"my_submission.to_csv('submission.csv')"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "BvceqrBlX1G8",
"colab_type": "code",
"outputId": "97cfa3f0-c78d-45c7-ad32-2bfa5fef5b14",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 194
}
},
"cell_type": "code",
"source": [
"!head submission.csv"
],
"execution_count": 60,
"outputs": [
{
"output_type": "stream",
"text": [
"process_id,final_rinse_total_turbidity_liter\n",
"20000,1115778.3324606512\n",
"20006,1173079.0674722453\n",
"20007,1225419.4915231618\n",
"20009,973443.0059577588\n",
"20010,663299.0118895936\n",
"20012,681235.0197957861\n",
"20013,96374.07507977533\n",
"20015,200744.15204852956\n",
"20020,691260.0870321316\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "U50Tpz9VwYLY",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"importance = results['importance'].groupby('feature').mean().reset_index()[['feature', 'importance']]"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "QUprjvrqX262",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1969
},
"outputId": "5e90b226-1b55-4fe0-fa1f-24dcdc1389a4"
},
"cell_type": "code",
"source": [
"importance.sort_values('importance', ascending=False)"
],
"execution_count": 76,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>feature</th>\n",
" <th>importance</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>(return_turbidity, &lt;lambda&gt;)</td>\n",
" <td>1272.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>(return_turbidity, mean)</td>\n",
" <td>595.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>(return_turbidity, min)</td>\n",
" <td>567.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>55</th>\n",
" <td>(supply_pressure, min)</td>\n",
" <td>500.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>(return_turbidity, max)</td>\n",
" <td>493.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>(supply_flow, max)</td>\n",
" <td>483.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>(return_temperature, max)</td>\n",
" <td>478.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>(supply_flow, std)</td>\n",
" <td>477.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>(object_low_level, mean)</td>\n",
" <td>462.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>(return_temperature, &lt;lambda&gt;)</td>\n",
" <td>459.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>(supply_flow, mean)</td>\n",
" <td>446.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>(return_conductivity, max)</td>\n",
" <td>438.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>70</th>\n",
" <td>(tank_level_diff12, &lt;lambda&gt;)</td>\n",
" <td>416.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>(return_flow, &lt;lambda&gt;)</td>\n",
" <td>399.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>(return_flow, max)</td>\n",
" <td>397.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>(supply_flow, &lt;lambda&gt;)</td>\n",
" <td>396.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>52</th>\n",
" <td>(supply_pressure, &lt;lambda&gt;)</td>\n",
" <td>382.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>(object_low_level, std)</td>\n",
" <td>377.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>(return_conductivity, mean)</td>\n",
" <td>360.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>88</th>\n",
" <td>(tank_lsh_caustic, &lt;lambda&gt;)</td>\n",
" <td>352.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>53</th>\n",
" <td>(supply_pressure, max)</td>\n",
" <td>349.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>(return_temperature, std)</td>\n",
" <td>343.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>(return_temperature, min)</td>\n",
" <td>342.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>(return_turbidity, std)</td>\n",
" <td>339.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>(return_conductivity, &lt;lambda&gt;)</td>\n",
" <td>332.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>(return_flow, min)</td>\n",
" <td>329.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>118</th>\n",
" <td>pipeline_L9</td>\n",
" <td>325.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>(tank_temp_diff23, std)</td>\n",
" <td>323.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>58</th>\n",
" <td>(supply_pump, mean)</td>\n",
" <td>321.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>(return_flow, std)</td>\n",
" <td>309.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>(supply_clean_water, &lt;lambda&gt;)</td>\n",
" <td>63.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>90</th>\n",
" <td>(tank_lsh_caustic, std)</td>\n",
" <td>57.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>112</th>\n",
" <td>pipeline_L2</td>\n",
" <td>39.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116</th>\n",
" <td>pipeline_L7</td>\n",
" <td>37.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>pipeline_L8</td>\n",
" <td>24.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>(supply_caustic, &lt;lambda&gt;)</td>\n",
" <td>20.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>(object_low_level, &lt;lambda&gt;)</td>\n",
" <td>15.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>pipeline_L3</td>\n",
" <td>11.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107</th>\n",
" <td>num_phases</td>\n",
" <td>9.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>(return_caustic, &lt;lambda&gt;)</td>\n",
" <td>7.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108</th>\n",
" <td>pipeline_L1</td>\n",
" <td>6.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>114</th>\n",
" <td>pipeline_L4</td>\n",
" <td>2.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>115</th>\n",
" <td>pipeline_L6</td>\n",
" <td>1.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110</th>\n",
" <td>pipeline_L11</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>109</th>\n",
" <td>pipeline_L10</td>\n",
" <td>0.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>(return_acid, &lt;lambda&gt;)</td>\n",
" <td>0.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>111</th>\n",
" <td>pipeline_L12</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>(supply_pump, &lt;lambda&gt;)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>(return_drain, &lt;lambda&gt;)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>(tank_lsh_pre_rinse, std)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>(tank_lsh_pre_rinse, mean)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>94</th>\n",
" <td>(tank_lsh_pre_rinse, &lt;lambda&gt;)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>93</th>\n",
" <td>(tank_lsh_clean_water, std)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>91</th>\n",
" <td>(tank_lsh_clean_water, &lt;lambda&gt;)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>87</th>\n",
" <td>(tank_lsh_acid, std)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>(tank_lsh_acid, mean)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>85</th>\n",
" <td>(tank_lsh_acid, &lt;lambda&gt;)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>(return_recovery_water, &lt;lambda&gt;)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>(supply_pre_rinse, &lt;lambda&gt;)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>92</th>\n",
" <td>(tank_lsh_clean_water, mean)</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>119 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" feature importance\n",
"30 (return_turbidity, <lambda>) 1272.2\n",
"32 (return_turbidity, mean) 595.0\n",
"33 (return_turbidity, min) 567.4\n",
"55 (supply_pressure, min) 500.2\n",
"31 (return_turbidity, max) 493.8\n",
"45 (supply_flow, max) 483.8\n",
"26 (return_temperature, max) 478.8\n",
"48 (supply_flow, std) 477.6\n",
"1 (object_low_level, mean) 462.6\n",
"25 (return_temperature, <lambda>) 459.6\n",
"46 (supply_flow, mean) 446.8\n",
"10 (return_conductivity, max) 438.2\n",
"70 (tank_level_diff12, <lambda>) 416.8\n",
"17 (return_flow, <lambda>) 399.6\n",
"18 (return_flow, max) 397.4\n",
"44 (supply_flow, <lambda>) 396.4\n",
"52 (supply_pressure, <lambda>) 382.6\n",
"2 (object_low_level, std) 377.2\n",
"11 (return_conductivity, mean) 360.8\n",
"88 (tank_lsh_caustic, <lambda>) 352.6\n",
"53 (supply_pressure, max) 349.0\n",
"29 (return_temperature, std) 343.6\n",
"28 (return_temperature, min) 342.2\n",
"34 (return_turbidity, std) 339.6\n",
"9 (return_conductivity, <lambda>) 332.2\n",
"20 (return_flow, min) 329.0\n",
"118 pipeline_L9 325.2\n",
"106 (tank_temp_diff23, std) 323.8\n",
"58 (supply_pump, mean) 321.4\n",
"21 (return_flow, std) 309.0\n",
".. ... ...\n",
"41 (supply_clean_water, <lambda>) 63.2\n",
"90 (tank_lsh_caustic, std) 57.0\n",
"112 pipeline_L2 39.2\n",
"116 pipeline_L7 37.8\n",
"117 pipeline_L8 24.6\n",
"38 (supply_caustic, <lambda>) 20.0\n",
"0 (object_low_level, <lambda>) 15.8\n",
"113 pipeline_L3 11.2\n",
"107 num_phases 9.6\n",
"6 (return_caustic, <lambda>) 7.8\n",
"108 pipeline_L1 6.8\n",
"114 pipeline_L4 2.2\n",
"115 pipeline_L6 1.6\n",
"110 pipeline_L11 1.0\n",
"109 pipeline_L10 0.4\n",
"3 (return_acid, <lambda>) 0.4\n",
"111 pipeline_L12 0.0\n",
"57 (supply_pump, <lambda>) 0.0\n",
"14 (return_drain, <lambda>) 0.0\n",
"96 (tank_lsh_pre_rinse, std) 0.0\n",
"95 (tank_lsh_pre_rinse, mean) 0.0\n",
"94 (tank_lsh_pre_rinse, <lambda>) 0.0\n",
"93 (tank_lsh_clean_water, std) 0.0\n",
"91 (tank_lsh_clean_water, <lambda>) 0.0\n",
"87 (tank_lsh_acid, std) 0.0\n",
"86 (tank_lsh_acid, mean) 0.0\n",
"85 (tank_lsh_acid, <lambda>) 0.0\n",
"22 (return_recovery_water, <lambda>) 0.0\n",
"49 (supply_pre_rinse, <lambda>) 0.0\n",
"92 (tank_lsh_clean_water, mean) 0.0\n",
"\n",
"[119 rows x 2 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 76
}
]
},
{
"metadata": {
"id": "DfByIcRvvjYL",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "SQqUzt8EwG0l",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
""
],
"execution_count": 0,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment