Skip to content

Instantly share code, notes, and snippets.

@wassname
Last active February 13, 2018 03:28
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wassname/e0d8fad125dcd7702091390e9d5f45f0 to your computer and use it in GitHub Desktop.
Save wassname/e0d8fad125dcd7702091390e9d5f45f0 to your computer and use it in GitHub Desktop.
starter colab jupyter notebook for the hydrosaver competition
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "hydrosaver.ipynb",
"version": "0.3.2",
"views": {},
"default_view": {},
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"metadata": {
"id": "ouWjqOFAxk3G",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 4
}
],
"base_uri": "https://localhost:8080/",
"height": 600
},
"outputId": "7507890c-e73e-45bd-a287-dfa012032d9b",
"executionInfo": {
"status": "ok",
"timestamp": 1517110247838,
"user_tz": -480,
"elapsed": 6020,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl \n",
"!pip install http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl \n",
"!pip install xgboost tpot pandas-profiling seaborn torchvision tqdm"
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: torch==0.3.0.post4 from http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from torch==0.3.0.post4)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch==0.3.0.post4)\n",
"Requirement already satisfied: xgboost in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: tpot in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: pandas-profiling in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: seaborn in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: torchvision in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from xgboost)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from xgboost)\n",
"Requirement already satisfied: deap>=1.0 in /usr/local/lib/python3.6/dist-packages (from tpot)\n",
"Requirement already satisfied: update-checker>=0.16 in /usr/local/lib/python3.6/dist-packages (from tpot)\n",
"Requirement already satisfied: scikit-learn>=0.18.1 in /usr/local/lib/python3.6/dist-packages (from tpot)\n",
"Requirement already satisfied: stopit>=1.1.1 in /usr/local/lib/python3.6/dist-packages (from tpot)\n",
"Requirement already satisfied: pandas>=0.20.2 in /usr/local/lib/python3.6/dist-packages (from tpot)\n",
"Requirement already satisfied: matplotlib>=1.4 in /usr/local/lib/python3.6/dist-packages (from pandas-profiling)\n",
"Requirement already satisfied: jinja2>=2.8 in /usr/local/lib/python3.6/dist-packages (from pandas-profiling)\n",
"Requirement already satisfied: six>=1.9 in /usr/local/lib/python3.6/dist-packages (from pandas-profiling)\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from torchvision)\n",
"Requirement already satisfied: pillow>=4.1.1 in /usr/local/lib/python3.6/dist-packages (from torchvision)\n",
"Requirement already satisfied: requests>=2.3.0 in /usr/local/lib/python3.6/dist-packages (from update-checker>=0.16->tpot)\n",
"Requirement already satisfied: python-dateutil>=2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.20.2->tpot)\n",
"Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.6/dist-packages (from pandas>=0.20.2->tpot)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.4->pandas-profiling)\n",
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.4->pandas-profiling)\n",
"Requirement already satisfied: MarkupSafe in /usr/local/lib/python3.6/dist-packages (from jinja2>=2.8->pandas-profiling)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from torch->torchvision)\n",
"Requirement already satisfied: idna<2.7,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n",
"Requirement already satisfied: urllib3<1.23,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n",
"Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "iCu7iIqOOBwg",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "cOUoN4Iytsl6",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "e5662991-d80f-42c4-998c-ee969542f125",
"executionInfo": {
"status": "ok",
"timestamp": 1517111898602,
"user_tz": -480,
"elapsed": 984,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"%pylab inline\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sn\n",
"import os\n",
"from tqdm import tqdm"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "MYHMWSswtpMW",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"eps = 1e-6\n",
"seed = 42\n",
"np.random.seed(seed)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "P5QboV3_vVbt",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "DG73KWmRvvqD",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# Download data"
]
},
{
"metadata": {
"id": "i_S6eqlBu0Ti",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# from https://stackoverflow.com/a/39225039/221742\n",
"import requests\n",
"\n",
"def download_file_from_google_drive(id, destination):\n",
" def get_confirm_token(response):\n",
" for key, value in response.cookies.items():\n",
" if key.startswith('download_warning'):\n",
" return value\n",
"\n",
" return None\n",
"\n",
" def save_response_content(response, destination):\n",
" CHUNK_SIZE = 32768\n",
"\n",
" with open(destination, \"wb\") as f:\n",
" for chunk in response.iter_content(CHUNK_SIZE):\n",
" if chunk: # filter out keep-alive new chunks\n",
" f.write(chunk)\n",
"\n",
" URL = \"https://docs.google.com/uc?export=download\"\n",
"\n",
" session = requests.Session()\n",
"\n",
" response = session.get(URL, params = { 'id' : id }, stream = True)\n",
" token = get_confirm_token(response)\n",
"\n",
" if token:\n",
" params = { 'id' : id, 'confirm' : token }\n",
" response = session.get(URL, params = params, stream = True)\n",
"\n",
" save_response_content(response, destination) \n",
" \n",
"if not os.path.isdir('data/original'):\n",
" os.makedirs('data/original')\n",
"download_file_from_google_drive('15BqAMEBsTjAzT2eJXED-zA1pdHpGWZLl', './data/original/train.csv')\n",
"download_file_from_google_drive('1Xi_lLCKTsgSNECerpIPhQPzUCtmutDeS', './data/original/publishable_test_set.csv')\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "Wgr_5JJDv9A0",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# Load data"
]
},
{
"metadata": {
"id": "MjzCmx7ZtzRN",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# So we have some unique NaN values: 'No Data', 'Bad Input', etc. We also have date index col\n",
"df_train_val = pd.read_csv('./data/original/train.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])\n",
"df_train_val = df_train_val.dropna(axis=1, how='all') # drop the columns that are all NaN's\n",
"df_train_val = df_train_val.resample('1T').first()\n",
"df_train_val = df_train_val.drop('DIC88023.PV', 1)\n",
"\n",
"df_test = pd.read_csv('./data/original/publishable_test_set.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])\n",
"df_test = df_test.dropna(axis=1, how='all') # drop the columns that are all NaN's\n",
"\n",
"y_train_val = df_train_val.target\n",
"x_train_val = df_train_val.drop('target', 1) # We don't want the answer to be in the input data\n",
"\n",
"x_test = df_test\n",
"\n",
"# normalize the input columns\n",
"x_mean = x_train_val.mean()\n",
"x_std = x_train_val.std()\n",
"\n",
"x_train_val = (x_train_val - x_mean)/(x_std + eps)\n",
"x_test = (x_test - x_mean)/(x_std + eps)\n",
"\n",
"# TODO I may want to normalize y too\n",
"\n",
"print('mean', x_mean)\n",
"print('std', x_std)\n",
"\n",
"# TPOT wont accept NaNs, so we either replace or drop\n",
"# Another approach would be to use unique numbers or extra columns for this\n",
"# Since we've normalized it, 0 is the nothing value. So let's use that\n",
"\n",
"\n",
"x_train_val = x_train_val.replace(np.nan, 0)\n",
"y_train_val = y_train_val.replace(np.nan, 0)\n",
"x_test = x_test.replace(np.nan, 0)\n",
"\n",
"# since it's a timeseries the validation will be in the future\n",
"val_split_in = int(len(df_train_val.index)*0.85)\n",
"x_val = x_train_val[val_split_in:]\n",
"x_train = x_train_val[:val_split_in]\n",
"y_val = y_train_val[val_split_in:]\n",
"y_train = y_train_val[:val_split_in]\n",
"\n",
"# convert to numpy\n",
"X_train = x_train.as_matrix()\n",
"y_train = y_train.as_matrix()\n",
"X_val = x_val.as_matrix()\n",
"y_val = y_val.as_matrix()\n",
"X_test = x_test.as_matrix()"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "DOW31bu1LmCZ",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "4B2eXef2LmTq",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# Have a look into the data"
]
},
{
"metadata": {
"id": "eK3sF_pewzCe",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"df_train_val.info()"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "ixLBMdpYtzlr",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 333
},
"outputId": "677c9e2a-7995-448d-d8bb-97260d6bcf29",
"executionInfo": {
"status": "ok",
"timestamp": 1517110268232,
"user_tz": -480,
"elapsed": 1591,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"df_train_val.describe()"
],
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>WQI8100XCL1.CPV</th>\n",
" <th>XI84201.PV</th>\n",
" <th>XI84202.PV</th>\n",
" <th>XI84123.PV</th>\n",
" <th>XI84124.PV</th>\n",
" <th>XI84125.PV</th>\n",
" <th>FX87211.CPV1</th>\n",
" <th>FIC87211.PV</th>\n",
" <th>FIC87211.SV</th>\n",
" <th>FX87211.P01</th>\n",
" <th>...</th>\n",
" <th>NIC88002.PV</th>\n",
" <th>PIC88007.PV</th>\n",
" <th>LIC88006.PV</th>\n",
" <th>AIC88055.PV</th>\n",
" <th>FIC88022.PV</th>\n",
" <th>DIC88023.PV</th>\n",
" <th>SI88033.PV</th>\n",
" <th>SI88034.PV</th>\n",
" <th>MQI88024.CPV</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>567467.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568872.000000</td>\n",
" <td>568868.000000</td>\n",
" <td>568864.000000</td>\n",
" <td>568748.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568748.000000</td>\n",
" <td>...</td>\n",
" <td>568748.000000</td>\n",
" <td>568748.000000</td>\n",
" <td>568748.000000</td>\n",
" <td>568747.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568748.000000</td>\n",
" <td>568873.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>702.185281</td>\n",
" <td>105.143377</td>\n",
" <td>78.833709</td>\n",
" <td>0.046374</td>\n",
" <td>4.729106</td>\n",
" <td>23.870133</td>\n",
" <td>701.753521</td>\n",
" <td>4099.623820</td>\n",
" <td>4369.277083</td>\n",
" <td>21.063556</td>\n",
" <td>...</td>\n",
" <td>24.874992</td>\n",
" <td>8.484779</td>\n",
" <td>59.188180</td>\n",
" <td>35.818553</td>\n",
" <td>828.305065</td>\n",
" <td>52.464238</td>\n",
" <td>39.967696</td>\n",
" <td>23.986799</td>\n",
" <td>688.014555</td>\n",
" <td>52.463664</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>223.248174</td>\n",
" <td>39.179601</td>\n",
" <td>15.586560</td>\n",
" <td>0.020848</td>\n",
" <td>2.002726</td>\n",
" <td>10.889995</td>\n",
" <td>222.411906</td>\n",
" <td>1270.383514</td>\n",
" <td>623.180916</td>\n",
" <td>2.717285</td>\n",
" <td>...</td>\n",
" <td>7.449179</td>\n",
" <td>5.059790</td>\n",
" <td>31.757723</td>\n",
" <td>32.317187</td>\n",
" <td>241.930273</td>\n",
" <td>12.142946</td>\n",
" <td>34.175055</td>\n",
" <td>33.253739</td>\n",
" <td>212.166525</td>\n",
" <td>12.142832</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>-431.185300</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>-319.424500</td>\n",
" <td>-141.249900</td>\n",
" <td>0.000000</td>\n",
" <td>15.000000</td>\n",
" <td>...</td>\n",
" <td>-0.305536</td>\n",
" <td>-0.660624</td>\n",
" <td>0.100289</td>\n",
" <td>-0.185952</td>\n",
" <td>-8.621460</td>\n",
" <td>-1.073746</td>\n",
" <td>-0.091523</td>\n",
" <td>-0.042978</td>\n",
" <td>-19.506880</td>\n",
" <td>-1.073746</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>704.654800</td>\n",
" <td>85.875800</td>\n",
" <td>76.717600</td>\n",
" <td>0.043750</td>\n",
" <td>4.615610</td>\n",
" <td>23.181300</td>\n",
" <td>705.026125</td>\n",
" <td>3963.361000</td>\n",
" <td>4000.860000</td>\n",
" <td>19.000000</td>\n",
" <td>...</td>\n",
" <td>20.495800</td>\n",
" <td>4.414151</td>\n",
" <td>34.129130</td>\n",
" <td>-0.042471</td>\n",
" <td>823.644000</td>\n",
" <td>53.834340</td>\n",
" <td>0.061580</td>\n",
" <td>0.043636</td>\n",
" <td>684.447975</td>\n",
" <td>53.834050</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>771.509900</td>\n",
" <td>104.959000</td>\n",
" <td>80.717500</td>\n",
" <td>0.053122</td>\n",
" <td>5.664090</td>\n",
" <td>27.982300</td>\n",
" <td>771.734550</td>\n",
" <td>4365.759000</td>\n",
" <td>4366.150000</td>\n",
" <td>21.000000</td>\n",
" <td>...</td>\n",
" <td>25.944700</td>\n",
" <td>9.353214</td>\n",
" <td>45.527020</td>\n",
" <td>64.933850</td>\n",
" <td>893.697500</td>\n",
" <td>54.826150</td>\n",
" <td>61.135900</td>\n",
" <td>0.058294</td>\n",
" <td>748.692300</td>\n",
" <td>54.824650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>813.349800</td>\n",
" <td>117.191000</td>\n",
" <td>86.680900</td>\n",
" <td>0.059375</td>\n",
" <td>5.875010</td>\n",
" <td>30.394600</td>\n",
" <td>813.457500</td>\n",
" <td>4711.450000</td>\n",
" <td>4711.220000</td>\n",
" <td>23.000000</td>\n",
" <td>...</td>\n",
" <td>30.657177</td>\n",
" <td>12.259470</td>\n",
" <td>99.866600</td>\n",
" <td>64.951930</td>\n",
" <td>945.023000</td>\n",
" <td>55.838750</td>\n",
" <td>69.629300</td>\n",
" <td>66.137960</td>\n",
" <td>795.153125</td>\n",
" <td>55.838450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1172.474000</td>\n",
" <td>726.869000</td>\n",
" <td>100.000000</td>\n",
" <td>0.200000</td>\n",
" <td>15.812500</td>\n",
" <td>51.659300</td>\n",
" <td>1146.185000</td>\n",
" <td>15645.750000</td>\n",
" <td>14000.000000</td>\n",
" <td>35.000000</td>\n",
" <td>...</td>\n",
" <td>58.597800</td>\n",
" <td>36.658300</td>\n",
" <td>100.127200</td>\n",
" <td>64.983190</td>\n",
" <td>1303.840000</td>\n",
" <td>77.728490</td>\n",
" <td>99.626400</td>\n",
" <td>98.255230</td>\n",
" <td>1399.242000</td>\n",
" <td>77.728490</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" WQI8100XCL1.CPV XI84201.PV XI84202.PV XI84123.PV \\\n",
"count 567467.000000 568873.000000 568873.000000 568872.000000 \n",
"mean 702.185281 105.143377 78.833709 0.046374 \n",
"std 223.248174 39.179601 15.586560 0.020848 \n",
"min -431.185300 0.000000 0.000000 0.000000 \n",
"25% 704.654800 85.875800 76.717600 0.043750 \n",
"50% 771.509900 104.959000 80.717500 0.053122 \n",
"75% 813.349800 117.191000 86.680900 0.059375 \n",
"max 1172.474000 726.869000 100.000000 0.200000 \n",
"\n",
" XI84124.PV XI84125.PV FX87211.CPV1 FIC87211.PV \\\n",
"count 568868.000000 568864.000000 568748.000000 568873.000000 \n",
"mean 4.729106 23.870133 701.753521 4099.623820 \n",
"std 2.002726 10.889995 222.411906 1270.383514 \n",
"min 0.000000 0.000000 -319.424500 -141.249900 \n",
"25% 4.615610 23.181300 705.026125 3963.361000 \n",
"50% 5.664090 27.982300 771.734550 4365.759000 \n",
"75% 5.875010 30.394600 813.457500 4711.450000 \n",
"max 15.812500 51.659300 1146.185000 15645.750000 \n",
"\n",
" FIC87211.SV FX87211.P01 ... NIC88002.PV \\\n",
"count 568873.000000 568748.000000 ... 568748.000000 \n",
"mean 4369.277083 21.063556 ... 24.874992 \n",
"std 623.180916 2.717285 ... 7.449179 \n",
"min 0.000000 15.000000 ... -0.305536 \n",
"25% 4000.860000 19.000000 ... 20.495800 \n",
"50% 4366.150000 21.000000 ... 25.944700 \n",
"75% 4711.220000 23.000000 ... 30.657177 \n",
"max 14000.000000 35.000000 ... 58.597800 \n",
"\n",
" PIC88007.PV LIC88006.PV AIC88055.PV FIC88022.PV \\\n",
"count 568748.000000 568748.000000 568747.000000 568873.000000 \n",
"mean 8.484779 59.188180 35.818553 828.305065 \n",
"std 5.059790 31.757723 32.317187 241.930273 \n",
"min -0.660624 0.100289 -0.185952 -8.621460 \n",
"25% 4.414151 34.129130 -0.042471 823.644000 \n",
"50% 9.353214 45.527020 64.933850 893.697500 \n",
"75% 12.259470 99.866600 64.951930 945.023000 \n",
"max 36.658300 100.127200 64.983190 1303.840000 \n",
"\n",
" DIC88023.PV SI88033.PV SI88034.PV MQI88024.CPV \\\n",
"count 568873.000000 568873.000000 568873.000000 568748.000000 \n",
"mean 52.464238 39.967696 23.986799 688.014555 \n",
"std 12.142946 34.175055 33.253739 212.166525 \n",
"min -1.073746 -0.091523 -0.042978 -19.506880 \n",
"25% 53.834340 0.061580 0.043636 684.447975 \n",
"50% 54.826150 61.135900 0.058294 748.692300 \n",
"75% 55.838750 69.629300 66.137960 795.153125 \n",
"max 77.728490 99.626400 98.255230 1399.242000 \n",
"\n",
" target \n",
"count 568873.000000 \n",
"mean 52.463664 \n",
"std 12.142832 \n",
"min -1.073746 \n",
"25% 53.834050 \n",
"50% 54.824650 \n",
"75% 55.838450 \n",
"max 77.728490 \n",
"\n",
"[8 rows x 23 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"metadata": {
"id": "LSicdcLVLp7i",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# You can use pandas profiling to get an overview of the data\n",
"import pandas_profiling\n",
"profile = pandas_profiling.ProfileReport(df_train_val[:2000])\n",
"profile.to_file(outputfile=\"/tmp/myoutputfile.html\")\n",
"profile"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "X9gXoF9CxTst",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# TPOT!\n",
"\n",
"TPOT is an automatic machine learning library that uses genetic algorithms to try different generations of scikit-learn algorithms.\n",
"\n",
"link: https://epistasislab.github.io/tpot/"
]
},
{
"metadata": {
"id": "_26g069T5KCG",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "eb67d081-78f6-42e4-c8e3-35d923022724",
"executionInfo": {
"status": "ok",
"timestamp": 1517110277832,
"user_tz": -480,
"elapsed": 779,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"# Check data for TPOT compatability\n",
"from tpot.base import check_X_y\n",
"check_X_y(X_train, y_train, accept_sparse=True)\n",
"check_X_y(X_val, y_val, accept_sparse=True)\n",
"'ok'"
],
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'ok'"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"metadata": {
"id": "pwEFjEGKYHEV",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# Ensure the it respects causality, by only giving each sample access to a window of past data\n",
"# make padded sequences, we need to make the data in shape (batch, window_of_timesteps, features)\n",
"\n",
"def timeseries_to_seq(x, window=3):\n",
" \"\"\"\n",
" Inputs:\n",
" - x: shape (timeseries, features)\n",
" - window: e.g. 3\n",
" Outputs:\n",
"- y: shape (window, batch, features)\n",
" \"\"\"\n",
" x_pad = np.pad(x, [[window,0],[0,0]], mode='constant')\n",
" y = np.stack([x_pad[i:i+window] for i in range(len(x))], axis=1)\n",
" return y"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "mijUpEIFYOza",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "ef055268-8e12-4b3c-a06c-cc6d19a7949e",
"executionInfo": {
"status": "ok",
"timestamp": 1517110279696,
"user_tz": -480,
"elapsed": 644,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"# For now I will just run on a subset of the data, for speed!\n",
"subset = 200\n",
"window=60*3\n",
"x=X_train[:subset]\n",
"y_stacked=y_train[:subset]\n",
"print(x.shape)\n",
"X_train_stacked = timeseries_to_seq(x, window=window).reshape((x.shape[0], -1))"
],
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"text": [
"(200, 22)\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "s1h_9IETxU0d",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"from tpot import TPOTRegressor\n",
"# A quick run of TPOT with small population and short number of generation\n",
"# About 25 minutes to run\n",
"tpot = TPOTRegressor(generations=3, population_size=10, verbosity=3)\n",
"tpot.fit(X_train_stacked, y_stacked)\n",
"tpot.export('tpot_hydrosaver_export.py')"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "lpw8PhAS59EC",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "18c73a74-802d-48ae-ce6c-ae46ce8a694d",
"executionInfo": {
"status": "ok",
"timestamp": 1517111729035,
"user_tz": -480,
"elapsed": 810,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"tpot.export('tpot_hydrosaver_export.py')"
],
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {
"tags": []
},
"execution_count": 16
}
]
},
{
"metadata": {
"id": "UFeZKx9j27Fx",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 299
},
"outputId": "73c3978c-e876-45cd-9317-1e42b2ff3c56",
"executionInfo": {
"status": "ok",
"timestamp": 1517111731056,
"user_tz": -480,
"elapsed": 1665,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"# What's the pipeline it saved?\n",
"# In this case it found that LassoLarsCV(normalize=False) performed best\n",
"!cat tpot_hydrosaver_export.py"
],
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"text": [
"import numpy as np\r\n",
"import pandas as pd\r\n",
"from sklearn.linear_model import LassoLarsCV\r\n",
"from sklearn.model_selection import train_test_split\r\n",
"\r\n",
"# NOTE: Make sure that the class is labeled 'target' in the data file\r\n",
"tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\r\n",
"features = tpot_data.drop('target', axis=1).values\r\n",
"training_features, testing_features, training_target, testing_target = \\\r\n",
" train_test_split(features, tpot_data['target'].values, random_state=42)\r\n",
"\r\n",
"# Score on the training set was:-0.00011788279235816052\r\n",
"exported_pipeline = LassoLarsCV(normalize=False)\r\n",
"\r\n",
"exported_pipeline.fit(training_features, training_target)\r\n",
"results = exported_pipeline.predict(testing_features)\r\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "u8hGttzUwU3a",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# final score\n",
"def rmse(y_pred, y_true):\n",
" sqloss = (y_true-y_pred)**2\n",
" return np.sqrt(sqloss.mean())"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "Ypq2uShmIb3B",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "6ba65187-aaa7-4e2d-acb2-63fb8f7e4b88",
"executionInfo": {
"status": "ok",
"timestamp": 1517111877742,
"user_tz": -480,
"elapsed": 2417,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"X_val_stacked = timeseries_to_seq(X_val, window=window).reshape((X_val.shape[0], -1))\n",
"y_pred = tpot.predict(X_val_stacked)\n",
"score = rmse(y_pred, y_val)\n",
"score"
],
"execution_count": 27,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"3.8885540109338006"
]
},
"metadata": {
"tags": []
},
"execution_count": 27
}
]
},
{
"metadata": {
"id": "RfqDTdZCIb0X",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"X_test_stacked = timeseries_to_seq(X_test, window=window).reshape((X_test.shape[0], -1))\n",
"y_pred = tpot.predict(X_test_stacked)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "qhCNUxNExifF",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# save\n",
"s = pd.Series(y_pred, name='target')\n",
"assert len(s)==439140\n",
"\n",
"import datetime\n",
"ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')\n",
"\n",
"submission_file = 'submission_%s_score_%2.2f.csv'%(ts,score)\n",
"s.to_csv(submission_file, index=False, header=True, float_format='%2.9s')\n",
"print('upload file', submission_file)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "Fh1ceEwmiv3h",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# and download\n",
"import google\n",
"google.colab.files.download(submission_file)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "mvKjdQsK5mOz",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
""
],
"execution_count": 0,
"outputs": []
}
]
}
# -*- coding: utf-8 -*-
"""hydrosaver.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/notebook#fileId=1gs18AtviN2Y3jSsVF2rgprAtCA8Jnt_8
"""
# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
#!pip install http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
#!pip install xgboost tpot pandas-profiling seaborn torchvision tqdm
# %pylab inline
import numpy as np
import pandas as pd
import seaborn as sn
import os
from tqdm import tqdm
eps = 1e-6  # small constant added to denominators below to avoid division by zero
seed = 42  # fixed seed so runs are reproducible
np.random.seed(seed)  # seeds the global NumPy RNG
"""# Download data"""
# from https://stackoverflow.com/a/39225039/221742
import requests
def download_file_from_google_drive(id, destination):
    """Download a (possibly large) file from Google Drive by file id.

    For big files Drive interposes a virus-scan warning page and sets a
    confirmation cookie; when that cookie is present we re-request the
    file with the confirmation token so the real content streams down.

    Inputs:
    - id: the Google Drive file id
    - destination: local path the downloaded bytes are written to
    """
    URL = "https://docs.google.com/uc?export=download"
    CHUNK_SIZE = 32768

    def _confirm_token(resp):
        # Drive signals "needs confirmation" via a download_warning* cookie.
        for name, value in resp.cookies.items():
            if name.startswith('download_warning'):
                return value
        return None

    def _write_to_disk(resp, path):
        with open(path, "wb") as fh:
            for chunk in resp.iter_content(CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    fh.write(chunk)

    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = _confirm_token(response)
    if token:
        response = session.get(URL, params={'id': id, 'confirm': token}, stream=True)
    _write_to_disk(response, destination)
# Download the competition data once into ./data/original.
# exist_ok=True makes this idempotent (replaces the isdir() check).
os.makedirs('data/original', exist_ok=True)
download_file_from_google_drive('15BqAMEBsTjAzT2eJXED-zA1pdHpGWZLl', './data/original/train.csv')
download_file_from_google_drive('1Xi_lLCKTsgSNECerpIPhQPzUCtmutDeS', './data/original/publishable_test_set.csv')
"""# Load data"""
# So we have some unique NaN values: 'No Data', 'Bad Input', etc. We also have date index col
df_train_val = pd.read_csv('./data/original/train.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])
df_train_val = df_train_val.dropna(axis=1, how='all') # drop the columns that are all NaN's
df_train_val = df_train_val.resample('1T').first()
df_train_val = df_train_val.drop('DIC88023.PV', 1)
df_test = pd.read_csv('./data/original/publishable_test_set.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])
df_test = df_test.dropna(axis=1, how='all') # drop the columns that are all NaN's
y_train_val = df_train_val.target
x_train_val = df_train_val.drop('target', 1) # We don't want the answer to be in the input data
x_test = df_test
# normalize the input columns
x_mean = x_train_val.mean()
x_std = x_train_val.mean()
x_train_val = (x_train_val - x_mean)/(x_std + eps)
x_test = (x_test - x_mean)/(x_std + eps)
# TODO I may want to normalize y too
print('mean', x_mean)
print('std', x_std)
# TPOT won't accept NaNs, so we either replace or drop them.
# Another approach would be unique sentinel values or indicator columns.
# Since the inputs are standardised, 0 is the "nothing" value — use that.
x_train_val = x_train_val.fillna(0)
y_train_val = y_train_val.fillna(0)
x_test = x_test.fillna(0)
# Since it's a timeseries, the validation set must lie in the future:
# first 85% of rows train, last 15% validate (no shuffling).
val_split_in = int(len(df_train_val.index)*0.85)
x_val = x_train_val[val_split_in:]
x_train = x_train_val[:val_split_in]
y_val = y_train_val[val_split_in:]
y_train = y_train_val[:val_split_in]
# Convert to numpy. Uses .values instead of the deprecated
# DataFrame.as_matrix() (removed in modern pandas).
X_train = x_train.values
y_train = y_train.values
X_val = x_val.values
y_val = y_val.values
X_test = x_test.values
"""# Have look into the data"""
df_train_val.info()
df_train_val.describe()
# You can use pandas profiling to get an overview of the data
import pandas_profiling
profile = pandas_profiling.ProfileReport(df_train_val[:2000])
profile.to_file(outputfile="/tmp/myoutputfile.html")
profile
"""# TPOT!
TPOT is an automated machine learning library that uses genetic algorithms to try different generations of scikit-learn algorithms.
link: https://epistasislab.github.io/tpot/
"""
# Check the data for TPOT compatibility (raises if shapes/dtypes are off).
from tpot.base import check_X_y
for _X, _y in ((X_train, y_train), (X_val, y_val)):
    check_X_y(_X, _y, accept_sparse=True)
'ok'
# Ensure the it respects causality, by only giving each sample access to a window of past data
# make padded sequences, we need to make the data in shape (batch, window_of_timesteps, features)
def timeseries_to_seq(x, window=3):
    """Turn a 2-D timeseries into per-sample windows of past observations.

    Sample i gets the `window` rows *preceding* row i (zero-padded at the
    start of the series), so each sample sees strictly-past data only —
    this respects causality for forecasting.

    Inputs:
    - x: array of shape (timesteps, features)
    - window: number of past timesteps per sample, e.g. 3
    Outputs:
    - y: array of shape (batch, window, features)

    BUG FIX: the original stacked on axis=1, producing (window, batch,
    features); callers then reshape to (batch, -1), which interleaves rows
    from different samples. Stacking on axis=0 yields the intended
    (batch, window, features) layout, which flattens correctly per sample.
    """
    x_pad = np.pad(x, [[window, 0], [0, 0]], mode='constant')
    return np.stack([x_pad[i:i + window] for i in range(len(x))], axis=0)
# For now I will just run on a subset of the data, for speed!
subset = 200       # number of training rows to use
window = 60 * 3    # 3 hours of 1-minute samples per window
x_sub = X_train[:subset]
y_stacked = y_train[:subset]
print(x_sub.shape)
# Window each sample, then flatten to 2-D as sklearn-style estimators expect.
X_train_stacked = timeseries_to_seq(x_sub, window=window).reshape((x_sub.shape[0], -1))
from tpot import TPOTRegressor
# A quick TPOT run with a small population and few generations
# (about 25 minutes). Increase both for a serious search.
tpot = TPOTRegressor(generations=3, population_size=10, verbosity=3)
tpot.fit(X_train_stacked, y_stacked)
# Export the winning scikit-learn pipeline as a standalone script.
# (The original called export() twice in a row; once is enough.)
tpot.export('tpot_hydrosaver_export.py')
# What's the pipeline it saved?
# In this case it found that LassoLarsCV(normalize=False) performed best
#!cat tpot_hydrosaver_export.py
# final score
def rmse(y_pred, y_true):
    """Root-mean-square error between predictions and targets."""
    return np.sqrt(np.mean((y_true - y_pred) ** 2))
# Score the fitted pipeline on the held-out (future) validation period.
X_val_stacked = timeseries_to_seq(X_val, window=window).reshape((X_val.shape[0], -1))
score = rmse(tpot.predict(X_val_stacked), y_val)
score
# Predict the test period for the submission file.
X_test_stacked = timeseries_to_seq(X_test, window=window).reshape((X_test.shape[0], -1))
y_pred = tpot.predict(X_test_stacked)
# save
# BUG FIX: the original wrote pd.Series(y_submit, ...) but `y_submit` is
# never defined anywhere (NameError); the test-set predictions are `y_pred`.
s = pd.Series(y_pred, name='target')
assert len(s)==439140  # expected number of rows in the test set
import datetime
ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')
# Timestamped filename with the validation score embedded, for bookkeeping.
submission_file = 'submission_%s_score_%2.2f.csv'%(ts,score)
s.to_csv(submission_file, index=False, header=True, float_format='%2.9s')
print('upload file', submission_file)
# and download (Colab only)
# BUG FIX: `import google` only binds the namespace package and does not
# guarantee `google.colab.files` is loaded; import the submodule explicitly.
from google.colab import files
files.download(submission_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment