Skip to content

Instantly share code, notes, and snippets.

@wassname
Last active February 13, 2018 03:28
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save wassname/e0d8fad125dcd7702091390e9d5f45f0 to your computer and use it in GitHub Desktop.
Save wassname/e0d8fad125dcd7702091390e9d5f45f0 to your computer and use it in GitHub Desktop.
starter colab jupyter notebook for the hydrosaver competition
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "hydrosaver.ipynb",
"version": "0.3.2",
"views": {},
"default_view": {},
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"metadata": {
"id": "ouWjqOFAxk3G",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 4
}
],
"base_uri": "https://localhost:8080/",
"height": 600
},
"outputId": "7507890c-e73e-45bd-a287-dfa012032d9b",
"executionInfo": {
"status": "ok",
"timestamp": 1517110247838,
"user_tz": -480,
"elapsed": 6020,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl \n",
"!pip install http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl \n",
"!pip install xgboost tpot pandas-profiling seaborn torchvision tqdm"
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: torch==0.3.0.post4 from http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from torch==0.3.0.post4)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from torch==0.3.0.post4)\n",
"Requirement already satisfied: xgboost in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: tpot in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: pandas-profiling in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: seaborn in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: torchvision in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from xgboost)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from xgboost)\n",
"Requirement already satisfied: deap>=1.0 in /usr/local/lib/python3.6/dist-packages (from tpot)\n",
"Requirement already satisfied: update-checker>=0.16 in /usr/local/lib/python3.6/dist-packages (from tpot)\n",
"Requirement already satisfied: scikit-learn>=0.18.1 in /usr/local/lib/python3.6/dist-packages (from tpot)\n",
"Requirement already satisfied: stopit>=1.1.1 in /usr/local/lib/python3.6/dist-packages (from tpot)\n",
"Requirement already satisfied: pandas>=0.20.2 in /usr/local/lib/python3.6/dist-packages (from tpot)\n",
"Requirement already satisfied: matplotlib>=1.4 in /usr/local/lib/python3.6/dist-packages (from pandas-profiling)\n",
"Requirement already satisfied: jinja2>=2.8 in /usr/local/lib/python3.6/dist-packages (from pandas-profiling)\n",
"Requirement already satisfied: six>=1.9 in /usr/local/lib/python3.6/dist-packages (from pandas-profiling)\n",
"Requirement already satisfied: torch in /usr/local/lib/python3.6/dist-packages (from torchvision)\n",
"Requirement already satisfied: pillow>=4.1.1 in /usr/local/lib/python3.6/dist-packages (from torchvision)\n",
"Requirement already satisfied: requests>=2.3.0 in /usr/local/lib/python3.6/dist-packages (from update-checker>=0.16->tpot)\n",
"Requirement already satisfied: python-dateutil>=2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.20.2->tpot)\n",
"Requirement already satisfied: pytz>=2011k in /usr/local/lib/python3.6/dist-packages (from pandas>=0.20.2->tpot)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.4->pandas-profiling)\n",
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.4->pandas-profiling)\n",
"Requirement already satisfied: MarkupSafe in /usr/local/lib/python3.6/dist-packages (from jinja2>=2.8->pandas-profiling)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.6/dist-packages (from torch->torchvision)\n",
"Requirement already satisfied: idna<2.7,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n",
"Requirement already satisfied: urllib3<1.23,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n",
"Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.3.0->update-checker>=0.16->tpot)\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "iCu7iIqOOBwg",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "cOUoN4Iytsl6",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "e5662991-d80f-42c4-998c-ee969542f125",
"executionInfo": {
"status": "ok",
"timestamp": 1517111898602,
"user_tz": -480,
"elapsed": 984,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"%pylab inline\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sn\n",
"import os\n",
"from tqdm import tqdm"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "MYHMWSswtpMW",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"eps = 1e-6\n",
"seed = 42\n",
"np.random.seed(seed)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "P5QboV3_vVbt",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "DG73KWmRvvqD",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# Download data"
]
},
{
"metadata": {
"id": "i_S6eqlBu0Ti",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# from https://stackoverflow.com/a/39225039/221742\n",
"import requests\n",
"\n",
"def download_file_from_google_drive(id, destination):\n",
" def get_confirm_token(response):\n",
" for key, value in response.cookies.items():\n",
" if key.startswith('download_warning'):\n",
" return value\n",
"\n",
" return None\n",
"\n",
" def save_response_content(response, destination):\n",
" CHUNK_SIZE = 32768\n",
"\n",
" with open(destination, \"wb\") as f:\n",
" for chunk in response.iter_content(CHUNK_SIZE):\n",
" if chunk: # filter out keep-alive new chunks\n",
" f.write(chunk)\n",
"\n",
" URL = \"https://docs.google.com/uc?export=download\"\n",
"\n",
" session = requests.Session()\n",
"\n",
" response = session.get(URL, params = { 'id' : id }, stream = True)\n",
" token = get_confirm_token(response)\n",
"\n",
" if token:\n",
" params = { 'id' : id, 'confirm' : token }\n",
" response = session.get(URL, params = params, stream = True)\n",
"\n",
" save_response_content(response, destination) \n",
" \n",
"if not os.path.isdir('data/original'):\n",
" os.makedirs('data/original')\n",
"download_file_from_google_drive('15BqAMEBsTjAzT2eJXED-zA1pdHpGWZLl', './data/original/train.csv')\n",
"download_file_from_google_drive('1Xi_lLCKTsgSNECerpIPhQPzUCtmutDeS', './data/original/publishable_test_set.csv')\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "Wgr_5JJDv9A0",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# Load data"
]
},
{
"metadata": {
"id": "MjzCmx7ZtzRN",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# So we have some unique NaN values: 'No Data', 'Bad Input', etc. We also have date index col\n",
"df_train_val = pd.read_csv('./data/original/train.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])\n",
"df_train_val = df_train_val.dropna(axis=1, how='all') # drop the columns that are all NaN's\n",
"df_train_val = df_train_val.resample('1T').first()\n",
"df_train_val = df_train_val.drop('DIC88023.PV', 1)\n",
"\n",
"df_test = pd.read_csv('./data/original/publishable_test_set.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])\n",
"df_test = df_test.dropna(axis=1, how='all') # drop the columns that are all NaN's\n",
"\n",
"y_train_val = df_train_val.target\n",
"x_train_val = df_train_val.drop('target', 1) # We don't want the answer to be in the input data\n",
"\n",
"x_test = df_test\n",
"\n",
"# normalize the input columns\n",
"x_mean = x_train_val.mean()\n",
"x_std = x_train_val.std()\n",
"\n",
"x_train_val = (x_train_val - x_mean)/(x_std + eps)\n",
"x_test = (x_test - x_mean)/(x_std + eps)\n",
"\n",
"# TODO I may want to normalize y too\n",
"\n",
"print('mean', x_mean)\n",
"print('std', x_std)\n",
"\n",
"# TPOT wont accept NaNs, so we either replace or drop\n",
"# Another approach would be to use unique numbers or extra columns for this\n",
"# Since we've normalized it, 0 is the nothing value. So let's use that\n",
"\n",
"\n",
"x_train_val = x_train_val.replace(np.nan, 0)\n",
"y_train_val = y_train_val.replace(np.nan, 0)\n",
"x_test = x_test.replace(np.nan, 0)\n",
"\n",
"# since it's a timeseries the validation will be in the future\n",
"val_split_in = int(len(df_train_val.index)*0.85)\n",
"x_val = x_train_val[val_split_in:]\n",
"x_train = x_train_val[:val_split_in]\n",
"y_val = y_train_val[val_split_in:]\n",
"y_train = y_train_val[:val_split_in]\n",
"\n",
"# convert to numpy\n",
"X_train = x_train.as_matrix()\n",
"y_train = y_train.as_matrix()\n",
"X_val = x_val.as_matrix()\n",
"y_val = y_val.as_matrix()\n",
"X_test = x_test.as_matrix()"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "DOW31bu1LmCZ",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
""
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "4B2eXef2LmTq",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# Have a look into the data"
]
},
{
"metadata": {
"id": "eK3sF_pewzCe",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"df_train_val.info()"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "ixLBMdpYtzlr",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 333
},
"outputId": "677c9e2a-7995-448d-d8bb-97260d6bcf29",
"executionInfo": {
"status": "ok",
"timestamp": 1517110268232,
"user_tz": -480,
"elapsed": 1591,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"df_train_val.describe()"
],
"execution_count": 10,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>WQI8100XCL1.CPV</th>\n",
" <th>XI84201.PV</th>\n",
" <th>XI84202.PV</th>\n",
" <th>XI84123.PV</th>\n",
" <th>XI84124.PV</th>\n",
" <th>XI84125.PV</th>\n",
" <th>FX87211.CPV1</th>\n",
" <th>FIC87211.PV</th>\n",
" <th>FIC87211.SV</th>\n",
" <th>FX87211.P01</th>\n",
" <th>...</th>\n",
" <th>NIC88002.PV</th>\n",
" <th>PIC88007.PV</th>\n",
" <th>LIC88006.PV</th>\n",
" <th>AIC88055.PV</th>\n",
" <th>FIC88022.PV</th>\n",
" <th>DIC88023.PV</th>\n",
" <th>SI88033.PV</th>\n",
" <th>SI88034.PV</th>\n",
" <th>MQI88024.CPV</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>567467.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568872.000000</td>\n",
" <td>568868.000000</td>\n",
" <td>568864.000000</td>\n",
" <td>568748.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568748.000000</td>\n",
" <td>...</td>\n",
" <td>568748.000000</td>\n",
" <td>568748.000000</td>\n",
" <td>568748.000000</td>\n",
" <td>568747.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568873.000000</td>\n",
" <td>568748.000000</td>\n",
" <td>568873.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>702.185281</td>\n",
" <td>105.143377</td>\n",
" <td>78.833709</td>\n",
" <td>0.046374</td>\n",
" <td>4.729106</td>\n",
" <td>23.870133</td>\n",
" <td>701.753521</td>\n",
" <td>4099.623820</td>\n",
" <td>4369.277083</td>\n",
" <td>21.063556</td>\n",
" <td>...</td>\n",
" <td>24.874992</td>\n",
" <td>8.484779</td>\n",
" <td>59.188180</td>\n",
" <td>35.818553</td>\n",
" <td>828.305065</td>\n",
" <td>52.464238</td>\n",
" <td>39.967696</td>\n",
" <td>23.986799</td>\n",
" <td>688.014555</td>\n",
" <td>52.463664</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>223.248174</td>\n",
" <td>39.179601</td>\n",
" <td>15.586560</td>\n",
" <td>0.020848</td>\n",
" <td>2.002726</td>\n",
" <td>10.889995</td>\n",
" <td>222.411906</td>\n",
" <td>1270.383514</td>\n",
" <td>623.180916</td>\n",
" <td>2.717285</td>\n",
" <td>...</td>\n",
" <td>7.449179</td>\n",
" <td>5.059790</td>\n",
" <td>31.757723</td>\n",
" <td>32.317187</td>\n",
" <td>241.930273</td>\n",
" <td>12.142946</td>\n",
" <td>34.175055</td>\n",
" <td>33.253739</td>\n",
" <td>212.166525</td>\n",
" <td>12.142832</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>-431.185300</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>-319.424500</td>\n",
" <td>-141.249900</td>\n",
" <td>0.000000</td>\n",
" <td>15.000000</td>\n",
" <td>...</td>\n",
" <td>-0.305536</td>\n",
" <td>-0.660624</td>\n",
" <td>0.100289</td>\n",
" <td>-0.185952</td>\n",
" <td>-8.621460</td>\n",
" <td>-1.073746</td>\n",
" <td>-0.091523</td>\n",
" <td>-0.042978</td>\n",
" <td>-19.506880</td>\n",
" <td>-1.073746</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>704.654800</td>\n",
" <td>85.875800</td>\n",
" <td>76.717600</td>\n",
" <td>0.043750</td>\n",
" <td>4.615610</td>\n",
" <td>23.181300</td>\n",
" <td>705.026125</td>\n",
" <td>3963.361000</td>\n",
" <td>4000.860000</td>\n",
" <td>19.000000</td>\n",
" <td>...</td>\n",
" <td>20.495800</td>\n",
" <td>4.414151</td>\n",
" <td>34.129130</td>\n",
" <td>-0.042471</td>\n",
" <td>823.644000</td>\n",
" <td>53.834340</td>\n",
" <td>0.061580</td>\n",
" <td>0.043636</td>\n",
" <td>684.447975</td>\n",
" <td>53.834050</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>771.509900</td>\n",
" <td>104.959000</td>\n",
" <td>80.717500</td>\n",
" <td>0.053122</td>\n",
" <td>5.664090</td>\n",
" <td>27.982300</td>\n",
" <td>771.734550</td>\n",
" <td>4365.759000</td>\n",
" <td>4366.150000</td>\n",
" <td>21.000000</td>\n",
" <td>...</td>\n",
" <td>25.944700</td>\n",
" <td>9.353214</td>\n",
" <td>45.527020</td>\n",
" <td>64.933850</td>\n",
" <td>893.697500</td>\n",
" <td>54.826150</td>\n",
" <td>61.135900</td>\n",
" <td>0.058294</td>\n",
" <td>748.692300</td>\n",
" <td>54.824650</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>813.349800</td>\n",
" <td>117.191000</td>\n",
" <td>86.680900</td>\n",
" <td>0.059375</td>\n",
" <td>5.875010</td>\n",
" <td>30.394600</td>\n",
" <td>813.457500</td>\n",
" <td>4711.450000</td>\n",
" <td>4711.220000</td>\n",
" <td>23.000000</td>\n",
" <td>...</td>\n",
" <td>30.657177</td>\n",
" <td>12.259470</td>\n",
" <td>99.866600</td>\n",
" <td>64.951930</td>\n",
" <td>945.023000</td>\n",
" <td>55.838750</td>\n",
" <td>69.629300</td>\n",
" <td>66.137960</td>\n",
" <td>795.153125</td>\n",
" <td>55.838450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1172.474000</td>\n",
" <td>726.869000</td>\n",
" <td>100.000000</td>\n",
" <td>0.200000</td>\n",
" <td>15.812500</td>\n",
" <td>51.659300</td>\n",
" <td>1146.185000</td>\n",
" <td>15645.750000</td>\n",
" <td>14000.000000</td>\n",
" <td>35.000000</td>\n",
" <td>...</td>\n",
" <td>58.597800</td>\n",
" <td>36.658300</td>\n",
" <td>100.127200</td>\n",
" <td>64.983190</td>\n",
" <td>1303.840000</td>\n",
" <td>77.728490</td>\n",
" <td>99.626400</td>\n",
" <td>98.255230</td>\n",
" <td>1399.242000</td>\n",
" <td>77.728490</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>8 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" WQI8100XCL1.CPV XI84201.PV XI84202.PV XI84123.PV \\\n",
"count 567467.000000 568873.000000 568873.000000 568872.000000 \n",
"mean 702.185281 105.143377 78.833709 0.046374 \n",
"std 223.248174 39.179601 15.586560 0.020848 \n",
"min -431.185300 0.000000 0.000000 0.000000 \n",
"25% 704.654800 85.875800 76.717600 0.043750 \n",
"50% 771.509900 104.959000 80.717500 0.053122 \n",
"75% 813.349800 117.191000 86.680900 0.059375 \n",
"max 1172.474000 726.869000 100.000000 0.200000 \n",
"\n",
" XI84124.PV XI84125.PV FX87211.CPV1 FIC87211.PV \\\n",
"count 568868.000000 568864.000000 568748.000000 568873.000000 \n",
"mean 4.729106 23.870133 701.753521 4099.623820 \n",
"std 2.002726 10.889995 222.411906 1270.383514 \n",
"min 0.000000 0.000000 -319.424500 -141.249900 \n",
"25% 4.615610 23.181300 705.026125 3963.361000 \n",
"50% 5.664090 27.982300 771.734550 4365.759000 \n",
"75% 5.875010 30.394600 813.457500 4711.450000 \n",
"max 15.812500 51.659300 1146.185000 15645.750000 \n",
"\n",
" FIC87211.SV FX87211.P01 ... NIC88002.PV \\\n",
"count 568873.000000 568748.000000 ... 568748.000000 \n",
"mean 4369.277083 21.063556 ... 24.874992 \n",
"std 623.180916 2.717285 ... 7.449179 \n",
"min 0.000000 15.000000 ... -0.305536 \n",
"25% 4000.860000 19.000000 ... 20.495800 \n",
"50% 4366.150000 21.000000 ... 25.944700 \n",
"75% 4711.220000 23.000000 ... 30.657177 \n",
"max 14000.000000 35.000000 ... 58.597800 \n",
"\n",
" PIC88007.PV LIC88006.PV AIC88055.PV FIC88022.PV \\\n",
"count 568748.000000 568748.000000 568747.000000 568873.000000 \n",
"mean 8.484779 59.188180 35.818553 828.305065 \n",
"std 5.059790 31.757723 32.317187 241.930273 \n",
"min -0.660624 0.100289 -0.185952 -8.621460 \n",
"25% 4.414151 34.129130 -0.042471 823.644000 \n",
"50% 9.353214 45.527020 64.933850 893.697500 \n",
"75% 12.259470 99.866600 64.951930 945.023000 \n",
"max 36.658300 100.127200 64.983190 1303.840000 \n",
"\n",
" DIC88023.PV SI88033.PV SI88034.PV MQI88024.CPV \\\n",
"count 568873.000000 568873.000000 568873.000000 568748.000000 \n",
"mean 52.464238 39.967696 23.986799 688.014555 \n",
"std 12.142946 34.175055 33.253739 212.166525 \n",
"min -1.073746 -0.091523 -0.042978 -19.506880 \n",
"25% 53.834340 0.061580 0.043636 684.447975 \n",
"50% 54.826150 61.135900 0.058294 748.692300 \n",
"75% 55.838750 69.629300 66.137960 795.153125 \n",
"max 77.728490 99.626400 98.255230 1399.242000 \n",
"\n",
" target \n",
"count 568873.000000 \n",
"mean 52.463664 \n",
"std 12.142832 \n",
"min -1.073746 \n",
"25% 53.834050 \n",
"50% 54.824650 \n",
"75% 55.838450 \n",
"max 77.728490 \n",
"\n",
"[8 rows x 23 columns]"
]
},
"metadata": {
"tags": []
},
"execution_count": 10
}
]
},
{
"metadata": {
"id": "LSicdcLVLp7i",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# You can use pandas profiling to get an overview of the data\n",
"import pandas_profiling\n",
"profile = pandas_profiling.ProfileReport(df_train_val[:2000])\n",
"profile.to_file(outputfile=\"/tmp/myoutputfile.html\")\n",
"profile"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "X9gXoF9CxTst",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"# TPOT!\n",
"\n",
"TPOT is an automatic machine learning library that uses genetic algorithms to try different generations of scikit-learn algorithms.\n",
"\n",
"link: https://epistasislab.github.io/tpot/"
]
},
{
"metadata": {
"id": "_26g069T5KCG",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "eb67d081-78f6-42e4-c8e3-35d923022724",
"executionInfo": {
"status": "ok",
"timestamp": 1517110277832,
"user_tz": -480,
"elapsed": 779,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"# Check data for TPOT compatability\n",
"from tpot.base import check_X_y\n",
"check_X_y(X_train, y_train, accept_sparse=True)\n",
"check_X_y(X_val, y_val, accept_sparse=True)\n",
"'ok'"
],
"execution_count": 12,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"'ok'"
]
},
"metadata": {
"tags": []
},
"execution_count": 12
}
]
},
{
"metadata": {
"id": "pwEFjEGKYHEV",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# Ensure the it respects causality, by only giving each sample access to a window of past data\n",
"# make padded sequences, we need to make the data in shape (batch, window_of_timesteps, features)\n",
"\n",
"def timeseries_to_seq(x, window=3):\n",
" \"\"\"\n",
" Inputs:\n",
" - x: shape (timeseries, features)\n",
" - window: e.g. 3\n",
" Outputs:\n",
"- y: shape (window, batch, features)\n",
" \"\"\"\n",
" x_pad = np.pad(x, [[window,0],[0,0]], mode='constant')\n",
" y = np.stack([x_pad[i:i+window] for i in range(len(x))], axis=1)\n",
" return y"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "mijUpEIFYOza",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "ef055268-8e12-4b3c-a06c-cc6d19a7949e",
"executionInfo": {
"status": "ok",
"timestamp": 1517110279696,
"user_tz": -480,
"elapsed": 644,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"# For now I will just run on a subset of the data, for speed!\n",
"subset = 200\n",
"window=60*3\n",
"x=X_train[:subset]\n",
"y_stacked=y_train[:subset]\n",
"print(x.shape)\n",
"X_train_stacked = timeseries_to_seq(x, window=window).reshape((x.shape[0], -1))"
],
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"text": [
"(200, 22)\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "s1h_9IETxU0d",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"from tpot import TPOTRegressor\n",
"# A quick run of TPOT with small population and short number of generation\n",
"# About 25 minutes to run\n",
"tpot = TPOTRegressor(generations=3, population_size=10, verbosity=3)\n",
"tpot.fit(X_train_stacked, y_stacked)\n",
"tpot.export('tpot_hydrosaver_export.py')"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "lpw8PhAS59EC",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "18c73a74-802d-48ae-ce6c-ae46ce8a694d",
"executionInfo": {
"status": "ok",
"timestamp": 1517111729035,
"user_tz": -480,
"elapsed": 810,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"tpot.export('tpot_hydrosaver_export.py')"
],
"execution_count": 16,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {
"tags": []
},
"execution_count": 16
}
]
},
{
"metadata": {
"id": "UFeZKx9j27Fx",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 299
},
"outputId": "73c3978c-e876-45cd-9317-1e42b2ff3c56",
"executionInfo": {
"status": "ok",
"timestamp": 1517111731056,
"user_tz": -480,
"elapsed": 1665,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"# What's the pipeline it saved?\n",
"# In this case it found that LassoLarsCV(normalize=False) performed best\n",
"!cat tpot_hydrosaver_export.py"
],
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"text": [
"import numpy as np\r\n",
"import pandas as pd\r\n",
"from sklearn.linear_model import LassoLarsCV\r\n",
"from sklearn.model_selection import train_test_split\r\n",
"\r\n",
"# NOTE: Make sure that the class is labeled 'target' in the data file\r\n",
"tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\r\n",
"features = tpot_data.drop('target', axis=1).values\r\n",
"training_features, testing_features, training_target, testing_target = \\\r\n",
" train_test_split(features, tpot_data['target'].values, random_state=42)\r\n",
"\r\n",
"# Score on the training set was:-0.00011788279235816052\r\n",
"exported_pipeline = LassoLarsCV(normalize=False)\r\n",
"\r\n",
"exported_pipeline.fit(training_features, training_target)\r\n",
"results = exported_pipeline.predict(testing_features)\r\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "u8hGttzUwU3a",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# final score\n",
"def rmse(y_pred, y_true):\n",
" sqloss = (y_true-y_pred)**2\n",
" return np.sqrt(sqloss.mean())"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "Ypq2uShmIb3B",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
},
"output_extras": [
{
"item_id": 1
}
],
"base_uri": "https://localhost:8080/",
"height": 35
},
"outputId": "6ba65187-aaa7-4e2d-acb2-63fb8f7e4b88",
"executionInfo": {
"status": "ok",
"timestamp": 1517111877742,
"user_tz": -480,
"elapsed": 2417,
"user": {
"displayName": "Mike C",
"photoUrl": "//lh3.googleusercontent.com/-xWyBhueI0Vo/AAAAAAAAAAI/AAAAAAAAIg8/rU-qliu5l-M/s50-c-k-no/photo.jpg",
"userId": "110113503404408134511"
}
}
},
"cell_type": "code",
"source": [
"X_val_stacked = timeseries_to_seq(X_val, window=window).reshape((X_val.shape[0], -1))\n",
"y_pred = tpot.predict(X_val_stacked)\n",
"score = rmse(y_pred, y_val)\n",
"score"
],
"execution_count": 27,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"3.8885540109338006"
]
},
"metadata": {
"tags": []
},
"execution_count": 27
}
]
},
{
"metadata": {
"id": "RfqDTdZCIb0X",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"X_test_stacked = timeseries_to_seq(X_test, window=window).reshape((X_test.shape[0], -1))\n",
"y_pred = tpot.predict(X_test_stacked)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "qhCNUxNExifF",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# save\n",
"s = pd.Series(y_pred, name='target')\n",
"assert len(s)==439140\n",
"\n",
"import datetime\n",
"ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')\n",
"\n",
"submission_file = 'submission_%s_score_%2.2f.csv'%(ts,score)\n",
"s.to_csv(submission_file, index=False, header=True, float_format='%2.9s')\n",
"print('upload file', submission_file)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "Fh1ceEwmiv3h",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
"# and download\n",
"import google\n",
"google.colab.files.download(submission_file)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "mvKjdQsK5mOz",
"colab_type": "code",
"colab": {
"autoexec": {
"startup": false,
"wait_interval": 0
}
}
},
"cell_type": "code",
"source": [
""
],
"execution_count": 0,
"outputs": []
}
]
}
# -*- coding: utf-8 -*-
"""hydrosaver.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/notebook#fileId=1gs18AtviN2Y3jSsVF2rgprAtCA8Jnt_8
"""
# !pip install http://download.pytorch.org/whl/cu80/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
#!pip install http://download.pytorch.org/whl/cpu/torch-0.3.0.post4-cp36-cp36m-linux_x86_64.whl
#!pip install xgboost tpot pandas-profiling seaborn torchvision tqdm
# %pylab inline
import numpy as np
import pandas as pd
import seaborn as sn
import os
from tqdm import tqdm
eps = 1e-6  # small constant added to denominators below to avoid division by zero
seed = 42  # fixed seed so runs are reproducible
np.random.seed(seed)  # seeds the global NumPy RNG
"""# Download data"""
# from https://stackoverflow.com/a/39225039/221742
import requests
def download_file_from_google_drive(id, destination):
    """Download a (possibly large) file from Google Drive by file id.

    For big files Drive interposes a virus-scan warning page and sets a
    confirmation cookie; when that cookie is present we re-request the
    file with the confirmation token so the real content streams down.

    Inputs:
    - id: the Google Drive file id
    - destination: local path the downloaded bytes are written to
    """
    URL = "https://docs.google.com/uc?export=download"
    CHUNK_SIZE = 32768

    def _confirm_token(resp):
        # Drive signals "needs confirmation" via a download_warning* cookie.
        for name, value in resp.cookies.items():
            if name.startswith('download_warning'):
                return value
        return None

    def _write_to_disk(resp, path):
        with open(path, "wb") as fh:
            for chunk in resp.iter_content(CHUNK_SIZE):
                if chunk:  # filter out keep-alive new chunks
                    fh.write(chunk)

    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = _confirm_token(response)
    if token:
        response = session.get(URL, params={'id': id, 'confirm': token}, stream=True)
    _write_to_disk(response, destination)
# Download the competition data once into ./data/original.
# exist_ok=True makes this idempotent (replaces the isdir() check).
os.makedirs('data/original', exist_ok=True)
download_file_from_google_drive('15BqAMEBsTjAzT2eJXED-zA1pdHpGWZLl', './data/original/train.csv')
download_file_from_google_drive('1Xi_lLCKTsgSNECerpIPhQPzUCtmutDeS', './data/original/publishable_test_set.csv')
"""# Load data"""
# So we have some unique NaN values: 'No Data', 'Bad Input', etc. We also have date index col
df_train_val = pd.read_csv('./data/original/train.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])
df_train_val = df_train_val.dropna(axis=1, how='all') # drop the columns that are all NaN's
df_train_val = df_train_val.resample('1T').first()
df_train_val = df_train_val.drop('DIC88023.PV', 1)
df_test = pd.read_csv('./data/original/publishable_test_set.csv', index_col='timestamp', parse_dates=[0], na_values=['', 'No Data', 'Bad Input', 'Scan Off', 'I/O Timeout'])
df_test = df_test.dropna(axis=1, how='all') # drop the columns that are all NaN's
y_train_val = df_train_val.target
x_train_val = df_train_val.drop('target', 1) # We don't want the answer to be in the input data
x_test = df_test
# normalize the input columns
x_mean = x_train_val.mean()
x_std = x_train_val.mean()
x_train_val = (x_train_val - x_mean)/(x_std + eps)
x_test = (x_test - x_mean)/(x_std + eps)
# TODO I may want to normalize y too
print('mean', x_mean)
print('std', x_std)
# TPOT won't accept NaNs, so we either replace or drop them.
# Another approach would be unique sentinel values or indicator columns.
# Since the inputs are standardised, 0 is the "nothing" value — use that.
x_train_val = x_train_val.fillna(0)
y_train_val = y_train_val.fillna(0)
x_test = x_test.fillna(0)
# Since it's a timeseries, the validation set must lie in the future:
# first 85% of rows train, last 15% validate (no shuffling).
val_split_in = int(len(df_train_val.index)*0.85)
x_val = x_train_val[val_split_in:]
x_train = x_train_val[:val_split_in]
y_val = y_train_val[val_split_in:]
y_train = y_train_val[:val_split_in]
# Convert to numpy. Uses .values instead of the deprecated
# DataFrame.as_matrix() (removed in modern pandas).
X_train = x_train.values
y_train = y_train.values
X_val = x_val.values
y_val = y_val.values
X_test = x_test.values
"""# Have look into the data"""
df_train_val.info()
df_train_val.describe()
# You can use pandas profiling to get an overview of the data
import pandas_profiling
profile = pandas_profiling.ProfileReport(df_train_val[:2000])
profile.to_file(outputfile="/tmp/myoutputfile.html")
profile
"""# TPOT!
TPOT is an automated machine learning library that uses genetic algorithms to try different generations of scikit-learn algorithms.
link: https://epistasislab.github.io/tpot/
"""
# Check the data for TPOT compatibility (raises if shapes/dtypes are off).
from tpot.base import check_X_y
for _X, _y in ((X_train, y_train), (X_val, y_val)):
    check_X_y(_X, _y, accept_sparse=True)
'ok'
# Ensure the it respects causality, by only giving each sample access to a window of past data
# make padded sequences, we need to make the data in shape (batch, window_of_timesteps, features)
def timeseries_to_seq(x, window=3):
    """Turn a 2-D timeseries into per-sample windows of past observations.

    Sample i gets the `window` rows *preceding* row i (zero-padded at the
    start of the series), so each sample sees strictly-past data only —
    this respects causality for forecasting.

    Inputs:
    - x: array of shape (timesteps, features)
    - window: number of past timesteps per sample, e.g. 3
    Outputs:
    - y: array of shape (batch, window, features)

    BUG FIX: the original stacked on axis=1, producing (window, batch,
    features); callers then reshape to (batch, -1), which interleaves rows
    from different samples. Stacking on axis=0 yields the intended
    (batch, window, features) layout, which flattens correctly per sample.
    """
    x_pad = np.pad(x, [[window, 0], [0, 0]], mode='constant')
    return np.stack([x_pad[i:i + window] for i in range(len(x))], axis=0)
# For now I will just run on a subset of the data, for speed!
subset = 200       # number of training rows to use
window = 60 * 3    # 3 hours of 1-minute samples per window
x_sub = X_train[:subset]
y_stacked = y_train[:subset]
print(x_sub.shape)
# Window each sample, then flatten to 2-D as sklearn-style estimators expect.
X_train_stacked = timeseries_to_seq(x_sub, window=window).reshape((x_sub.shape[0], -1))
from tpot import TPOTRegressor
# A quick TPOT run with a small population and few generations
# (about 25 minutes). Increase both for a serious search.
tpot = TPOTRegressor(generations=3, population_size=10, verbosity=3)
tpot.fit(X_train_stacked, y_stacked)
# Export the winning scikit-learn pipeline as a standalone script.
# (The original called export() twice in a row; once is enough.)
tpot.export('tpot_hydrosaver_export.py')
# What's the pipeline it saved?
# In this case it found that LassoLarsCV(normalize=False) performed best
#!cat tpot_hydrosaver_export.py
# final score
def rmse(y_pred, y_true):
    """Root-mean-square error between predictions and targets."""
    return np.sqrt(np.mean((y_true - y_pred) ** 2))
# Score the fitted pipeline on the held-out (future) validation period.
X_val_stacked = timeseries_to_seq(X_val, window=window).reshape((X_val.shape[0], -1))
score = rmse(tpot.predict(X_val_stacked), y_val)
score
# Predict the test period for the submission file.
X_test_stacked = timeseries_to_seq(X_test, window=window).reshape((X_test.shape[0], -1))
y_pred = tpot.predict(X_test_stacked)
# save
# BUG FIX: the original wrote pd.Series(y_submit, ...) but `y_submit` is
# never defined anywhere (NameError); the test-set predictions are `y_pred`.
s = pd.Series(y_pred, name='target')
assert len(s)==439140  # expected number of rows in the test set
import datetime
ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')
# Timestamped filename with the validation score embedded, for bookkeeping.
submission_file = 'submission_%s_score_%2.2f.csv'%(ts,score)
s.to_csv(submission_file, index=False, header=True, float_format='%2.9s')
print('upload file', submission_file)
# and download (Colab only)
# BUG FIX: `import google` only binds the namespace package and does not
# guarantee `google.colab.files` is loaded; import the submodule explicitly.
from google.colab import files
files.download(submission_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment