Created
April 9, 2019 17:12
-
-
Save Hsankesara/a5ba2e47dfa1452ab30ad2f51c5ac441 to your computer and use it in GitHub Desktop.
Sustainable Industry: Rinse Over Run competition. Ranked 58th out of 1,200+ participants. Link: https://www.drivendata.org/competitions/56/predict-cleaning-time-series/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sustainable Industry: Rinse Over Run competition. Ranked 58th out of 1,200+ participants. Link: https://www.drivendata.org/competitions/56/predict-cleaning-time-series/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"metadata": { | |
"trusted": true, | |
"_uuid": "f447d1bb4323165dc185ed4c3de674cb1eabdf66" | |
}, | |
"cell_type": "code", | |
"source": "!ls", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "ldX6xHXQUXip", | |
"colab_type": "code", | |
"outputId": "c2325f9f-3601-48da-83c7-5a3d5e5af44e", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"trusted": true, | |
"_uuid": "672962fb990c848adcf502d762c660006ab71711" | |
}, | |
"cell_type": "code", | |
"source": "import pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\nfrom tqdm import tqdm\nimport gc\ntqdm.pandas()\ngc.collect()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "N_uO6NYwUu6z", | |
"colab_type": "text", | |
"_uuid": "c43fdafc5668da60601b3619b84b4adefc433194" | |
}, | |
"cell_type": "markdown", | |
"source": "## Getting Data" | |
}, | |
{ | |
"metadata": { | |
"id": "uAudXj1GUx77", | |
"colab_type": "code", | |
"outputId": "3763ec6c-c566-4de4-802d-09c2e436a185", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 765 | |
}, | |
"trusted": true, | |
"_uuid": "0e0b1c43bbeadee6f0f3b74db8659b609434146d" | |
}, | |
"cell_type": "code", | |
"source": "!wget https://s3.amazonaws.com/drivendata/data/56/public/train_values.zip\n!wget https://s3.amazonaws.com/drivendata/data/56/public/test_values.zip\n!wget https://s3.amazonaws.com/drivendata/data/56/public/submission_format.csv\n!wget https://s3.amazonaws.com/drivendata/data/56/public/train_labels.csv", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "pGmeEpy0U3mf", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "9df58bfc359434662a820cc538946fa51f98e19a" | |
}, | |
"cell_type": "code", | |
"source": "!mkdir data", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "oXgGSzb5U9w-", | |
"colab_type": "code", | |
"outputId": "120088c0-5a87-4f88-f6b8-b9f2944ff359", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 85 | |
}, | |
"trusted": true, | |
"_uuid": "73a5379f050e9710c7cc3c0a296655c22ca6a97d" | |
}, | |
"cell_type": "code", | |
"source": "!unzip train_values.zip -d data/\n!unzip test_values.zip -d data/\n!mv train_labels.csv data/\n!mv submission_format.csv data/", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "Envm7L7YVOye", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "7b3e4eef5b155888faf42dedcc55f16dad5145c7" | |
}, | |
"cell_type": "code", | |
"source": "!rm train_values.zip\n!rm test_values.zip", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "9vyBgnibV7ZN", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "226bdebe5096408d18562c8e3b63abcc984d8578" | |
}, | |
"cell_type": "code", | |
"source": "df_train = pd.read_csv('data/train_values.csv')\ndf_test = pd.read_csv('data/test_values.csv')\ndf_labels = pd.read_csv('data/train_labels.csv')\ndf_sub = pd.read_csv('data/submission_format.csv')", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "1tAjPP4hrY4V", | |
"colab_type": "text", | |
"_uuid": "26a4f12df532d266ed0bfc0ac24a9ef516bb4e4e" | |
}, | |
"cell_type": "markdown", | |
"source": "## Feature Engineering" | |
}, | |
{ | |
"metadata": { | |
"id": "K5E0PN8O3qUt", | |
"colab_type": "code", | |
"outputId": "d53bf399-ad7c-4f72-9d15-9746c9f41330", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"trusted": true, | |
"_uuid": "ea38a076f9141e681fad95b43451a81de4d7d180" | |
}, | |
"cell_type": "code", | |
"source": "gc.collect()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "K-SNw9EKcdWC", | |
"colab_type": "code", | |
"outputId": "d1a5fd07-9d16-42e3-9317-d24c79707d27", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 204 | |
}, | |
"trusted": true, | |
"_uuid": "9d0df8b4d0ab8254f6a1e0c3b3d506e50a91f9e7" | |
}, | |
"cell_type": "code", | |
"source": "df_labels.head()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "I8vJtlOpxTcR", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "156a9ba974d55c07d2d4e8d14f29bf63cee7a555" | |
}, | |
"cell_type": "code", | |
"source": "df_train.return_temperature = np.square(df_train.return_temperature)\ndf_test.return_temperature = np.square(df_test.return_temperature)", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "5V_7N4qIzhO2", | |
"colab_type": "code", | |
"outputId": "5053c840-e200-47b5-d8f2-199d0512ddaf", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"trusted": true, | |
"_uuid": "45b11fa955d66ae7cec55642b1687acf7d6723c3" | |
}, | |
"cell_type": "code", | |
"source": "df_test.return_turbidity.min(), df_train.return_turbidity.min()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "F8xBwKCAx8zk", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "0966c68eb83eeb287e393f7159da3160692c03d3" | |
}, | |
"cell_type": "code", | |
"source": "df_train.return_turbidity = np.log(df_train.return_turbidity + 1)\ndf_test.return_turbidity = np.log(df_test.return_turbidity + 1)", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "9nuwEYocyAuo", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "5db479c627270e982067678f736254fb2c79c2ce" | |
}, | |
"cell_type": "code", | |
"source": "df_train['tank_level_diff12'] = df_train['tank_level_pre_rinse'] - df_train['tank_level_caustic']\ndf_train['tank_level_diff23'] = df_train['tank_level_caustic'] - df_train['tank_level_acid']\ndf_train['tank_level_diff34'] = df_train['tank_level_acid'] - df_train['tank_level_clean_water']\ndf_test['tank_level_diff12'] = df_test['tank_level_pre_rinse'] - df_test['tank_level_caustic']\ndf_test['tank_level_diff23'] = df_test['tank_level_caustic'] - df_test['tank_level_acid']\ndf_test['tank_level_diff34'] = df_test['tank_level_acid'] - df_test['tank_level_clean_water']", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "-44WEH5l2HuN", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "caefb89e2cfb4bb3e2cf1ebdc3a29347f597cbbc" | |
}, | |
"cell_type": "code", | |
"source": "df_train.drop(['tank_level_pre_rinse', 'tank_level_caustic', 'tank_level_acid', 'tank_level_clean_water'], axis=1, inplace=True)\ndf_test.drop(['tank_level_pre_rinse', 'tank_level_caustic', 'tank_level_acid', 'tank_level_clean_water'], axis=1, inplace=True)", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "nZj9eU981GsW", | |
"colab_type": "code", | |
"outputId": "66e383a3-e330-4105-f189-b49a4db1b260", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 423 | |
}, | |
"trusted": true, | |
"_uuid": "11664523eec52be6fb6f4f19447f9104b08f9d12" | |
}, | |
"cell_type": "code", | |
"source": "df_test.head()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "W7-JNx4X196C", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "a75a4ac72f9b34f9fe9c2b5dc7258a3150171806" | |
}, | |
"cell_type": "code", | |
"source": "# Temperature gaps between consecutive tanks.\n# Both frames go through the same loop so train and test features are built identically\n# (the test-side diff23 used to subtract the caustic column from itself, i.e. a constant 0).\nfor frame in (df_train, df_test):\n    frame['tank_temp_diff12'] = frame['tank_temperature_pre_rinse'] - frame['tank_temperature_caustic']\n    frame['tank_temp_diff23'] = frame['tank_temperature_caustic'] - frame['tank_temperature_acid']", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "n8BGdaBKhn3c", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "aeef202b6857974278117fca7463db685a7d5899" | |
}, | |
"cell_type": "code", | |
"source": "df_train.drop(['tank_temperature_pre_rinse', 'tank_temperature_caustic', 'tank_temperature_acid'], axis=1, inplace=True)\ndf_test.drop(['tank_temperature_pre_rinse', 'tank_temperature_caustic', 'tank_temperature_acid'], axis=1, inplace=True)", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "Fn5PMgWUhp7f", | |
"colab_type": "code", | |
"outputId": "a082fdff-b186-4f8b-d9cb-c00209bbb469", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 423 | |
}, | |
"trusted": true, | |
"_uuid": "3dda70a287c0a7275115ccb7f8544c349d71e1e7" | |
}, | |
"cell_type": "code", | |
"source": "df_train.head()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "4g_KMx0Ajfnv", | |
"colab_type": "text", | |
"_uuid": "f70a0ab2f187821490ed835d94200f49a1fb538d" | |
}, | |
"cell_type": "markdown", | |
"source": "## Data Generation" | |
}, | |
{ | |
"metadata": { | |
"id": "Jxn3jFCDi0BZ", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "f74e42ede2379ae7a03ccfae15641d3c6227177a" | |
}, | |
"cell_type": "code", | |
"source": "trainy = df_train['target_time_period']\ntrainx = df_train.drop(['target_time_period'], axis=1)\ntesty = df_test['target_time_period']\ntestx = df_test.drop(['target_time_period'], axis=1)", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "w_BkVYDnjxdF", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "9df730f93dd228b4b481ee2f9953b3b2572fa4a6" | |
}, | |
"cell_type": "code", | |
"source": "trainy = trainy.values * 1", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "5BdfRPBIEoVe", | |
"colab_type": "code", | |
"outputId": "689ece2a-1d88-43fc-abd5-9bb4c92953e1", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"trusted": true, | |
"_uuid": "8ee967b8d8ffd61d04506746781ca7382cd15097" | |
}, | |
"cell_type": "code", | |
"source": "del df_train\ndel df_test\ngc.collect()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "-YT_H_2-m3Wf", | |
"colab_type": "code", | |
"outputId": "4e0b1f58-b0ad-499a-de28-e2c2f15e4a68", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 423 | |
}, | |
"trusted": true, | |
"_uuid": "7497aee8668b61a2b1afbdb05e31942f8a90f192" | |
}, | |
"cell_type": "code", | |
"source": "trainx.head()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "CKSIZBqCq7Af", | |
"colab_type": "code", | |
"outputId": "a60a0383-6344-4ba1-a359-a43931dbaa27", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 663 | |
}, | |
"trusted": true, | |
"_uuid": "b1b0eada99cca464098414c2ce08371b1d92e003" | |
}, | |
"cell_type": "code", | |
"source": "trainx.info()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "0aa0IJaoq69j", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "61fadb15211fc34b23f3b679a346480d9d70d462" | |
}, | |
"cell_type": "code", | |
"source": "# Cast boolean flag columns to 0/1 integers in BOTH feature frames,\n# so the model is trained and applied on columns of the same dtype.\nfor frame in (trainx, testx):\n    bool_cols = frame.columns[frame.dtypes == bool]\n    if len(bool_cols):\n        # Assigning by column labels replaces the columns outright, so the integer dtype is kept.\n        frame[bool_cols] = frame[bool_cols].astype('int')", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "V_Ho2yrXq660", | |
"colab_type": "code", | |
"outputId": "437a5caf-34ba-4f06-83b1-f74983ef741e", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 663 | |
}, | |
"trusted": true, | |
"_uuid": "c86234ec9c50450785f30852c877234badab90ff" | |
}, | |
"cell_type": "code", | |
"source": "trainx.info()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "GVirhdwS1yHZ", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "8370608b615f1edf10095d5a04b00867b870116f" | |
}, | |
"cell_type": "code", | |
"source": "# Convert timestamps to seconds relative to a fixed reference instant.\n# pd.datetime is deprecated/removed in current pandas, and a constant reference keeps reruns deterministic.\n# Only differences between rows matter: the following cell re-bases every process to start at 0.\ntrainx.timestamp = (pd.to_datetime(trainx.timestamp, format=\"%Y-%m-%d %H:%M:%S\") - pd.Timestamp('1970-01-01')).dt.total_seconds()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "V29mS62kq635", | |
"colab_type": "code", | |
"outputId": "2452bac1-b6d8-44d5-8348-56145d8674e8", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"trusted": true, | |
"_uuid": "74a22a80c65753dd1169b85eb1332736335c9540" | |
}, | |
"cell_type": "code", | |
"source": "# Re-base timestamps so that every process starts at 0.\n# A grouped transform does this in a single pass instead of scanning the whole frame once per process_id.\npids = trainx.process_id.unique()\ntrainx['timestamp'] = trainx['timestamp'] - trainx.groupby('process_id')['timestamp'].transform('min')", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "zS8dKgPiwF0U", | |
"colab_type": "code", | |
"outputId": "4edc6ca6-7e66-410d-e62b-e6f2916b166f", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 253 | |
}, | |
"trusted": true, | |
"_uuid": "53f1492c58003e94b4047922774a03e560dbff51" | |
}, | |
"cell_type": "code", | |
"source": "trainx.head()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "SdkG9c31xJVn", | |
"colab_type": "code", | |
"outputId": "63b09a6f-819e-436d-e5a6-900407b47f9d", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"trusted": true, | |
"_uuid": "9d62387eb6314c582291d2a5388575a3f44eb6da" | |
}, | |
"cell_type": "code", | |
"source": "# Test-frame timestamps: seconds from a fixed reference instant, then re-based per process\n# so that every process starts at 0.\n# pd.datetime is deprecated/removed in current pandas; a constant reference also keeps reruns deterministic.\npids = testx.process_id.unique()\ntestx.timestamp = (pd.to_datetime(testx.timestamp, format=\"%Y-%m-%d %H:%M:%S\") - pd.Timestamp('1970-01-01')).dt.total_seconds()\n# One grouped transform replaces a full-frame scan per process_id.\ntestx['timestamp'] = testx['timestamp'] - testx.groupby('process_id')['timestamp'].transform('min')", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "76vmXSWRzQz8", | |
"colab_type": "code", | |
"outputId": "b1e3d669-6596-40b7-97c7-3639fedcc558", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 253 | |
}, | |
"trusted": true, | |
"_uuid": "3302ba234933fd86472163722920747617c2a134" | |
}, | |
"cell_type": "code", | |
"source": "testx.head()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "Ws2E22vV1Pu4", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "0e50d315e4e34a2f9cee97350129bf3a6e3794ee" | |
}, | |
"cell_type": "code", | |
"source": "from sklearn.preprocessing import OneHotEncoder, LabelEncoder\nfrom sklearn.pipeline import Pipeline", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "InXy5HA26-bJ", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "00f4d6810924818c9fafcd19f8594c8e4508fd77" | |
}, | |
"cell_type": "code", | |
"source": "le = LabelEncoder()\nenc = OneHotEncoder()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "Jp17q_c06-1z", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "19b5c86ce039dc295e53deed2d63ac4298202694" | |
}, | |
"cell_type": "code", | |
"source": "#le_encoded = trainx[['phase', 'pipeline']].apply(le.fit_transform)\n#enc.fit(le_encoded)\n#onehotlabels = enc.transform(le_encode).toarray()\n#onehotlabels.shape", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "3t3YNFeZ6-8z", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "cfda21025effef3542c0745e7f4ce2cb6b4c086a" | |
}, | |
"cell_type": "code", | |
"source": "trainx[['phase']] = trainx[['phase']].apply(le.fit_transform)\ntestx[['phase']] = testx[['phase']].apply(le.transform)\ntrainx[['pipeline']] = trainx[['pipeline']].apply(le.fit_transform)\ntestx[['pipeline']] = testx[['pipeline']].apply(le.transform)", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "hLBJ1l9i6_Cg", | |
"colab_type": "code", | |
"outputId": "bc7f13f2-22c0-4697-efa1-585047e12f90", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 253 | |
}, | |
"trusted": true, | |
"_uuid": "6d36096d35b81bdcd86bee81b5715de949a2ae13" | |
}, | |
"cell_type": "code", | |
"source": "trainx.head()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "pE4EDyMwMlja", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "1a8ee748-7dfd-45bd-8de2-fb4924468769", | |
"trusted": true, | |
"_uuid": "7537afdb4ec2a55f495abe11d8fcf2d7199902a9" | |
}, | |
"cell_type": "code", | |
"source": "trainx_meta = trainx[['row_id', 'process_id', 'object_id', 'return_turbidity', 'return_flow']]\ntrainx.drop(['row_id', 'process_id', 'object_id'], axis=1, inplace=True)\ntestx_meta = testx[['row_id', 'process_id', 'object_id', 'return_turbidity', 'return_flow']]\ntestx.drop(['row_id', 'process_id', 'object_id'], axis=1, inplace=True)\ngc.collect()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "XALfVBtHOC4s", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 1241 | |
}, | |
"outputId": "05cecf7d-696c-45b1-b0dd-a536091d1b52", | |
"trusted": true, | |
"_uuid": "32f012e217d74ac2e74564ddbff75bcb6bfba8c4" | |
}, | |
"cell_type": "code", | |
"source": "print(trainx.head(), testx.head())", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "snCGsSUZHYW3", | |
"colab_type": "text", | |
"_uuid": "05cc4cb1481e531015bbab6cb2a5568204286c0c" | |
}, | |
"cell_type": "markdown", | |
"source": "## Modeling" | |
}, | |
{ | |
"metadata": { | |
"id": "fMPonhvwHXsI", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "8bda2fdd94f168c1bb4c024125a6806d5ddf440c" | |
}, | |
"cell_type": "code", | |
"source": "import lightgbm as lgb\nfrom sklearn.metrics import f1_score\nfrom sklearn.model_selection import StratifiedKFold", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "z-Zaa3Sng_SY", | |
"colab_type": "code", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 34 | |
}, | |
"outputId": "f5d9c7ac-37cc-4b05-8c45-89308f7fa4c0", | |
"trusted": true, | |
"_uuid": "7d34342456f407cc7420a22b082125ffa153bf1f" | |
}, | |
"cell_type": "code", | |
"source": "gc.collect()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "RR08NsmY6_An", | |
"colab_type": "code", | |
"outputId": "7daf3c9c-e03d-40f4-8434-016c184bc249", | |
"colab": { | |
"base_uri": "https://localhost:8080/", | |
"height": 292 | |
}, | |
"trusted": true, | |
"_uuid": "29c45c6b6dad6ddf20bc735da7bf5dad7619f627" | |
}, | |
"cell_type": "code", | |
"source": "from sklearn.metrics import roc_curve, precision_recall_curve\n\n\ndef threshold_search(y_true, y_proba, plot=False):\n    '''Find the probability cut-off that maximises F1.\n\n    F1 is evaluated at every operating point returned by precision_recall_curve.\n    Returns a dict with the best cut-off under key threshold and the best F1 under key f1.\n    With plot=True the F1-vs-threshold curve is drawn with the optimum marked.\n    '''\n    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)\n    # Pad thresholds by one entry so it can be indexed with positions taken from F.\n    thresholds = np.append(thresholds, 1.001)\n    # A zero precision or recall gives 1/0 = inf and therefore F1 = 0; silence only that warning.\n    with np.errstate(divide='ignore'):\n        F = 2 / (1/precision + 1/recall)\n    best_score = np.max(F)\n    best_th = thresholds[np.argmax(F)]\n    if plot:\n        plt.plot(thresholds, F, '-b')\n        plt.plot([best_th], [best_score], '*r')\n        plt.show()\n    search_result = {'threshold': best_th, 'f1': best_score}\n    return search_result\n\n\ndef run_cv_model(train, test, target, model_fn, params={}, eval_fn=None, label='model'):\n    '''Run 5-fold stratified cross-validation around model_fn.\n\n    model_fn(dev_X, dev_y, val_X, val_y, test, params) must return\n    (validation predictions, test predictions, feature importances).\n    Returns a dict holding the out-of-fold predictions (train), the fold-averaged\n    test predictions (test), the per-fold F1 scores (cv) and a long-format\n    feature-importance frame (importance). Scores are only collected when eval_fn\n    is not None; the score itself always comes from threshold_search.\n    '''\n    n_folds = 5\n    # shuffle=False already gives a deterministic split; current scikit-learn rejects a\n    # random_state passed together with shuffle=False, so none is given.\n    kf = StratifiedKFold(n_splits=n_folds, shuffle=False)\n    pred_train = np.zeros((train.shape[0], 1))\n    fold_importances = []\n    pred_full_test = 0\n    cv_scores = []\n    for i, (dev_index, val_index) in enumerate(kf.split(train, target), start=1):\n        print('Started ' + label + ' fold ' + str(i) + '/' + str(n_folds))\n        if isinstance(train, pd.DataFrame):\n            dev_X, val_X = train.iloc[dev_index], train.iloc[val_index]\n        else:\n            dev_X, val_X = train[dev_index], train[val_index]\n        dev_y, val_y = target[dev_index], target[val_index]\n        # Hand each fold its own copy: runLGB pops entries out of the dict it receives.\n        pred_val_y, pred_test_y, importances = model_fn(dev_X, dev_y, val_X, val_y, test, params.copy())\n        gc.collect()\n        pred_full_test = pred_full_test + pred_test_y\n        pred_train[val_index] = pred_val_y\n        if eval_fn is not None:\n            cv_score = threshold_search(val_y, pred_val_y)['f1']\n            cv_scores.append(cv_score)\n            print(label + ' cv score {}: F1 {}'.format(i, cv_score))\n        fold_importance_df = pd.DataFrame()\n        fold_importance_df['feature'] = train.columns.values\n        fold_importance_df['importance'] = importances\n        fold_importance_df['fold'] = i\n        fold_importances.append(fold_importance_df)\n    # Concatenate once instead of growing a frame inside the loop.\n    feature_importance_df = pd.concat(fold_importances, axis=0)\n    print('{} cv F1 scores : {}'.format(label, cv_scores))\n    print('{} cv mean F1 score : {}'.format(label, np.mean(cv_scores)))\n    # Report the spread with np.std (this line used to repeat the mean).\n    print('{} cv std F1 score : {}'.format(label, np.std(cv_scores)))\n    pred_full_test = pred_full_test / float(n_folds)\n    results = {'label': label,\n               'train': pred_train, 'test': pred_full_test,\n               'cv': cv_scores,\n               'importance': feature_importance_df}\n    return results\n\n\nparams = {\n    'objective': 'binary',\n    'learning_rate': 0.02,\n    'num_leaves': 76,\n    'feature_fraction': 0.64,\n    'bagging_fraction': 0.8,\n    'bagging_freq': 1,\n    'boosting_type': 'gbdt',\n    'metric': 'binary_logloss',\n    'min_split_gain': 0.01,\n    'min_child_samples': 150,\n    'min_child_weight': 0.1,\n    'verbosity': -1,\n    'data_random_seed': 3,\n    'early_stop': 100,\n    'verbose_eval': 100,\n    'num_rounds': 1000\n}\n\n\ndef runLGB(train_X, train_y, test_X, test_y, test_X2, params):\n    '''Fit one LightGBM model on a fold and predict the validation and test frames.\n\n    params is consumed destructively: num_rounds, verbose_eval and a truthy early_stop\n    are popped out before the remainder is handed to lgb.train.\n    Returns (validation predictions, test predictions, feature importances); both\n    prediction arrays are reshaped to a single column.\n    '''\n    print('Prep LGB')\n    d_train = lgb.Dataset(train_X, label=train_y)\n    d_valid = lgb.Dataset(test_X, label=test_y)\n    watchlist = [d_train, d_valid]\n    print('Train LGB')\n    num_rounds = params.pop('num_rounds')\n    verbose_eval = params.pop('verbose_eval')\n    early_stop = None\n    if params.get('early_stop'):\n        early_stop = params.pop('early_stop')\n    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of the LightGBM\n    # release this notebook was written for; confirm they exist in the installed version.\n    model = lgb.train(params,\n                      train_set=d_train,\n                      num_boost_round=num_rounds,\n                      valid_sets=watchlist,\n                      verbose_eval=verbose_eval,\n                      early_stopping_rounds=early_stop)\n    print('Predict 1/2')\n    pred_test_y = model.predict(test_X, num_iteration=model.best_iteration)\n    print(test_y, pred_test_y, pred_test_y > 0.33)\n    f1 = f1_score(test_y, pred_test_y > 0.33)\n    print(\"f1 score = \", f1)\n    print('Predict 2/2')\n    pred_test_y2 = model.predict(test_X2, num_iteration=model.best_iteration)\n    return pred_test_y.reshape(-1, 1), pred_test_y2.reshape(-1, 1), model.feature_importance()\n\n\nresults = run_cv_model(trainx, testx, trainy, runLGB, params, label='lgb', eval_fn=f1_score)", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "HPqmtRA4TT8C", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "a786b0e836c3fff441b50a9b3249cfc5b7ad7e02" | |
}, | |
"cell_type": "code", | |
"source": "thresh_res = threshold_search(trainy, results['train'])\nthresh_res", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "nPMR51ZRTUIQ", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "7711b8cb9921e48dd4fb36572de8d551bc6c0ea9" | |
}, | |
"cell_type": "code", | |
"source": "trainx_meta = trainx_meta.join(pd.Series((results['train'] > thresh_res['threshold'])[:,0], name='expected_y'))", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"_uuid": "7a1a7ec84e5bfc0cf8652c9717f7f4978e45c96b" | |
}, | |
"cell_type": "code", | |
"source": "from sklearn.utils import check_array\ndef mean_absolute_percentage_error(y_true, y_pred): \n #y_true, y_pred = check_array(y_true, y_pred)\n return np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 290000)))", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"_uuid": "3a6695b783db894ad17e1c2b5b92b02d38a532b6" | |
}, | |
"cell_type": "code", | |
"source": "# Per-process estimate: sum of return_flow * return_turbidity over the rows that are\n# predicted to lie in the target period (expected_y) and have positive return flow.\n# NOTE(review): return_turbidity was replaced by log(return_turbidity + 1) in the feature-engineering\n# cells, so this sums flow * log-turbidity; confirm that is intended before comparing with the labels.\npids = trainx_meta.process_id.unique()\nselected = trainx_meta[(trainx_meta.expected_y == True) & (trainx_meta.return_flow > 0)]\ncontribution = selected.return_flow * selected.return_turbidity\n# One grouped sum replaces a full-frame scan per process; processes without selected rows get 0,\n# and the result keeps the order of pids.\nvalues = contribution.groupby(selected.process_id).sum().reindex(pids, fill_value=0.0).values", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"_uuid": "f4f3c68546d8a1caa2323d14e07c86988f135491" | |
}, | |
"cell_type": "code", | |
"source": "values = values[:,]", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"_uuid": "b7325b07926ce175369f0739b37d1ef588c2cf9a" | |
}, | |
"cell_type": "code", | |
"source": "print(mean_absolute_percentage_error(df_labels.final_rinse_total_turbidity_liter.values, values))", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "E0tq80b2TUFJ", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "fcdcf62c0cdaf06341369446ae9646227f922314" | |
}, | |
"cell_type": "code", | |
"source": "testx_meta = testx_meta.join(pd.Series((results['test'] > thresh_res['threshold'])[:,0], name='expected_y'))", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "n4n-Hb0vTUCv", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "75e8b51fa6afe866424416cc395b1764b00883c2" | |
}, | |
"cell_type": "code", | |
"source": "# Per-process prediction for the test set: sum of return_flow * return_turbidity over the rows\n# predicted to lie in the target period (expected_y) with positive return flow.\n# NOTE(review): return_turbidity was replaced by log(return_turbidity + 1) in the feature-engineering\n# cells, so this sums flow * log-turbidity; confirm that is intended before these values are submitted.\npids = testx_meta.process_id.unique()\nselected = testx_meta[(testx_meta.expected_y == True) & (testx_meta.return_flow > 0)]\ncontribution = selected.return_flow * selected.return_turbidity\n# One grouped sum replaces a full-frame scan per process; processes without selected rows get 0,\n# and the result keeps the order of pids.\nvalues = contribution.groupby(selected.process_id).sum().reindex(pids, fill_value=0.0).values", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"id": "Mw1NraVMTUAa", | |
"colab_type": "code", | |
"colab": {}, | |
"trusted": true, | |
"_uuid": "c8aeabd71f53bc86e9a77c6dd41f740345fd9c2f" | |
}, | |
"cell_type": "code", | |
"source": "df_sub.head()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"_uuid": "97f745aa275cbefec27c1b95f583a862a3605728" | |
}, | |
"cell_type": "code", | |
"source": "submission = pd.DataFrame({'process_id':pids, 'final_rinse_total_turbidity_liter':values})", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"_uuid": "80751dda08b3bc7067b0558ca6228797a48c5ab1" | |
}, | |
"cell_type": "code", | |
"source": "submission.to_csv('submission.csv', index=False)", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"_uuid": "eb4141dd8cd1c831fdbceb44c827f449fc382f52" | |
}, | |
"cell_type": "code", | |
"source": "testx_meta[testx_meta['expected_y'] == True]", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"_uuid": "48e24ff65ff6c880bfe814a344995115613fb50e" | |
}, | |
"cell_type": "code", | |
"source": "gc.collect()", | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"metadata": { | |
"trusted": true, | |
"_uuid": "00c4a974858c7c26594e2bedcd965e3850273b50" | |
}, | |
"cell_type": "code", | |
"source": "", | |
"execution_count": null, | |
"outputs": [] | |
} | |
], | |
"metadata": { | |
"colab": { | |
"name": "Sustainable Industry: Rinse Over Run #1.ipynb", | |
"version": "0.3.2", | |
"provenance": [], | |
"collapsed_sections": [ | |
"N_uO6NYwUu6z", | |
"cqKvAzDMV8Xl" | |
] | |
}, | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"name": "python", | |
"version": "3.6.6", | |
"mimetype": "text/x-python", | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"pygments_lexer": "ipython3", | |
"nbconvert_exporter": "python", | |
"file_extension": ".py" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment