Created
April 9, 2019 17:12
-
-
Save Hsankesara/a5ba2e47dfa1452ab30ad2f51c5ac441 to your computer and use it in GitHub Desktop.
Sustainable Industry: Rinse Over Run competition. Ranked 58th out of 1200+ participants. Link: https://www.drivendata.org/competitions/56/predict-cleaning-time-series/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sustainable Industry: Rinse Over Run competition. Ranked 58th out of 1200+ participants. Link: https://www.drivendata.org/competitions/56/predict-cleaning-time-series/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"cells":[{"metadata":{"id":"y9xOU8iunHZf","colab_type":"code","outputId":"a0efe14b-662c-43d3-d01d-687bffca8b78","colab":{"base_uri":"https://localhost:8080/","height":34},"trusted":true,"_uuid":"7c4b190b89cc19cc9b72bc6da574ca7038f698e0"},"cell_type":"code","source":"import pandas as pd\nimport numpy as np\nfrom matplotlib import pyplot as plt\nimport seaborn as sns\nfrom tqdm import tqdm\nimport gc\ntqdm.pandas()\ngc.collect()","execution_count":null,"outputs":[]},{"metadata":{"id":"N_uO6NYwUu6z","colab_type":"text","_uuid":"9b9ce552e0ea7af5f5b1c9f0584052ccaaec14c5"},"cell_type":"markdown","source":"## Getting Data"},{"metadata":{"id":"uAudXj1GUx77","colab_type":"code","outputId":"62f59daf-daa7-4a53-eb91-2a4a2aee6f1c","colab":{"base_uri":"https://localhost:8080/","height":765},"trusted":true,"_uuid":"0b160071bfa0c0f389d55a880b3bfe7d5b28fa1e"},"cell_type":"code","source":"!wget https://s3.amazonaws.com/drivendata/data/56/public/train_values.zip\n!wget https://s3.amazonaws.com/drivendata/data/56/public/test_values.zip\n!wget https://s3.amazonaws.com/drivendata/data/56/public/submission_format.csv\n!wget https://s3.amazonaws.com/drivendata/data/56/public/train_labels.csv\n!wget https://s3.amazonaws.com/drivendata/data/56/public/recipe_metadata.csv","execution_count":null,"outputs":[]},{"metadata":{"id":"pGmeEpy0U3mf","colab_type":"code","colab":{},"trusted":true,"_uuid":"f62bafea54aa872ef5bb99aa392156caec53bcc7"},"cell_type":"code","source":"!mkdir data","execution_count":null,"outputs":[]},{"metadata":{"id":"oXgGSzb5U9w-","colab_type":"code","outputId":"a5001a1a-eb2f-4ddf-e187-24406b57517b","colab":{"base_uri":"https://localhost:8080/","height":85},"trusted":true,"_uuid":"844297ef915e99a29900616f2562ecdbbb7e6e10"},"cell_type":"code","source":"!unzip train_values.zip -d data/\n!unzip test_values.zip -d data/\n!mv train_labels.csv data/\n!mv submission_format.csv data/\n!mv recipe_metadata.csv 
data/","execution_count":null,"outputs":[]},{"metadata":{"id":"Envm7L7YVOye","colab_type":"code","colab":{},"trusted":true,"_uuid":"d0c658a30f5f50ef427c93192400381fd877576e"},"cell_type":"code","source":"!rm train_values.zip\n!rm test_values.zip","execution_count":null,"outputs":[]},{"metadata":{"id":"Y17WyGIjTwR8","colab_type":"text","_uuid":"2fac4b79fdd5c8f9e3273af8d6a478b9b2812a17"},"cell_type":"markdown","source":"## Baseline Model"},{"metadata":{"id":"hq_JWxTUVn0V","colab_type":"code","colab":{},"trusted":true,"_uuid":"b7ee93eedf8aee7138d38f9b53454de6e9a0c8c5"},"cell_type":"code","source":"%matplotlib inline\n# mute warnings for this blog post\nimport warnings\nwarnings.filterwarnings(\"ignore\")\n\nfrom pathlib import Path\n\nimport numpy as np\nimport pandas as pd\n\npd.set_option('display.max_columns', 40)\n\nimport matplotlib.pyplot as plt\n\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.ensemble import RandomForestRegressor","execution_count":null,"outputs":[]},{"metadata":{"id":"IE50pchoVyCb","colab_type":"code","colab":{},"trusted":true,"_uuid":"c54437ba69b169cc227d33ca26fad428420af06c"},"cell_type":"code","source":"DATA_DIR = Path('./data/')","execution_count":null,"outputs":[]},{"metadata":{"id":"tuc6cK48nML4","colab_type":"code","colab":{},"trusted":true,"_uuid":"069fdaa8cbbb86c22d978bc40f60ce7b6ec08734"},"cell_type":"code","source":"# for training our model\ntrain_values = pd.read_csv(DATA_DIR / 'train_values.csv',index_col=0,\n parse_dates=['timestamp'])\n\ntrain_labels = pd.read_csv(DATA_DIR / 'train_labels.csv',\n index_col=0)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"a25ced4b376216052872fda08febf90638315d78"},"cell_type":"code","source":"df_recipe = pd.read_csv(DATA_DIR / 
'recipe_metadata.csv')","execution_count":null,"outputs":[]},{"metadata":{"id":"FKtL8Tf9qOPM","colab_type":"code","outputId":"751d4180-2bb3-43cb-d3ee-bbc8094c4c27","colab":{"base_uri":"https://localhost:8080/","height":425},"trusted":true,"_uuid":"c48054b399ea840b44f3791a26e55bf5ee3e83f4"},"cell_type":"code","source":"train_values.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"_C_d67n5WAvK","colab_type":"code","colab":{},"trusted":true,"_uuid":"54f91be26140561da8d6e7fafee599c76a9d5c9c"},"cell_type":"code","source":"# subset to final rinse phase observations \nfinal_phases = train_values[(train_values.target_time_period)]\n\n# let's look at just one process\nfinal_phase = final_phases[final_phases.process_id == 20017]","execution_count":null,"outputs":[]},{"metadata":{"id":"cSENTEr9WJSX","colab_type":"code","colab":{},"trusted":true,"_uuid":"dcada17195dd9711476dd17d3ec243afee3ade98"},"cell_type":"code","source":"# calculate target variable\nfinal_phase = final_phase.assign(target=np.maximum(final_phase.return_flow, 0) * final_phase.return_turbidity)","execution_count":null,"outputs":[]},{"metadata":{"id":"E83ZkTSEW9ME","colab_type":"code","outputId":"923c3862-21d6-478e-c502-b229c9125e35","colab":{"base_uri":"https://localhost:8080/","height":355},"trusted":true,"_uuid":"34f6b3de5acfb9a6cb0d79fdeefd0a3d968e659a"},"cell_type":"code","source":"# plot flow, turbidity, and target \nfig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))\n\nax[0].plot(final_phase.return_flow)\nax[0].set_title('Return flow in final phase')\n\nax[1].plot(final_phase.return_turbidity, c='orange')\nax[1].set_title('Return turbidity in final phase')\n\nax[2].plot(final_phase.target, c='green')\nax[2].set_title('Turbidity in final phase in NTU.L');","execution_count":null,"outputs":[]},{"metadata":{"id":"QOi7FY_rW_T7","colab_type":"code","colab":{},"trusted":true,"_uuid":"d3b56c6ffe5107599977773c6329b54353198c76"},"cell_type":"code","source":"train_values = 
train_values[train_values.phase != 'final_rinse']","execution_count":null,"outputs":[]},{"metadata":{"id":"5HWVT2WYXE8y","colab_type":"code","outputId":"7e7073bd-9917-43f4-b328-66cc3de65442","colab":{"base_uri":"https://localhost:8080/","height":277},"trusted":true,"_uuid":"88b936f21618459ab75d7581b8cec4bcbc7250a2"},"cell_type":"code","source":"train_values.groupby('process_id').phase.nunique().value_counts().sort_index().plot.bar()\nplt.title(\"Number of Processes with $N$ Phases\");","execution_count":null,"outputs":[]},{"metadata":{"id":"u4sWmF5BXH8O","colab_type":"code","colab":{},"trusted":true,"_uuid":"da1b77a096156601d5be0944544587fa26588e76"},"cell_type":"code","source":"# create a unique phase identifier by joining process_id and phase\ntrain_values['process_phase'] = train_values.process_id.astype(str) + '_' + train_values.phase.astype(str)\nprocess_phases = train_values.process_phase.unique()\n\n# randomly select 80% of phases to keep\n# (built-in int is used for the sample size: the np.int alias was removed in NumPy 1.24)\nrng = np.random.RandomState(2019)\nto_keep = rng.choice(\n process_phases,\n size=int(len(process_phases) * 0.8),\n replace=False)\n\ntrain_limited = train_values[train_values.process_phase.isin(to_keep)]\n\n# subset labels to match our training data\ntrain_labels = train_labels.loc[train_limited.process_id.unique()]","execution_count":null,"outputs":[]},{"metadata":{"id":"jDwA-F94XMe_","colab_type":"code","outputId":"54774b09-5f82-4334-e58a-7ad2fc5b3eb0","colab":{"base_uri":"https://localhost:8080/","height":278},"trusted":true,"_uuid":"51d1222bf206e41203bfab36cc1e2c6bd99f8f6e"},"cell_type":"code","source":"train_limited.groupby('process_id').phase.nunique().value_counts().sort_index().plot.bar()\nplt.title(\"Number of Processes with $N$ Phases (Subset for Training)\");","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"bcaf624a29e25b92cc740339331aa0cd6148b3c6"},"cell_type":"code","source":"df_recipe['recipe'] = df_recipe['pre_rinse'].astype(str) + df_recipe['caustic'].astype(str) + 
df_recipe['intermediate_rinse'].astype(str) + df_recipe['acid'].astype(str) + df_recipe['final_rinse'].astype(str)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"ba4e10cabe3b5fc2d01bbbb6adbdfad1ad819aa5"},"cell_type":"code","source":"df_recipe = df_recipe[['process_id', 'recipe']].set_index('process_id')","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"17c2a79dd4af73d3927feeee4b2197e9a376e987"},"cell_type":"code","source":"df_recipe.head()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"1d6077620c195047573b3e958623ed9cf03fe30a"},"cell_type":"code","source":"df_recipe['recipe'] = df_recipe['recipe'].astype('category').cat.codes","execution_count":null,"outputs":[]},{"metadata":{"id":"Znz5-1WjXO0l","colab_type":"code","outputId":"f8321188-695c-449e-9c9e-cad811b1d856","colab":{"base_uri":"https://localhost:8080/","height":286},"trusted":true,"_uuid":"8458a4e35d4cfb5ae577dc5ae27d45b49efe2f84"},"cell_type":"code","source":"def prep_metadata(df):\n    \"\"\"Build one row of process-level metadata per process_id.\n\n    Returns a frame indexed by process_id holding the one-hot encoded\n    pipeline columns, a guaranteed pipeline_L12 column, and num_phases\n    (the number of distinct phases observed for the process).\n    \"\"\"\n    # select process_id and pipeline\n    meta = df[['process_id', 'pipeline']].drop_duplicates().set_index('process_id')\n\n    # convert categorical pipeline data to dummy variables\n    meta = pd.get_dummies(meta)\n\n    # pipeline L12 is not in the test data: add an all-zero indicator only when\n    # get_dummies did not already create the (prefixed) pipeline_L12 column\n    if 'pipeline_L12' not in meta.columns:\n        meta['pipeline_L12'] = 0\n\n    # calculate number of phases for each process_object\n    meta['num_phases'] = df.groupby('process_id')['phase'].apply(lambda x: x.nunique())\n\n    return meta\n\n# show example for first 5,000 
observations\nprep_metadata(train_limited.head(5000))","execution_count":null,"outputs":[]},{"metadata":{"id":"eC4wM7FBJodj","colab_type":"code","outputId":"76154254-1624-4dba-9a22-348854fdc869","colab":{"base_uri":"https://localhost:8080/","height":258},"trusted":true,"_uuid":"a94d48f6f91a522a0b17529be27a87d1bba9fb89"},"cell_type":"code","source":"train_values.columns","execution_count":null,"outputs":[]},{"metadata":{"id":"aR3kcbRYeFp7","colab_type":"code","colab":{},"trusted":true,"_uuid":"2aac7cbcfdf42dec70aff4629f888ce833a5c5d5"},"cell_type":"code","source":"def feature_engineering(df):\n df.loc[df.supply_flow < 0, 'supply_flow'] = df.loc[df.supply_flow < 0, 'supply_flow'] * -1\n df.loc[df.return_flow < 0, 'return_flow'] = df.loc[df.return_flow < 0, 'return_flow'] * -1\n df.loc[df.supply_pressure < 0, 'supply_pressure'] = df.loc[df.supply_pressure < 0, 'supply_pressure'] * -1\n df.loc[df.return_turbidity< 0, 'return_turbidity'] = 0.0\n ####\n df.return_temperature = np.square(df.return_temperature)\n df.return_turbidity = np.log(df.return_turbidity + 1)\n df['tank_level_diff12'] = df['tank_level_pre_rinse'] - df['tank_level_caustic']\n df['tank_level_diff23'] = df['tank_level_caustic'] - df['tank_level_acid']\n df['tank_level_diff34'] = df['tank_level_acid'] - df['tank_level_clean_water']\n df = df.drop(['tank_level_pre_rinse', 'tank_level_caustic', 'tank_level_acid', 'tank_level_clean_water'], axis=1)\n df['tank_temp_diff12'] = df['tank_temperature_pre_rinse'] - df['tank_temperature_caustic']\n df['tank_temp_diff23'] = df['tank_temperature_caustic'] - df['tank_temperature_acid']\n df = df.drop(['tank_temperature_pre_rinse', 'tank_temperature_caustic', 'tank_temperature_acid'], axis=1)\n df.loc[:, df.dtypes == bool] = df.loc[:, df.dtypes == bool].astype('int')\n return 
df","execution_count":null,"outputs":[]},{"metadata":{"id":"fy17bIjoeYRu","colab_type":"code","colab":{},"trusted":true,"_uuid":"9ae988cd71bf6ca8d898c8b0bcf87ef2d9147a52"},"cell_type":"code","source":"train_limited = feature_engineering(train_limited)","execution_count":null,"outputs":[]},{"metadata":{"id":"IImzXuHGXT8K","colab_type":"code","colab":{},"trusted":true,"_uuid":"61e9ddc590bdbd29393874f9cfced5494ab5b629"},"cell_type":"code","source":"# variables we'll use to create our time series features\nts_cols = [\n 'process_id',\n 'supply_flow',\n 'supply_pressure',\n 'return_temperature',\n 'return_conductivity',\n 'return_turbidity',\n 'return_flow',\n 'tank_concentration_caustic',\n 'tank_concentration_acid',\n 'tank_level_diff12',\n 'tank_level_diff23',\n 'tank_level_diff34',\n 'tank_temp_diff12',\n 'tank_temp_diff23',\n]\n\nbinary_cols = [\n 'process_id',\n 'supply_pump',\n 'supply_pre_rinse', 'supply_caustic', 'return_caustic', 'supply_acid',\n 'return_acid', 'supply_clean_water', 'return_recovery_water',\n 'return_drain', 'object_low_level',\n 'tank_lsh_caustic', \n 'tank_lsh_acid',\n 'tank_lsh_clean_water',\n 'tank_lsh_pre_rinse',\n]","execution_count":null,"outputs":[]},{"metadata":{"id":"X3w2BFK6XXMp","colab_type":"code","outputId":"f480fbeb-cb03-4aac-880c-5db9444fcdd0","colab":{"base_uri":"https://localhost:8080/","height":346},"trusted":true,"_uuid":"5f3b5ea3d871015b77308a883b363076920c45d2"},"cell_type":"code","source":"def prep_time_series_features(df, columns=None):\n \"\"\"Aggregate per-process summary features.\n\n columns: continuous columns to aggregate (must include 'process_id');\n defaults to the module-level ts_cols when omitted.\n The binary_cols signals are always aggregated as well.\n \"\"\"\n # honour an explicit column list, otherwise fall back to ts_cols\n ts_df = df[ts_cols if columns is None else columns].set_index('process_id')\n # create features: min, max, mean, standard deviation, and mean of the last five observations\n ts_features = ts_df.groupby('process_id').agg(['min', 'max', 'mean', 'std', lambda x: x.tail(5).mean()])\n bn_features = df[binary_cols].set_index('process_id')\n bn_features = bn_features.groupby('process_id').agg(['mean', 'std', lambda x: x.tail(5).mean()])\n feature_mat = 
pd.concat([ts_features,bn_features ], axis=1)\n return feature_mat\n\n# show example for first 5,000 observations\nprep_time_series_features(train_limited.head(5000), columns=ts_cols)","execution_count":null,"outputs":[]},{"metadata":{"id":"veSTzIRLXZLv","colab_type":"code","colab":{},"trusted":true,"_uuid":"65405c79715289225eee9df9f2e27e8748b78740"},"cell_type":"code","source":"def create_feature_matrix(df):\n metadata = prep_metadata(df)\n time_series = prep_time_series_features(df)\n \n # join metadata and time series features into a single dataframe\n feature_matrix = pd.concat([metadata, time_series], axis=1)\n \n return feature_matrix","execution_count":null,"outputs":[]},{"metadata":{"id":"wn-5TjyvXcnl","colab_type":"code","colab":{},"trusted":true,"_uuid":"b3a088d14b67f5b581655af036469c429ac1f076"},"cell_type":"code","source":"%%time\ntrain_features = create_feature_matrix(train_limited)","execution_count":null,"outputs":[]},{"metadata":{"id":"2VmDoffLXe3H","colab_type":"code","outputId":"958a7241-fa43-43e7-f02b-af3c145136ab","colab":{"base_uri":"https://localhost:8080/","height":301},"trusted":true,"_uuid":"d76d2c2d9cc2ebd493d2b9f8d397e933133fd6af"},"cell_type":"code","source":"train_features.head()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"b5f95eef462c2cd05e380bac39ac6e9b259178f8"},"cell_type":"code","source":"train_features = train_features.join(df_recipe)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"5a895f598ba91a5f0b538d6d7dfad21423a84239"},"cell_type":"code","source":"train_features.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"tUpLybrMdWW3","colab_type":"code","colab":{},"trusted":true,"_uuid":"e7f6c58f7d745e701461d52e2891cf551b1a6947"},"cell_type":"code","source":"train_label_transformed = np.log(train_labels + 
1)","execution_count":null,"outputs":[]},{"metadata":{"id":"0-tBroG3ZBR1","colab_type":"code","colab":{},"trusted":true,"_uuid":"a1f21c70db7cc0993c7cf60940527564afc46de6"},"cell_type":"code","source":"def mean_absolute_percentage_error(y_true, y_pred): \n #y_true, y_pred = check_array(y_true, y_pred)\n return np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 290000)))","execution_count":null,"outputs":[]},{"metadata":{"id":"xdB2ERxWDIAz","colab_type":"text","_uuid":"a47778d646f199326a0a09de4aefc0b810ccab8b"},"cell_type":"markdown","source":"### lgbm"},{"metadata":{"id":"ynI8h0JidrLj","colab_type":"code","colab":{},"trusted":true,"_uuid":"623fe356a3355faf397599bd1f74b27d6bb32db8"},"cell_type":"code","source":"import lightgbm as lgb\n","execution_count":null,"outputs":[]},{"metadata":{"id":"O6azR6OpHlKZ","colab_type":"code","colab":{},"trusted":true,"_uuid":"288be59fbaf4cbb1a3d8b423b83c5d2fafe9f235"},"cell_type":"code","source":"from sklearn.model_selection import train_test_split","execution_count":null,"outputs":[]},{"metadata":{"id":"OES_VmjlHt-R","colab_type":"code","colab":{},"trusted":true,"_uuid":"9f7425fa8fc4301ea8b6e484823f897d149fcb29"},"cell_type":"code","source":"trainx, valx, trainy, valy, trainy_real, valy_real = train_test_split(train_features, train_label_transformed, train_labels, test_size=0.2, random_state=42)","execution_count":null,"outputs":[]},{"metadata":{"id":"TRztwzKVH6Rd","colab_type":"code","outputId":"ebf03e69-80d7-42a7-e4a0-cf0e395cc1b0","colab":{"base_uri":"https://localhost:8080/","height":34},"trusted":true,"_uuid":"b985fbbe43c34a8dacab8bd4227b63c0f35f22f1"},"cell_type":"code","source":"trainx.shape, valx.shape, trainy.shape, valy.shape, trainy_real.shape, valy_real.shape","execution_count":null,"outputs":[]},{"metadata":{"id":"3cRQ6CcneYSi","colab_type":"code","colab":{},"trusted":true,"_uuid":"c5bfba98dd88b451981eb8eeda2032fc52d7c9da"},"cell_type":"code","source":"d_train = lgb.Dataset(trainx, label=trainy)\nd_valid = 
lgb.Dataset(valx, label=valy)","execution_count":null,"outputs":[]},{"metadata":{"id":"R3uyNjfGIZ3Z","colab_type":"code","colab":{},"trusted":true,"_uuid":"93e9d19996f1b827b10ab461b58e4d8c71020f5a"},"cell_type":"code","source":"watchlist = [d_train, d_valid]","execution_count":null,"outputs":[]},{"metadata":{"id":"CnI28Ka_Iamm","colab_type":"code","colab":{},"trusted":true,"_uuid":"f35004d48c71f47c6c9943803bd9f50bc1d481c9"},"cell_type":"code","source":"params = {'application': 'regression',\n 'boosting': 'gbdt',\n 'metric': 'mape',\n 'num_leaves': 125,\n 'max_depth': 16,\n 'learning_rate': 0.01,\n 'bagging_fraction': 0.9,\n 'feature_fraction': 0.8,\n 'min_split_gain': 0.01,\n 'min_child_samples': 150,\n 'min_child_weight': 0.1,\n 'verbosity': -1,\n 'data_random_seed': 3,\n 'early_stop': 150,\n 'verbose_eval': 100,\n 'num_rounds': 10000,\n 'seed': 42}\n","execution_count":null,"outputs":[]},{"metadata":{"id":"HH9meg0OXhWz","colab_type":"code","outputId":"8943c1f7-a641-44be-efa7-701e0565af5a","colab":{"base_uri":"https://localhost:8080/","height":306},"trusted":true,"_uuid":"956d4d054a409fc6d5925015b4826fd79ab0c57e"},"cell_type":"code","source":"%%time\nnum_rounds = params.pop('num_rounds')\nverbose_eval = params.pop('verbose_eval')\nearly_stop = params.pop('early_stop')\nmodel = lgb.train(params,train_set=d_train,num_boost_round=num_rounds,valid_sets=watchlist,verbose_eval=verbose_eval,early_stopping_rounds=early_stop)","execution_count":null,"outputs":[]},{"metadata":{"id":"qx1fdRc4ZRMG","colab_type":"code","outputId":"94be4987-6836-4b5e-d5fd-4c2006e08cac","colab":{"base_uri":"https://localhost:8080/","height":68},"trusted":true,"_uuid":"a2c76025b395d6df7cff6cd3b7a1079256277087"},"cell_type":"code","source":"print(mean_absolute_percentage_error(np.ravel(train_labels), np.exp(model.predict(train_features)) - 1))\nprint(mean_absolute_percentage_error(np.ravel(trainy_real), np.exp(model.predict(trainx)) - 1))\nprint(mean_absolute_percentage_error(np.ravel(valy_real), 
np.exp(model.predict(valx)) - 1))","execution_count":null,"outputs":[]},{"metadata":{"id":"w2TPftcmDMOP","colab_type":"text","_uuid":"8f1b372311f9591534adde8ce56ff8174fb033b3"},"cell_type":"markdown","source":"### Xgboost"},{"metadata":{"id":"iEw-BDOdDRJm","colab_type":"code","colab":{},"trusted":true,"_uuid":"684d8dceed008e1d8771f13e78e7eff1d5fec9d6"},"cell_type":"code","source":"from xgboost import XGBRegressor\nimport re","execution_count":null,"outputs":[]},{"metadata":{"id":"elriCzA-D8sp","colab_type":"code","colab":{},"trusted":true,"_uuid":"13e92dd62e8a9c736a15738ae6daaf2b48288823"},"cell_type":"code","source":"regex = re.compile(r\"\\[|\\]|<\", re.IGNORECASE)","execution_count":null,"outputs":[]},{"metadata":{"id":"8roKFL5oEKvx","colab_type":"code","colab":{},"trusted":true,"_uuid":"e7d13ddaf4fa7bd841e4a5aa4529b16c4bf97d3e"},"cell_type":"code","source":" trainx.columns = [str(x) for x in trainx.columns]","execution_count":null,"outputs":[]},{"metadata":{"id":"Ymj-SAYdE7QY","colab_type":"code","colab":{},"trusted":true,"_uuid":"75dfcb090f7eeb21481293ab52ba7f19ed701955"},"cell_type":"code","source":" valx.columns = [str(x) for x in valx.columns]","execution_count":null,"outputs":[]},{"metadata":{"id":"SRTmYbx3EA_m","colab_type":"code","colab":{},"trusted":true,"_uuid":"1f8cbe0d2cf7f39766cdf1f11722c61e3e6e88e5"},"cell_type":"code","source":"trainx.columns = [regex.sub(\"_\", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in trainx.columns.values]","execution_count":null,"outputs":[]},{"metadata":{"id":"GAzp5FiVFJxW","colab_type":"code","colab":{},"trusted":true,"_uuid":"ee244962185fa3b293e6ee015147a1600dfbeffe"},"cell_type":"code","source":"valx.columns = [regex.sub(\"_\", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in 
valx.columns.values]","execution_count":null,"outputs":[]},{"metadata":{"id":"6VTZ91I4DRGa","colab_type":"code","colab":{},"trusted":true,"_uuid":"99cbd7fe0e920b7629fc27d1cd2fd8c7610f5c1b"},"cell_type":"code","source":"xgb_model = XGBRegressor(colsample_bytree=0.4,\n gamma=0, \n learning_rate=0.07,\n max_depth=3,\n min_child_weight=1.5,\n n_estimators=10000, \n reg_alpha=0.75,\n reg_lambda=0.45,\n subsample=0.6,\n seed=42)","execution_count":null,"outputs":[]},{"metadata":{"id":"QjQRVOJeDRDL","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":173},"outputId":"b3c80ccc-84b5-4da7-8aba-34b382ee49ee","trusted":true,"_uuid":"10d247402d8c8b3ef092cf71697f61812f628e9d"},"cell_type":"code","source":"%%time\nxgb_model.fit(trainx, trainy)","execution_count":null,"outputs":[]},{"metadata":{"id":"aVCJHQ9sErh3","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":51},"outputId":"b147ed63-97e5-4c65-c8f2-dc43a8e6c732","trusted":true,"_uuid":"d7fe2e62c4d10c0f291d40bb533799998550e571"},"cell_type":"code","source":"print(mean_absolute_percentage_error(np.ravel(trainy_real), np.exp(xgb_model.predict(trainx)) - 1))\nprint(mean_absolute_percentage_error(np.ravel(valy_real), np.exp(xgb_model.predict(valx)) - 1))","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"935ea991d58ce606651c0eea949f20e7bfe06e15"},"cell_type":"markdown","source":"### Random Forest"},{"metadata":{"trusted":true,"_uuid":"1baead23cec097a95815c3a9400606288e24251d"},"cell_type":"code","source":"from sklearn.ensemble import RandomForestRegressor","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"9c46756c15ee56b3dcf83065e5c42d5b0905aa74"},"cell_type":"code","source":"rf = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,\n max_features='sqrt', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=2, min_samples_split=5,\n min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,\n 
oob_score=False, random_state=None, verbose=0, warm_start=False)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"5ef177626c59652baec15d841a1fe17a67bd6dec"},"cell_type":"code","source":"%%time\nrf.fit(trainx, trainy)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"76a474e86e07de83e3b37ad77e267454bb44fb19"},"cell_type":"code","source":"print(mean_absolute_percentage_error(np.ravel(trainy_real), np.exp(rf.predict(trainx)) - 1))\nprint(mean_absolute_percentage_error(np.ravel(valy_real), np.exp(rf.predict(valx)) - 1))","execution_count":null,"outputs":[]},{"metadata":{"id":"jUryogA1FeEm","colab_type":"text","_uuid":"ed01d0112b4c3b64a9097fae02f3cfd831fc5ee1"},"cell_type":"markdown","source":"### Combine models"},{"metadata":{"id":"10es8H4yDjuz","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":51},"outputId":"4d4a26ee-5758-48d9-f2f9-31893bb85e15","trusted":true,"_uuid":"977fbfc0e8bea142d22c2e9738051a3bc26e7443"},"cell_type":"code","source":"print(mean_absolute_percentage_error(np.ravel(trainy_real), (0.1 * np.exp(xgb_model.predict(trainx)) + 0.9 * np.exp(model.predict(trainx)) - 1)))\nprint(mean_absolute_percentage_error(np.ravel(valy_real), (0.1 * np.exp(xgb_model.predict(valx)) + 0.9 * np.exp(model.predict(valx)) - 1)))","execution_count":null,"outputs":[]},{"metadata":{"id":"60CpGzDgHaX_","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":51},"outputId":"3c841c20-6c81-4907-d089-43f3e6d2ac2b","trusted":true,"_uuid":"612e8906f2661eb1b07787174d05049028e59e75"},"cell_type":"code","source":"# element-wise minimum over all three models; np.minimum only takes two operands\n# (its third positional argument is the `out` buffer), so reduce over a list instead\nprint(mean_absolute_percentage_error(np.ravel(valy_real), np.exp(np.minimum.reduce([model.predict(valx), xgb_model.predict(valx), rf.predict(valx)])) - 1))\nprint(mean_absolute_percentage_error(np.ravel(valy_real), np.exp(np.maximum(model.predict(valx), xgb_model.predict(valx))) - 
1))","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"1210373b3b91210818001bfd5f025c6af9f3e366"},"cell_type":"code","source":"print(mean_absolute_percentage_error(np.ravel(valy_real), np.exp(np.minimum(model.predict(valx), rf.predict(valx))) - 1))\nprint(mean_absolute_percentage_error(np.ravel(valy_real), np.exp(np.minimum(xgb_model.predict(valx), rf.predict(valx))) - 1))","execution_count":null,"outputs":[]},{"metadata":{"id":"acXZnRPADSDG","colab_type":"text","_uuid":"bbbccd1826cff67b278eaca574f407ba6532ab1d"},"cell_type":"markdown","source":"### Submission time"},{"metadata":{"id":"rZW42qmkXkvG","colab_type":"code","colab":{},"trusted":true,"_uuid":"8389596ca98c9819865a27b66cb31f07b1495c87"},"cell_type":"code","source":"# load the test data\ntest_values = pd.read_csv(DATA_DIR / 'test_values.csv',\n index_col=0,\n parse_dates=['timestamp'])","execution_count":null,"outputs":[]},{"metadata":{"id":"phtULbxXe3Cr","colab_type":"code","colab":{},"trusted":true,"_uuid":"5c71e4847d64cf73e260252d7262769ae30e8e83"},"cell_type":"code","source":"test_values = feature_engineering(test_values)","execution_count":null,"outputs":[]},{"metadata":{"id":"HkbOJRQqXmtL","colab_type":"code","colab":{},"trusted":true,"_uuid":"21ac27f5ce196eb4e0c988021d4dd900c41bff19"},"cell_type":"code","source":"# create metadata and time series features\ntest_features = create_feature_matrix(test_values)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"9a8b3eac7b531f062e044957275d38f9754d4e7d"},"cell_type":"code","source":"test_features = 
test_features.join(df_recipe)","execution_count":null,"outputs":[]},{"metadata":{"id":"MI_Ky8ydXoyh","colab_type":"code","outputId":"fe1d479f-5e74-4926-b8ed-01277b8a8943","colab":{"base_uri":"https://localhost:8080/","height":301},"trusted":true,"_uuid":"645622f5d2db907cb0ce6b4f8ac819d6babf5cc1"},"cell_type":"code","source":"test_features.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"siX_iqCmXqHd","colab_type":"code","colab":{},"trusted":true,"_uuid":"a4b0a36601168f2f5cd6637dbc75fc24332a6f94"},"cell_type":"code","source":"#pred1 = model.predict(test_features)","execution_count":null,"outputs":[]},{"metadata":{"colab_type":"code","id":"oXjYWVw2IYUG","colab":{},"trusted":true,"_uuid":"e2288e137c6152a56992aa9e510df5f33e8c7f5f"},"cell_type":"code","source":" test_features.columns = [str(x) for x in test_features.columns]","execution_count":null,"outputs":[]},{"metadata":{"colab_type":"code","id":"jELwuqdlIYUK","colab":{},"trusted":true,"_uuid":"a157b08fdf7a2b9c404cfe181571e81c65c6b443"},"cell_type":"code","source":"test_features.columns = [regex.sub(\"_\", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in test_features.columns.values]","execution_count":null,"outputs":[]},{"metadata":{"id":"BReu2pWFJPv0","colab_type":"code","colab":{},"trusted":true,"_uuid":"75f63a8cf61a548d506733aeaaa6fb9e54e8dbb5"},"cell_type":"code","source":"test_features = test_features[trainx.columns]","execution_count":null,"outputs":[]},{"metadata":{"id":"-sWm7nFiIirp","colab_type":"code","colab":{},"trusted":true,"_uuid":"1e440d0f1954541e590a4945cd9dfec700505579"},"cell_type":"code","source":"pred2 = xgb_model.predict(test_features)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"d5c89c7eff30708272cbfc45ff00a97c68619fc6"},"cell_type":"code","source":"pred3 = 
rf.predict(test_features)","execution_count":null,"outputs":[]},{"metadata":{"id":"9PMlLGAEItq3","colab_type":"code","colab":{},"trusted":true,"_uuid":"93ac19c5aeff07d17fd8be78b138cdfccda1c37a"},"cell_type":"code","source":"preds = np.exp(np.minimum(pred2, pred3)) - 1","execution_count":null,"outputs":[]},{"metadata":{"id":"WadT7kvCXsxv","colab_type":"code","colab":{},"trusted":true,"_uuid":"579742258871923af573fb42d2890eec2e1525f0"},"cell_type":"code","source":"submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv', index_col=0)","execution_count":null,"outputs":[]},{"metadata":{"id":"t7DNfDKJXuzt","colab_type":"code","colab":{},"trusted":true,"_uuid":"71e5476430d1f0089d5f549c92c0ff343e94627b"},"cell_type":"code","source":"# confirm everything is in the right order\nassert np.all(test_features.index == submission_format.index)","execution_count":null,"outputs":[]},{"metadata":{"id":"R52Mqb7bXxJr","colab_type":"code","colab":{},"trusted":true,"_uuid":"c7b2392829d8450bf02ee8abbfb39f9194a25aad"},"cell_type":"code","source":"my_submission = pd.DataFrame(data=preds,\n columns=submission_format.columns,\n index=submission_format.index)","execution_count":null,"outputs":[]},{"metadata":{"id":"GFM6mmeAXzaw","colab_type":"code","colab":{},"trusted":true,"_uuid":"22f308df1c4aabf7bf94ea062b911c0e8d955350"},"cell_type":"code","source":"my_submission.to_csv('submission.csv')","execution_count":null,"outputs":[]},{"metadata":{"id":"BvceqrBlX1G8","colab_type":"code","outputId":"d26912b0-ddbd-4566-fd5a-498d670bf296","colab":{"base_uri":"https://localhost:8080/","height":187},"trusted":true,"_uuid":"1b839bf0206c1d700fd47cc729958243dfd8ad8b"},"cell_type":"code","source":"!head 
submission.csv","execution_count":null,"outputs":[]},{"metadata":{"id":"QUprjvrqX262","colab_type":"code","colab":{},"trusted":true,"_uuid":"743da0b5a0b50a5d41b2b7284caf6c1c340ac064"},"cell_type":"code","source":"","execution_count":null,"outputs":[]},{"metadata":{"id":"pC-tlb2S2wEn","colab_type":"code","outputId":"da54c0cd-29b4-407f-c489-209bad9c669b","colab":{"base_uri":"https://localhost:8080/","height":1969},"trusted":true,"_uuid":"167ba0f484ae1d058aedc9639bb1174b06bb9ddb"},"cell_type":"code","source":"","execution_count":null,"outputs":[]},{"metadata":{"id":"4YguCqE83Iyu","colab_type":"code","colab":{},"trusted":true,"_uuid":"62c777e7470616e646d0b826e34fdcb750fc3f1c"},"cell_type":"code","source":"","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"bb04e656f04858c19aa799f0795a117308be39de"},"cell_type":"code","source":"","execution_count":null,"outputs":[]}],"metadata":{"colab":{"name":"Sustainable_Industry_Rinse_Over_Run_0.ipynb","version":"0.3.2","provenance":[],"collapsed_sections":["N_uO6NYwUu6z"]},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.6.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":1} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment