Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Hsankesara/a5ba2e47dfa1452ab30ad2f51c5ac441 to your computer and use it in GitHub Desktop.
Save Hsankesara/a5ba2e47dfa1452ab30ad2f51c5ac441 to your computer and use it in GitHub Desktop.
Sustainable Industry: Rinse Over Run competition. Ranked 58th out of 1,200+ participants. Link: https://www.drivendata.org/competitions/56/predict-cleaning-time-series/
Sustainable Industry: Rinse Over Run competition. Ranked 58th out of 1,200+ participants. Link: https://www.drivendata.org/competitions/56/predict-cleaning-time-series/
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
# %%
# One import cell for the data-loading / baseline part of the notebook, so the
# dependencies are visible up front and the notebook survives
# "Restart Kernel -> Run All".  (The original repeated the numpy / pandas /
# matplotlib imports in two separate cells.)
import gc
import urllib.request
import warnings
import zipfile
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# `%matplotlib inline` used to sit here.  It is omitted so that this cell is
# plain Python; re-add it if your front end does not show figures inline.

# mute warnings for this blog post
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 40)
tqdm.pandas()
gc.collect()

# %% [markdown]
# ## Getting Data

# %%
DATA_BASE_URL = 'https://s3.amazonaws.com/drivendata/data/56/public/'
DATA_DIR = Path('./data/')

# Files that are downloaded as-is, and archives that are unpacked into DATA_DIR.
PLAIN_FILES = ('submission_format.csv', 'train_labels.csv')
ARCHIVES = ('train_values.zip', 'test_values.zip')


def fetch(name, destination):
    """Download ``DATA_BASE_URL + name`` to ``destination`` (a ``Path``).

    The payload is written to a sibling ``*.part`` file first and renamed only
    after the transfer finished, so an interrupted download never leaves a
    truncated file that a later run would mistake for a complete one.
    """
    partial = destination.with_name(destination.name + '.part')
    urllib.request.urlretrieve(DATA_BASE_URL + name, str(partial))
    partial.replace(destination)


# The shell version (`!mkdir data`, `!wget ...`, `!unzip ...`, `!mv ...`,
# `!rm ...`) failed or re-downloaded everything when the cell was run a second
# time and only worked where those tools exist.  This version produces the same
# files under DATA_DIR and is a no-op when they are already present.
DATA_DIR.mkdir(parents=True, exist_ok=True)

for file_name in PLAIN_FILES:
    if not (DATA_DIR / file_name).exists():
        fetch(file_name, DATA_DIR / file_name)

for archive_name in ARCHIVES:
    # Assumes `<name>.zip` unpacks to `<name>.csv`; that is the file name the
    # loading code reads from DATA_DIR afterwards.
    extracted_csv = DATA_DIR / archive_name.replace('.zip', '.csv')
    if extracted_csv.exists():
        continue
    archive_path = Path(archive_name)
    fetch(archive_name, archive_path)
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(DATA_DIR)
    # the archives are not needed once unpacked
    archive_path.unlink()

# %% [markdown]
# ## Baseline Model

# %%
# for training our model
train_values = pd.read_csv(DATA_DIR / 'train_values.csv',
                           index_col=0,
                           parse_dates=['timestamp'])

train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv',
                           index_col=0)
# %%
train_values.head()

# %%
# Process whose final rinse is plotted below as an illustration.
EXAMPLE_PROCESS_ID = 20017

# subset to final rinse phase observations
final_phases = train_values[train_values.target_time_period]

# let's look at just one process
final_phase = final_phases[final_phases.process_id == EXAMPLE_PROCESS_ID]

# %%
# calculate target variable
final_phase = final_phase.assign(
    target=np.maximum(final_phase.return_flow, 0) * final_phase.return_turbidity)

# %%
# plot flow, turbidity, and target
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))

ax[0].plot(final_phase.return_flow)
ax[0].set_title('Return flow in final phase')

ax[1].plot(final_phase.return_turbidity, c='orange')
ax[1].set_title('Return turbidity in final phase')

ax[2].plot(final_phase.target, c='green')
ax[2].set_title('Turbidity in final phase in NTU.L');

# %%
# Keep only the observations recorded outside the final rinse phase.
# `.copy()` gives an independent frame, so the `process_phase` column added
# further down is written to this frame and not to a view of the raw data
# (the original relied on chained assignment with warnings muted).
train_values = train_values[train_values.phase != 'final_rinse'].copy()

# %%
train_values.groupby('process_id').phase.nunique().value_counts().sort_index().plot.bar()
plt.title("Number of Processes with $N$ Phases");

# %%
SUBSET_SEED = 2019        # seed of the phase sub-sampling below
KEEP_FRACTION = 0.8       # share of (process, phase) pairs kept for training

# create a unique phase identifier by joining process_id and phase
train_values['process_phase'] = (
    train_values.process_id.astype(str) + '_' + train_values.phase.astype(str))
process_phases = train_values.process_phase.unique()

# randomly select 80% of phases to keep
# (`np.int` was removed in NumPy 1.24; the builtin `int` truncates identically)
rng = np.random.RandomState(SUBSET_SEED)
to_keep = rng.choice(
    process_phases,
    size=int(len(process_phases) * KEEP_FRACTION),
    replace=False)

# independent copy: later cells add and transform columns on this frame
train_limited = train_values[train_values.process_phase.isin(to_keep)].copy()

# subset labels to match our training data
train_labels = train_labels.loc[train_limited.process_id.unique()]

# %%
train_limited.groupby('process_id').phase.nunique().value_counts().sort_index().plot.bar()
plt.title("Number of Processes with $N$ Phases (Subset for Training)");
# %%
def prep_metadata(df):
    """Build one row of process-level metadata per ``process_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Observation-level data with at least the columns ``process_id``,
        ``pipeline`` and ``phase``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``process_id``; one ``pipeline_<name>`` dummy column per
        pipeline value present in ``df``, a ``pipeline_L12`` column in every
        case, and ``num_phases`` (number of distinct phases of the process).
    """
    # select process_id and pipeline
    meta = df[['process_id', 'pipeline']].drop_duplicates().set_index('process_id')

    # convert categorical pipeline data to dummy variables
    meta = pd.get_dummies(meta)

    # pipeline L12 not in test data
    # The dummy columns are named `pipeline_<value>`, so the check has to look
    # for 'pipeline_L12'.  The original tested for 'L12', which is never a
    # column name, and therefore overwrote the real L12 indicator with zeros
    # even when L12 processes were present.
    if 'pipeline_L12' not in meta.columns:
        meta['pipeline_L12'] = 0

    # calculate number of phases for each process_object
    meta['num_phases'] = df.groupby('process_id')['phase'].nunique()

    return meta


# %%
# show example for first 5,000 observations
prep_metadata(train_limited.head(5000))

# %%
train_values.columns

# %%
def feature_engineering(df):
    """Return a transformed copy of ``df``; the input frame is left untouched.

    Transformations, in order:

    * ``return_temperature`` is squared and ``return_turbidity`` becomes
      ``log(1 + return_turbidity)``;
    * the four ``tank_level_*`` columns are replaced by the differences of
      neighbouring tanks (``tank_level_diff12/23/34``);
    * the three ``tank_temperature_*`` columns are replaced by
      ``tank_temp_diff12/23``;
    * every boolean column is cast to integer 0/1.

    The original version assigned into ``df`` before dropping columns, which
    modified the caller's frame (a second call squared the temperature again),
    and converted the booleans with ``df.loc[:, mask] = ...``, which may leave
    the dtype unchanged on recent pandas.  ``assign`` and ``astype`` avoid both.
    """
    engineered = df.assign(
        return_temperature=np.square(df['return_temperature']),
        return_turbidity=np.log1p(df['return_turbidity']),
        tank_level_diff12=df['tank_level_pre_rinse'] - df['tank_level_caustic'],
        tank_level_diff23=df['tank_level_caustic'] - df['tank_level_acid'],
        tank_level_diff34=df['tank_level_acid'] - df['tank_level_clean_water'],
        tank_temp_diff12=df['tank_temperature_pre_rinse'] - df['tank_temperature_caustic'],
        tank_temp_diff23=df['tank_temperature_caustic'] - df['tank_temperature_acid'],
    )
    engineered = engineered.drop(
        columns=['tank_level_pre_rinse', 'tank_level_caustic',
                 'tank_level_acid', 'tank_level_clean_water',
                 'tank_temperature_pre_rinse', 'tank_temperature_caustic',
                 'tank_temperature_acid'])

    bool_columns = engineered.columns[engineered.dtypes == bool]
    return engineered.astype({column: int for column in bool_columns})
# %%
# Guarded so the cell can be re-run: once the engineered columns exist the raw
# tank columns are gone and a second pass would raise a KeyError.  A copy is
# passed so the frame `train_limited` was sliced from is never modified.
if 'tank_level_diff12' not in train_limited.columns:
    train_limited = feature_engineering(train_limited.copy())

# %%
# variables we'll use to create our time series features
ts_cols = [
    'process_id',
    'supply_flow',
    'supply_pressure',
    'return_temperature',
    'return_conductivity',
    'return_turbidity',
    'return_flow',
    'tank_concentration_caustic',
    'tank_concentration_acid',
    'tank_level_diff12',
    'tank_level_diff23',
    'tank_level_diff34',
    'tank_temp_diff12',
    'tank_temp_diff23',
]

binary_cols = [
    'process_id',
    'supply_pump',
    'supply_pre_rinse', 'supply_caustic', 'return_caustic', 'supply_acid',
    'return_acid', 'supply_clean_water', 'return_recovery_water',
    'return_drain', 'object_low_level',
    'tank_lsh_caustic',
    'tank_lsh_acid',
    'tank_lsh_clean_water',
    'tank_lsh_pre_rinse',
]

# %%
def last5_mean(series):
    """Mean of the last five observations of a group.

    A named function is used instead of a lambda so the resulting feature is
    labelled ``last5_mean`` rather than ``<lambda>``.
    """
    return series.tail(5).mean()


def _with_process_id(columns):
    """Return ``columns`` as a list that starts with ``'process_id'`` exactly once."""
    return ['process_id'] + [col for col in columns if col != 'process_id']


def prep_time_series_features(df, columns=None, binary_columns=None):
    """Aggregate the observation-level signals to one feature row per process.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``feature_engineering``; must contain ``process_id`` and the
        requested columns.
    columns : list of str, optional
        Continuous signals to summarise.  Defaults to the module-level
        ``ts_cols``.  (The original accepted this argument but ignored it and
        always used ``ts_cols``; existing calls behave as before.)
    binary_columns : list of str, optional
        0/1 signals to summarise.  Defaults to the module-level ``binary_cols``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``process_id``.  Continuous signals contribute
        ``<col>_min/_max/_mean/_std/_last5_mean``; binary signals contribute
        ``<col>_mean/_std/_last5_mean``.  Labels are flat strings, not the
        ``(column, statistic)`` tuples the original returned.
    """
    if columns is None:
        columns = ts_cols
    if binary_columns is None:
        binary_columns = binary_cols

    ts_df = df[_with_process_id(columns)].set_index('process_id')
    # create features: min, max, mean, standard deviation, and mean of the last five observations
    ts_features = ts_df.groupby('process_id').agg(['min', 'max', 'mean', 'std', last5_mean])

    bn_df = df[_with_process_id(binary_columns)].set_index('process_id')
    bn_features = bn_df.groupby('process_id').agg(['mean', 'std', last5_mean])

    feature_mat = pd.concat([ts_features, bn_features], axis=1)
    # Flatten (column, statistic) into "column_statistic".
    # NOTE(review): recent LightGBM releases reportedly reject feature names
    # containing characters such as ',' or '[' - flat names avoid that; confirm
    # against the installed version.
    feature_mat.columns = ['_'.join(label) for label in feature_mat.columns]
    return feature_mat


# %%
# show example for first 5,000 observations
prep_time_series_features(train_limited.head(5000), columns=ts_cols)
# %%
def create_feature_matrix(df):
    """Combine process metadata and aggregated signal features.

    Calls ``prep_metadata(df)`` and ``prep_time_series_features(df)`` and
    concatenates the two results column-wise; both are indexed by
    ``process_id``, so the result has one row per process.
    """
    metadata = prep_metadata(df)
    time_series = prep_time_series_features(df)

    # join metadata and time series features into a single dataframe
    return pd.concat([metadata, time_series], axis=1)


# %%
train_features = create_feature_matrix(train_limited)

# The model code that follows pairs features and labels by position.  Put the
# labels in exactly the row order of the feature matrix so that this pairing
# does not depend on how the input file happened to be sorted.
train_labels = train_labels.loc[train_features.index]
assert train_labels.index.equals(train_features.index)

# %%
train_features.head()

# %%
# the models are fitted on log(1 + target)
train_label_transformed = np.log1p(train_labels)

# %%
def mean_absolute_percentage_error(y_true, y_pred, threshold=290000):
    """Mean absolute percentage error with a floor on the denominator.

    Computes ``mean(|y_true - y_pred| / max(y_true, threshold))``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values with the same number of elements.  Both are
        flattened first, so a column vector and a flat vector are compared
        element by element and are not broadcast against each other.
    threshold : float, optional
        Smallest denominator used.  290000 is the value that was hard-coded in
        this notebook (presumably the competition's threshold - confirm
        against the competition's metric definition).

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.ravel(np.asarray(y_true, dtype=float))
    y_pred = np.ravel(np.asarray(y_pred, dtype=float))
    if y_true.shape != y_pred.shape:
        raise ValueError(
            'y_true and y_pred differ in length: %d vs %d'
            % (y_true.size, y_pred.size))
    return np.mean(np.abs(y_true - y_pred) / np.maximum(y_true, threshold))
# %% [markdown]
# ### lgbm

# %%
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# %%
# Fixed seed: without it every kernel restart produced a different
# train/validation split, so the scores printed below were not reproducible.
SPLIT_SEED = 42

trainx, valx, trainy, valy, trainy_real, valy_real = train_test_split(
    train_features, train_label_transformed, train_labels,
    test_size=0.2, random_state=SPLIT_SEED)

# %%
trainx.shape, valx.shape, trainy.shape, valy.shape, trainy_real.shape, valy_real.shape

# %%
# Labels are passed as flat vectors; `reference=d_train` ties the validation
# set to the training Dataset.
d_train = lgb.Dataset(trainx, label=np.ravel(trainy))
d_valid = lgb.Dataset(valx, label=np.ravel(valy), reference=d_train)

# %%
watchlist = [d_train, d_valid]

# %%
# Booster parameters only.
# NOTE(review): LightGBM documents that `bagging_fraction` has an effect only
# when `bagging_freq` > 0; no `bagging_freq` is set here, so bagging is
# presumably inactive - confirm before tuning.
params = {'application': 'regression',
          'boosting': 'gbdt',
          'metric': 'mape',
          'num_leaves': 80,
          'max_depth': 11,
          'learning_rate': 0.01,
          'bagging_fraction': 0.9,
          'feature_fraction': 0.8,
          'min_split_gain': 0.01,
          'min_child_samples': 150,
          'min_child_weight': 0.1,
          'verbosity': -1,
          'data_random_seed': 3,
          'seed': 42}

# Training-loop settings.  They used to live inside `params` and were removed
# with `params.pop(...)` in the training cell, which made that cell raise a
# KeyError when it was run a second time.
num_rounds = 10000     # upper bound on boosting rounds
early_stop = 100       # stop after this many rounds without improvement
verbose_eval = 100     # log the metrics every this many rounds

# %%
# The `verbose_eval=` / `early_stopping_rounds=` keywords of `lgb.train` were
# removed in LightGBM 4; callbacks are the supported way to request both.
# `log_evaluation` replaced `print_evaluation`, so use whichever exists.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

model = lgb.train(
    params,
    train_set=d_train,
    num_boost_round=num_rounds,
    valid_sets=watchlist,
    callbacks=[lgb.early_stopping(early_stop), log_evaluation(verbose_eval)],
)

# %%
# Scores on the original target scale (inverse of log(1 + y)) for, in order:
# all training processes, the training split, the validation split.
print(mean_absolute_percentage_error(np.ravel(train_labels), np.expm1(model.predict(train_features))))
print(mean_absolute_percentage_error(np.ravel(trainy_real), np.expm1(model.predict(trainx))))
print(mean_absolute_percentage_error(np.ravel(valy_real), np.expm1(model.predict(valx))))
# %% [markdown]
# ### Xgboost

# %%
import re

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# %%
MODEL_SEED = 42

regex = re.compile(r"\[|\]|<", re.IGNORECASE)


def sanitize_feature_names(columns):
    """Return the labels in ``columns`` as strings with '[', ']' and '<' replaced by '_'.

    Equivalent to the two passes the notebook applied to each frame in turn:
    first ``str()`` on every label, then the regex substitution.
    """
    return [regex.sub("_", str(label)) for label in columns]


trainx.columns = sanitize_feature_names(trainx.columns)
valx.columns = sanitize_feature_names(valx.columns)

# %%
# `random_state` is the scikit-learn style name of the seed argument that the
# original passed as the deprecated `seed=`.
xgb_model = XGBRegressor(colsample_bytree=0.4,
                         gamma=0,
                         learning_rate=0.07,
                         max_depth=3,
                         min_child_weight=1.5,
                         n_estimators=10000,
                         reg_alpha=0.75,
                         reg_lambda=0.45,
                         subsample=0.6,
                         random_state=MODEL_SEED)

# %%
# labels as a flat vector (trainy is a one-column frame)
xgb_model.fit(trainx, np.ravel(trainy))

# %%
print(mean_absolute_percentage_error(np.ravel(trainy_real), np.expm1(xgb_model.predict(trainx))))
print(mean_absolute_percentage_error(np.ravel(valy_real), np.expm1(xgb_model.predict(valx))))

# %% [markdown]
# ### Random Forest

# %%
# Only the non-default settings are spelled out.  The original listed every
# constructor argument, including `criterion='mse'` and
# `min_impurity_split=None`, which current scikit-learn releases no longer
# accept.  It also left the forest unseeded; `random_state` makes it repeatable.
rf = RandomForestRegressor(n_estimators=200,
                           max_depth=30,
                           max_features='sqrt',
                           min_samples_leaf=2,
                           min_samples_split=5,
                           n_jobs=1,
                           random_state=MODEL_SEED)

# %%
rf.fit(trainx, np.ravel(trainy))

# %%
print(mean_absolute_percentage_error(np.ravel(trainy_real), np.expm1(rf.predict(trainx))))
print(mean_absolute_percentage_error(np.ravel(valy_real), np.expm1(rf.predict(valx))))

# %% [markdown]
# ### Combine models

# %%
# Predict once per model and split; all values are on the log(1 + y) scale.
train_log_lgb = model.predict(trainx)
train_log_xgb = xgb_model.predict(trainx)
val_log_lgb = model.predict(valx)
val_log_xgb = xgb_model.predict(valx)
val_log_rf = rf.predict(valx)

y_train_real = np.ravel(trainy_real)
y_val_real = np.ravel(valy_real)

# %%
# 10% XGBoost + 90% LightGBM, blended on the original scale
print(mean_absolute_percentage_error(
    y_train_real, 0.1 * np.exp(train_log_xgb) + 0.9 * np.exp(train_log_lgb) - 1))
print(mean_absolute_percentage_error(
    y_val_real, 0.1 * np.exp(val_log_xgb) + 0.9 * np.exp(val_log_lgb) - 1))

# %%
# Element-wise minimum of all three models.  `np.minimum` takes two inputs and
# its third positional argument is the *output* array, so the original
# `np.minimum(lgb, xgb, rf)` only compared LightGBM with XGBoost and wrote the
# result over the random-forest predictions.  `np.minimum.reduce` folds the
# minimum over any number of arrays.
print(mean_absolute_percentage_error(
    y_val_real, np.expm1(np.minimum.reduce([val_log_lgb, val_log_xgb, val_log_rf]))))
# element-wise maximum of LightGBM and XGBoost
print(mean_absolute_percentage_error(
    y_val_real, np.expm1(np.maximum(val_log_lgb, val_log_xgb))))

# %%
# pairwise minima with the random forest
print(mean_absolute_percentage_error(
    y_val_real, np.expm1(np.minimum(val_log_lgb, val_log_rf))))
print(mean_absolute_percentage_error(
    y_val_real, np.expm1(np.minimum(val_log_xgb, val_log_rf))))
# %% [markdown]
# ### Submission time

# %%
from itertools import islice

# load the test data
test_values = pd.read_csv(DATA_DIR / 'test_values.csv',
                          index_col=0,
                          parse_dates=['timestamp'])

# %%
# A copy goes in and the result gets its own name, so `test_values` stays the
# raw data and this cell gives the same result however often it is run.
test_engineered = feature_engineering(test_values.copy())

# %%
# create metadata and time series features
test_features = create_feature_matrix(test_engineered)

# %%
test_features.head()

# %%
# same label clean-up that was applied to the training features
test_features.columns = [regex.sub("_", str(label)) for label in test_features.columns]

# %%
submission_format = pd.read_csv(DATA_DIR / 'submission_format.csv', index_col=0)

# %%
# Put the rows in the order of the submission file and the columns in the order
# the models were fitted on.  The original only asserted that the row order
# already matched; selecting by label enforces it (and raises a KeyError if a
# process or a feature is missing).
test_features = test_features.loc[submission_format.index, trainx.columns]

# confirm everything is in the right order
assert np.all(test_features.index == submission_format.index)

# %%
pred_log_xgb = xgb_model.predict(test_features)

# %%
pred_log_rf = rf.predict(test_features)

# %%
# final prediction: element-wise minimum of XGBoost and random forest,
# mapped back from log(1 + y) to the original scale
preds = np.expm1(np.minimum(pred_log_xgb, pred_log_rf))

# %%
my_submission = pd.DataFrame(data=preds,
                             columns=submission_format.columns,
                             index=submission_format.index)

# %%
my_submission.to_csv('submission.csv')

# %%
# first ten lines of the written file (what `!head submission.csv` showed)
with open('submission.csv') as submission_file:
    print(''.join(islice(submission_file, 10)), end='')

# Notebook metadata of the original .ipynb: Colab name
# "Sustainable_Industry_Rinse_Over_Run_0.ipynb", kernel "Python 3" (python3),
# nbformat 4.1.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment