Last active
April 2, 2022 15:15
-
-
Save adamnovotnycom/a09294f179d8e483d5411eb5c8c4e00f to your computer and use it in GitHub Desktop.
sklearn_pipe.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"sklearn_pipe.ipynb","provenance":[],"collapsed_sections":["Px8yvYANQkV_"],"toc_visible":true,"mount_file_id":"1dzgBGllszE7-0j9cu-WI_i8HPvlDhAhr","authorship_tag":"ABX9TyMnY1Al295O8NuFyE9rdMde"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","metadata":{"id":"jJShBSUIWBHY"},"source":["# Sklearn pipeline"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"0L2bqfaHOMBn","executionInfo":{"status":"ok","timestamp":1631992634080,"user_tz":240,"elapsed":1089,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"1ed1ced4-9240-4572-e99a-5c8cbc8236d3"},"source":["import datetime\n","import multiprocessing\n","import pandas as pd\n","from sklearn.base import BaseEstimator, TransformerMixin\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.impute import SimpleImputer\n","from sklearn.linear_model import LogisticRegression\n","from sklearn import metrics\n","from sklearn.pipeline import FeatureUnion, Pipeline \n","from sklearn.preprocessing import StandardScaler, OneHotEncoder\n","from sklearn.model_selection import GridSearchCV, TimeSeriesSplit\n","import sys\n","print(sys.version)\n","print(pd.__version__)\n","print(multiprocessing.cpu_count())"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["3.7.12 (default, Sep 10 2021, 00:21:48) \n","[GCC 7.5.0]\n","1.1.5\n","2\n"]}]},{"cell_type":"markdown","metadata":{"id":"Px8yvYANQkV_"},"source":["## Load raw stock price data"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":203},"id":"tX-hbcJ1Ou_5","executionInfo":{"status":"ok","timestamp":1631992634679,"user_tz":240,"elapsed":460,"user":{"displayName":"Adam 
Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"8dcb1e31-ec86-4003-80c6-b7969b773d26"},"source":["df = pd.read_csv(\n"," \"/content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/SPY_yahoo_finance.csv\",\n"," header=0\n",")\n","df.columns = [x.lower().replace(\" \", \"_\") for x in df.columns]\n","df.head(5)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," <th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1993-01-29</td>\n"," <td>43.96875</td>\n"," <td>43.96875</td>\n"," <td>43.75000</td>\n"," <td>43.93750</td>\n"," <td>25.799770</td>\n"," <td>1003200</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>1993-02-01</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>25.983273</td>\n"," <td>480500</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>1993-02-02</td>\n"," <td>44.21875</td>\n"," <td>44.37500</td>\n"," <td>44.12500</td>\n"," <td>44.34375</td>\n"," <td>26.038315</td>\n"," <td>201300</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>1993-02-03</td>\n"," <td>44.40625</td>\n"," <td>44.84375</td>\n"," <td>44.37500</td>\n"," <td>44.81250</td>\n"," <td>26.313566</td>\n"," <td>529400</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>1993-02-04</td>\n"," <td>44.96875</td>\n"," <td>45.09375</td>\n"," <td>44.46875</td>\n"," <td>45.00000</td>\n"," 
<td>26.423655</td>\n"," <td>531500</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" date open high low close adj_close volume\n","0 1993-01-29 43.96875 43.96875 43.75000 43.93750 25.799770 1003200\n","1 1993-02-01 43.96875 44.25000 43.96875 44.25000 25.983273 480500\n","2 1993-02-02 44.21875 44.37500 44.12500 44.34375 26.038315 201300\n","3 1993-02-03 44.40625 44.84375 44.37500 44.81250 26.313566 529400\n","4 1993-02-04 44.96875 45.09375 44.46875 45.00000 26.423655 531500"]},"metadata":{},"execution_count":2}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":388},"id":"u7xYwOCcWZFE","executionInfo":{"status":"ok","timestamp":1631992634682,"user_tz":240,"elapsed":6,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"ade1ef11-952e-44db-ecca-f156361a9ac3"},"source":["df.describe(include=\"all\")"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," <th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>count</th>\n"," <td>7193</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7.193000e+03</td>\n"," </tr>\n"," <tr>\n"," <th>unique</th>\n"," <td>7193</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," 
<td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>top</th>\n"," <td>2000-10-09</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>freq</th>\n"," <td>1</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>mean</th>\n"," <td>NaN</td>\n"," <td>149.569786</td>\n"," <td>150.446365</td>\n"," <td>148.596184</td>\n"," <td>149.573008</td>\n"," <td>124.130425</td>\n"," <td>8.432958e+07</td>\n"," </tr>\n"," <tr>\n"," <th>std</th>\n"," <td>NaN</td>\n"," <td>80.710651</td>\n"," <td>81.049916</td>\n"," <td>80.339373</td>\n"," <td>80.732359</td>\n"," <td>86.543832</td>\n"," <td>9.571367e+07</td>\n"," </tr>\n"," <tr>\n"," <th>min</th>\n"," <td>NaN</td>\n"," <td>43.343750</td>\n"," <td>43.531250</td>\n"," <td>42.812500</td>\n"," <td>43.406250</td>\n"," <td>25.487831</td>\n"," <td>5.200000e+03</td>\n"," </tr>\n"," <tr>\n"," <th>25%</th>\n"," <td>NaN</td>\n"," <td>100.739998</td>\n"," <td>101.593750</td>\n"," <td>99.790001</td>\n"," <td>100.699997</td>\n"," <td>71.142609</td>\n"," <td>8.162800e+06</td>\n"," </tr>\n"," <tr>\n"," <th>50%</th>\n"," <td>NaN</td>\n"," <td>128.125000</td>\n"," <td>128.860001</td>\n"," <td>127.269997</td>\n"," <td>128.187500</td>\n"," <td>93.903046</td>\n"," <td>5.864900e+07</td>\n"," </tr>\n"," <tr>\n"," <th>75%</th>\n"," <td>NaN</td>\n"," <td>190.369995</td>\n"," <td>191.820007</td>\n"," <td>188.789993</td>\n"," <td>190.300003</td>\n"," <td>168.407654</td>\n"," <td>1.195754e+08</td>\n"," </tr>\n"," <tr>\n"," <th>max</th>\n"," <td>NaN</td>\n"," <td>445.589996</td>\n"," <td>447.109985</td>\n"," <td>445.070007</td>\n"," <td>446.970001</td>\n"," <td>446.970001</td>\n"," <td>8.710263e+08</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" date open ... adj_close volume\n","count 7193 7193.000000 ... 7193.000000 7.193000e+03\n","unique 7193 NaN ... 
NaN NaN\n","top 2000-10-09 NaN ... NaN NaN\n","freq 1 NaN ... NaN NaN\n","mean NaN 149.569786 ... 124.130425 8.432958e+07\n","std NaN 80.710651 ... 86.543832 9.571367e+07\n","min NaN 43.343750 ... 25.487831 5.200000e+03\n","25% NaN 100.739998 ... 71.142609 8.162800e+06\n","50% NaN 128.125000 ... 93.903046 5.864900e+07\n","75% NaN 190.369995 ... 168.407654 1.195754e+08\n","max NaN 445.589996 ... 446.970001 8.710263e+08\n","\n","[11 rows x 7 columns]"]},"metadata":{},"execution_count":3}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"x4Bjkda4Xzj2","executionInfo":{"status":"ok","timestamp":1631992634683,"user_tz":240,"elapsed":6,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"f9965006-be46-4772-c355-8674a21ce72e"},"source":["df.dtypes"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["date object\n","open float64\n","high float64\n","low float64\n","close float64\n","adj_close float64\n","volume int64\n","dtype: object"]},"metadata":{},"execution_count":4}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":554},"id":"dd6TIeOsX5hm","executionInfo":{"status":"ok","timestamp":1631992634822,"user_tz":240,"elapsed":144,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"e61c86cd-9581-424a-8094-0d0ab3fec22f"},"source":["df[\"date\"] = pd.to_datetime(df[\"date\"])\n","df = df.sort_values(by=\"date\", ascending=True)\n","df.describe(include=\"all\")"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and 
will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n"," This is separate from the ipykernel package so we can avoid doing imports until\n"]},{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," <th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>count</th>\n"," <td>7193</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7.193000e+03</td>\n"," </tr>\n"," <tr>\n"," <th>unique</th>\n"," <td>7193</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>top</th>\n"," <td>2007-09-10 00:00:00</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>freq</th>\n"," <td>1</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>first</th>\n"," <td>1993-01-29 00:00:00</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>last</th>\n"," <td>2021-08-20 00:00:00</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>mean</th>\n"," <td>NaN</td>\n"," <td>149.569786</td>\n"," 
<td>150.446365</td>\n"," <td>148.596184</td>\n"," <td>149.573008</td>\n"," <td>124.130425</td>\n"," <td>8.432958e+07</td>\n"," </tr>\n"," <tr>\n"," <th>std</th>\n"," <td>NaN</td>\n"," <td>80.710651</td>\n"," <td>81.049916</td>\n"," <td>80.339373</td>\n"," <td>80.732359</td>\n"," <td>86.543832</td>\n"," <td>9.571367e+07</td>\n"," </tr>\n"," <tr>\n"," <th>min</th>\n"," <td>NaN</td>\n"," <td>43.343750</td>\n"," <td>43.531250</td>\n"," <td>42.812500</td>\n"," <td>43.406250</td>\n"," <td>25.487831</td>\n"," <td>5.200000e+03</td>\n"," </tr>\n"," <tr>\n"," <th>25%</th>\n"," <td>NaN</td>\n"," <td>100.739998</td>\n"," <td>101.593750</td>\n"," <td>99.790001</td>\n"," <td>100.699997</td>\n"," <td>71.142609</td>\n"," <td>8.162800e+06</td>\n"," </tr>\n"," <tr>\n"," <th>50%</th>\n"," <td>NaN</td>\n"," <td>128.125000</td>\n"," <td>128.860001</td>\n"," <td>127.269997</td>\n"," <td>128.187500</td>\n"," <td>93.903046</td>\n"," <td>5.864900e+07</td>\n"," </tr>\n"," <tr>\n"," <th>75%</th>\n"," <td>NaN</td>\n"," <td>190.369995</td>\n"," <td>191.820007</td>\n"," <td>188.789993</td>\n"," <td>190.300003</td>\n"," <td>168.407654</td>\n"," <td>1.195754e+08</td>\n"," </tr>\n"," <tr>\n"," <th>max</th>\n"," <td>NaN</td>\n"," <td>445.589996</td>\n"," <td>447.109985</td>\n"," <td>445.070007</td>\n"," <td>446.970001</td>\n"," <td>446.970001</td>\n"," <td>8.710263e+08</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" date open ... adj_close volume\n","count 7193 7193.000000 ... 7193.000000 7.193000e+03\n","unique 7193 NaN ... NaN NaN\n","top 2007-09-10 00:00:00 NaN ... NaN NaN\n","freq 1 NaN ... NaN NaN\n","first 1993-01-29 00:00:00 NaN ... NaN NaN\n","last 2021-08-20 00:00:00 NaN ... NaN NaN\n","mean NaN 149.569786 ... 124.130425 8.432958e+07\n","std NaN 80.710651 ... 86.543832 9.571367e+07\n","min NaN 43.343750 ... 25.487831 5.200000e+03\n","25% NaN 100.739998 ... 71.142609 8.162800e+06\n","50% NaN 128.125000 ... 93.903046 5.864900e+07\n","75% NaN 190.369995 ... 
168.407654 1.195754e+08\n","max NaN 445.589996 ... 446.970001 8.710263e+08\n","\n","[13 rows x 7 columns]"]},"metadata":{},"execution_count":5}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vcaCtBgfx7ml","executionInfo":{"status":"ok","timestamp":1631992634823,"user_tz":240,"elapsed":10,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"c00840d0-5c8d-4e30-d165-60e4642c654d"},"source":["df.dtypes"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["date datetime64[ns]\n","open float64\n","high float64\n","low float64\n","close float64\n","adj_close float64\n","volume int64\n","dtype: object"]},"metadata":{},"execution_count":6}]},{"cell_type":"markdown","metadata":{"id":"64qZxzKqQqMT"},"source":["## Illustrative feature engineering"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":203},"id":"TLou4u3dQpJp","executionInfo":{"status":"ok","timestamp":1631992634823,"user_tz":240,"elapsed":10,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"3ceada9d-a4e9-480e-e1e7-94641c27147b"},"source":["df[\"open_close_delta\"] = df[\"close\"] / df[\"open\"]\n","df.head(5)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," 
<th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," <th>open_close_delta</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1993-01-29</td>\n"," <td>43.96875</td>\n"," <td>43.96875</td>\n"," <td>43.75000</td>\n"," <td>43.93750</td>\n"," <td>25.799770</td>\n"," <td>1003200</td>\n"," <td>0.999289</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>1993-02-01</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>25.983273</td>\n"," <td>480500</td>\n"," <td>1.006397</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>1993-02-02</td>\n"," <td>44.21875</td>\n"," <td>44.37500</td>\n"," <td>44.12500</td>\n"," <td>44.34375</td>\n"," <td>26.038315</td>\n"," <td>201300</td>\n"," <td>1.002827</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>1993-02-03</td>\n"," <td>44.40625</td>\n"," <td>44.84375</td>\n"," <td>44.37500</td>\n"," <td>44.81250</td>\n"," <td>26.313566</td>\n"," <td>529400</td>\n"," <td>1.009148</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>1993-02-04</td>\n"," <td>44.96875</td>\n"," <td>45.09375</td>\n"," <td>44.46875</td>\n"," <td>45.00000</td>\n"," <td>26.423655</td>\n"," <td>531500</td>\n"," <td>1.000695</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" date open high ... adj_close volume open_close_delta\n","0 1993-01-29 43.96875 43.96875 ... 25.799770 1003200 0.999289\n","1 1993-02-01 43.96875 44.25000 ... 25.983273 480500 1.006397\n","2 1993-02-02 44.21875 44.37500 ... 26.038315 201300 1.002827\n","3 1993-02-03 44.40625 44.84375 ... 26.313566 529400 1.009148\n","4 1993-02-04 44.96875 45.09375 ... 
26.423655 531500 1.000695\n","\n","[5 rows x 8 columns]"]},"metadata":{},"execution_count":7}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":307},"id":"xmP32uclbuLx","executionInfo":{"status":"ok","timestamp":1631992634824,"user_tz":240,"elapsed":9,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"5807a038-d61b-4717-b62d-ae455b6a10e4"},"source":["df[\"day_of_week\"] = df[\"date\"].dt.dayofweek\n","df[\"day_of_week\"] = df[\"day_of_week\"].apply(lambda x: \"monday\" if x == 0 else x)\n","df.head(5)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," <th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," <th>open_close_delta</th>\n"," <th>day_of_week</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1993-01-29</td>\n"," <td>43.96875</td>\n"," <td>43.96875</td>\n"," <td>43.75000</td>\n"," <td>43.93750</td>\n"," <td>25.799770</td>\n"," <td>1003200</td>\n"," <td>0.999289</td>\n"," <td>4</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>1993-02-01</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>25.983273</td>\n"," <td>480500</td>\n"," <td>1.006397</td>\n"," <td>monday</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>1993-02-02</td>\n"," <td>44.21875</td>\n"," <td>44.37500</td>\n"," <td>44.12500</td>\n"," <td>44.34375</td>\n"," 
<td>26.038315</td>\n"," <td>201300</td>\n"," <td>1.002827</td>\n"," <td>1</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>1993-02-03</td>\n"," <td>44.40625</td>\n"," <td>44.84375</td>\n"," <td>44.37500</td>\n"," <td>44.81250</td>\n"," <td>26.313566</td>\n"," <td>529400</td>\n"," <td>1.009148</td>\n"," <td>2</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>1993-02-04</td>\n"," <td>44.96875</td>\n"," <td>45.09375</td>\n"," <td>44.46875</td>\n"," <td>45.00000</td>\n"," <td>26.423655</td>\n"," <td>531500</td>\n"," <td>1.000695</td>\n"," <td>3</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" date open high ... volume open_close_delta day_of_week\n","0 1993-01-29 43.96875 43.96875 ... 1003200 0.999289 4\n","1 1993-02-01 43.96875 44.25000 ... 480500 1.006397 monday\n","2 1993-02-02 44.21875 44.37500 ... 201300 1.002827 1\n","3 1993-02-03 44.40625 44.84375 ... 529400 1.009148 2\n","4 1993-02-04 44.96875 45.09375 ... 531500 1.000695 3\n","\n","[5 rows x 9 columns]"]},"metadata":{},"execution_count":8}]},{"cell_type":"markdown","metadata":{"id":"44Pr1QsabryC"},"source":["## Define Label"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"J6CiP2GpReQ2","executionInfo":{"status":"ok","timestamp":1631992634824,"user_tz":240,"elapsed":8,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"79c44bda-0d97-4e01-9358-d1c599f31706"},"source":["df[\"return\"] = df[\"adj_close\"] / df[\"adj_close\"].shift(1)\n","df[\"label\"] = df[\"return\"].shift(-1) # today's features are used to forecast tomorrow's return\n","# setup label as a classification problem {0, 1}\n","df[\"label\"] = df[\"label\"].apply(lambda x: 1.0 if x > 1.005 else 0.0)\n","print(df.loc[:, [\"date\", \"adj_close\", \"return\", 
\"label\"]].head(5))\n","print(df[\"label\"].value_counts(ascending=False))"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":[" date adj_close return label\n","0 1993-01-29 25.799770 NaN 1.0\n","1 1993-02-01 25.983273 1.007113 0.0\n","2 1993-02-02 26.038315 1.002118 1.0\n","3 1993-02-03 26.313566 1.010571 0.0\n","4 1993-02-04 26.423655 1.004184 0.0\n","0.0 5144\n","1.0 2049\n","Name: label, dtype: int64\n"]}]},{"cell_type":"markdown","metadata":{"id":"xj5NGSs9Zp1i"},"source":["## Train/test split\n","Time series dataset: Train test split by date to avoid leakage"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"_KiNBDEtY7VY","executionInfo":{"status":"ok","timestamp":1631992634960,"user_tz":240,"elapsed":7,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"f3e00f4e-f5fb-4afd-8b0f-72464aa376cc"},"source":["train_df = df.loc[pd.Timestamp(\"2016-12-31\") >= df[\"date\"], :]\n","print(len(train_df))\n","print(train_df[\"date\"].describe())\n","test_df = df.loc[pd.Timestamp(\"2016-12-31\") < df[\"date\"], :]\n","print(len(test_df))\n","print(test_df[\"date\"].describe())"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["6026\n","count 6026\n","unique 6026\n","top 1999-08-23 00:00:00\n","freq 1\n","first 1993-01-29 00:00:00\n","last 2016-12-30 00:00:00\n","Name: date, dtype: object\n","1167\n","count 1167\n","unique 1167\n","top 2017-10-30 00:00:00\n","freq 1\n","first 2017-01-03 00:00:00\n","last 2021-08-20 00:00:00\n","Name: date, dtype: object\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. 
Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n"," This is separate from the ipykernel package so we can avoid doing imports until\n","/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n"," \n"]}]},{"cell_type":"markdown","metadata":{"id":"AlfnH-gwwAUX"},"source":["## Feature transformation pipeline"]},{"cell_type":"code","metadata":{"id":"WGX99oBTXkqj"},"source":["numerical_features = [\"volume\", \"open_close_delta\"]\n","categorical_features = [\"day_of_week\"]"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"DQsWyNcFW5_s"},"source":["class FeatureSelector(BaseEstimator, TransformerMixin):\n","    \"\"\"Pipeline step that keeps only the named columns of a DataFrame.\"\"\"\n","    def __init__(self, feature_names):\n","        # Stored as given so BaseEstimator.get_params()/clone() keep working.\n","        self.feature_names = feature_names\n","    def fit(self, X, y=None):\n","        # Stateless: nothing is learned from the data.\n","        return self\n","    def transform(self, X, y=None):\n","        # A bare string would make .loc return a 1-D Series; always select a list\n","        # so the result stays a 2-D DataFrame.\n","        if isinstance(self.feature_names, str):\n","            columns = [self.feature_names]\n","        else:\n","            columns = list(self.feature_names)\n","        missing = [c for c in columns if c not in X.columns]\n","        if missing:\n","            raise KeyError(f\"FeatureSelector: columns not found in X: {missing}\")\n","        # Deep copy so later steps cannot mutate the caller's frame.\n","        return X.loc[:, columns].copy(deep=True)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"cojolSPBXXmu"},"source":["numerical_pipeline = Pipeline(steps = [ \n"," (\"num_selector\", FeatureSelector(numerical_features)),\n"," (\"imputer\", SimpleImputer(strategy=\"median\")),\n"," (\"std_scaler\", StandardScaler()) \n","])"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"MwPqBp3EYeGQ"},"source":["categorical_pipeline = Pipeline(steps = [ \n"," (\"num_selector\", FeatureSelector(categorical_features)),\n"," (\"ohe\", OneHotEncoder(\n"," handle_unknown=\"ignore\", \n"," sparse=False,\n"," categories=[\n"," df[\"day_of_week\"].unique()\n"," ])\n"," ) \n","])"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"ssLPKpHRwMg9"},"source":["### Example feature engineering inside 
pipeline"]},{"cell_type":"code","metadata":{"id":"1saG5wjPyhcg"},"source":["class DailyTrendFeature(BaseEstimator, TransformerMixin):\n","    \"\"\"Adds `open_close_delta` (close / open) and a categorical `daily_trend` column.\n","\n","    `daily_trend` is 'down' when close/open < down_threshold, 'up' when\n","    close/open > up_threshold, and 'flat' otherwise (including NaN ratios).\n","    \"\"\"\n","    def __init__(self, down_threshold=0.99, up_threshold=1.01):\n","        # Defaults keep the original +/-1% bands; exposed so they can be tuned.\n","        self.down_threshold = down_threshold\n","        self.up_threshold = up_threshold\n","    def fit(self, X, y=None):\n","        # Stateless: nothing is learned from the data.\n","        return self\n","    def transform(self, X, y=None):\n","        # Vectorized instead of a row-wise DataFrame.apply, and built on new\n","        # objects so the frame passed in is never mutated.\n","        open_close_delta = X[\"close\"] / X[\"open\"]\n","        daily_trend = pd.Series(\"flat\", index=X.index, dtype=object)\n","        daily_trend.loc[open_close_delta < self.down_threshold] = \"down\"  # fell more than 1% from open by default\n","        daily_trend.loc[open_close_delta > self.up_threshold] = \"up\"  # rose more than 1% from open by default\n","        return X.assign(open_close_delta=open_close_delta, daily_trend=daily_trend)\n","\n","daily_trend_feature_pipeline = Pipeline(steps = [ \n"," (\"selector\", FeatureSelector([\"open\", \"close\"])),\n"," (\"feature_engineering\", DailyTrendFeature()),\n"," (\"selector_new\", FeatureSelector([\"daily_trend\"])),\n"," (\"ohe\", OneHotEncoder(\n"," handle_unknown=\"ignore\", \n"," sparse=False,\n"," categories=[\n"," [\"up\", \"down\", \"flat\"],\n"," ])\n"," ) \n","])"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"USuhkKMumkCv","executionInfo":{"status":"ok","timestamp":1631992635108,"user_tz":240,"elapsed":151,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"abb02a0e-a6fe-4897-9a0d-f94f1bab63be"},"source":["def test_new_feature_pipeline():\n"," test_df = train_df.sample(5).copy(deep=True).reset_index()\n"," print(test_df.loc[:, [\"return\"]])\n"," sample_transforms = daily_trend_feature_pipeline.fit_transform(\n"," test_df, \n"," test_df[\"label\"]\n"," )\n"," print(pd.DataFrame(\n"," sample_transforms, \n"," columns=daily_trend_feature_pipeline.named_steps[\"ohe\"].get_feature_names()\n"," 
))\n","test_new_feature_pipeline()"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":[" return\n","0 0.991621\n","1 0.997277\n","2 0.981978\n","3 0.996962\n","4 1.012702\n"," x0_up x0_down x0_flat\n","0 0.0 0.0 1.0\n","1 0.0 0.0 1.0\n","2 0.0 0.0 1.0\n","3 0.0 1.0 0.0\n","4 0.0 0.0 1.0\n"]}]},{"cell_type":"code","metadata":{"id":"GFN-s6e0Z-_I"},"source":["feature_pipeline = FeatureUnion(\n"," n_jobs=-1, \n"," transformer_list=[ \n"," (\"numerical_pipeline\", numerical_pipeline),\n"," (\"categorical_pipeline\", categorical_pipeline),\n"," (\"daily_trend_feature_pipeline\", daily_trend_feature_pipeline),\n"," ]\n",")"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":493},"id":"WA0POsiZoGEC","executionInfo":{"status":"ok","timestamp":1631992636373,"user_tz":240,"elapsed":1268,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"37d83718-6f8d-498a-f871-0205ca24895b"},"source":["def test_feature_pipeline():\n"," test_df = train_df.sample(5).copy(deep=True).reset_index()\n"," display(test_df)\n"," feature_pipeline.fit(test_df, test_df[\"label\"])\n"," display(pd.DataFrame(feature_pipeline.transform(test_df),\n"," columns = (\n"," numerical_features \n"," + list(feature_pipeline.transformer_list[1][1][\"ohe\"].get_feature_names())\n"," + list(feature_pipeline.transformer_list[2][1][\"ohe\"].get_feature_names())\n"," )\n"," )\n"," )\n","test_feature_pipeline()"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," 
<thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>index</th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," <th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," <th>open_close_delta</th>\n"," <th>day_of_week</th>\n"," <th>return</th>\n"," <th>label</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>5215</td>\n"," <td>2013-10-14</td>\n"," <td>169.210007</td>\n"," <td>171.080002</td>\n"," <td>169.080002</td>\n"," <td>170.940002</td>\n"," <td>147.378723</td>\n"," <td>112106000</td>\n"," <td>1.010224</td>\n"," <td>monday</td>\n"," <td>1.003994</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>5924</td>\n"," <td>2016-08-08</td>\n"," <td>218.399994</td>\n"," <td>218.520004</td>\n"," <td>217.740005</td>\n"," <td>218.050003</td>\n"," <td>198.728592</td>\n"," <td>39906500</td>\n"," <td>0.998397</td>\n"," <td>monday</td>\n"," <td>0.999404</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>821</td>\n"," <td>1996-04-30</td>\n"," <td>65.437500</td>\n"," <td>65.562500</td>\n"," <td>65.125000</td>\n"," <td>65.390625</td>\n"," <td>41.525272</td>\n"," <td>184400</td>\n"," <td>0.999284</td>\n"," <td>1</td>\n"," <td>0.999284</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>973</td>\n"," <td>1996-12-04</td>\n"," <td>74.875000</td>\n"," <td>75.062500</td>\n"," <td>74.093750</td>\n"," <td>74.953125</td>\n"," <td>48.096256</td>\n"," <td>2365100</td>\n"," <td>1.001043</td>\n"," <td>2</td>\n"," <td>1.002717</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>2043</td>\n"," <td>2001-03-05</td>\n"," <td>124.150002</td>\n"," <td>124.779999</td>\n"," <td>123.809998</td>\n"," <td>124.739998</td>\n"," <td>84.509819</td>\n"," <td>5293200</td>\n"," <td>1.004752</td>\n"," <td>monday</td>\n"," <td>1.009142</td>\n"," <td>1.0</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" index date open ... 
day_of_week return label\n","0 5215 2013-10-14 169.210007 ... monday 1.003994 0.0\n","1 5924 2016-08-08 218.399994 ... monday 0.999404 0.0\n","2 821 1996-04-30 65.437500 ... 1 0.999284 0.0\n","3 973 1996-12-04 74.875000 ... 2 1.002717 0.0\n","4 2043 2001-03-05 124.150002 ... monday 1.009142 1.0\n","\n","[5 rows x 12 columns]"]},"metadata":{}},{"output_type":"display_data","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>volume</th>\n"," <th>open_close_delta</th>\n"," <th>x0_4</th>\n"," <th>x0_monday</th>\n"," <th>x0_1</th>\n"," <th>x0_2</th>\n"," <th>x0_3</th>\n"," <th>x0_up</th>\n"," <th>x0_down</th>\n"," <th>x0_flat</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1.880130</td>\n"," <td>1.728391</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>0.186182</td>\n"," <td>-1.002947</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>-0.745779</td>\n"," <td>-0.798280</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>-0.694616</td>\n"," <td>-0.391867</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>-0.625917</td>\n"," 
<td>0.464703</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" volume open_close_delta x0_4 x0_monday ... x0_3 x0_up x0_down x0_flat\n","0 1.880130 1.728391 0.0 1.0 ... 0.0 1.0 0.0 0.0\n","1 0.186182 -1.002947 0.0 1.0 ... 0.0 0.0 0.0 1.0\n","2 -0.745779 -0.798280 0.0 0.0 ... 0.0 0.0 0.0 1.0\n","3 -0.694616 -0.391867 0.0 0.0 ... 0.0 0.0 0.0 1.0\n","4 -0.625917 0.464703 0.0 1.0 ... 0.0 0.0 0.0 1.0\n","\n","[5 rows x 10 columns]"]},"metadata":{}}]},{"cell_type":"markdown","metadata":{"id":"ANUdza9f2cY6"},"source":["## Model"]},{"cell_type":"code","metadata":{"id":"FARmX_UJ2eyk"},"source":["model_pipeline = Pipeline(steps=[\n"," (\"feature_pipeline\", feature_pipeline),\n"," (\"model\", LogisticRegression())\n","])\n","param_grid = [\n"," {\n"," \"feature_pipeline__numerical_pipeline__imputer__strategy\": [\"mean\", \"median\"],\n"," \"model\": [LogisticRegression()],\n"," \"model__C\": [0.1, 1.0, 10],\n"," },\n"," {\n"," \"feature_pipeline__numerical_pipeline__imputer__strategy\": [\"mean\", \"median\"],\n"," \"model\": [RandomForestClassifier()],\n"," \"model__max_depth\": [3.0, 5.0, 7.0],\n"," }\n","]\n","grid_search = GridSearchCV(\n"," model_pipeline, \n"," param_grid, \n"," cv=TimeSeriesSplit(n_splits=5),\n"," scoring=\"roc_auc\",\n"," refit=True,\n"," n_jobs=-1\n",")\n","# grid_search"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"J6CeX92j3g5Y","executionInfo":{"status":"ok","timestamp":1631992807216,"user_tz":240,"elapsed":170846,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"c913d8ab-f4eb-4c39-963e-3c8bb11b4207"},"source":["now = datetime.datetime.now()\n","grid_search.fit(train_df, 
train_df[\"label\"])\n","print(datetime.datetime.now() - now)"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["0:02:50.686868\n"]}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9nWIva1Rpfsk","executionInfo":{"status":"ok","timestamp":1631992807217,"user_tz":240,"elapsed":14,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"c5b65bac-43d7-45ff-d6d2-8c4a15a207c1"},"source":["print(f\"Best params: {grid_search.best_params_}\")\n","print(f\"Best score: {grid_search.best_score_}\")"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Best params: {'feature_pipeline__numerical_pipeline__imputer__strategy': 'mean', 'model': LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,\n"," intercept_scaling=1, l1_ratio=None, max_iter=100,\n"," multi_class='auto', n_jobs=None, penalty='l2',\n"," random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n"," warm_start=False), 'model__C': 0.1}\n","Best score: 0.5765670811118563\n"]}]},{"cell_type":"markdown","metadata":{"id":"Aj3up6IG5sey"},"source":["## Metrics"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"19LMO2kBrx6I","executionInfo":{"status":"ok","timestamp":1631992811877,"user_tz":240,"elapsed":4666,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"659139eb-f9e6-4b91-bb86-375204a53228"},"source":["metrics.roc_auc_score(\n"," y_true=train_df[\"label\"],\n"," y_score=grid_search.predict(train_df),\n"," 
average=\"weighted\"\n",")"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.5041638147080223"]},"metadata":{},"execution_count":22}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"FG6oYfoFpoYF","executionInfo":{"status":"ok","timestamp":1631992812704,"user_tz":240,"elapsed":847,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"2c43416d-28c2-43d1-b4d2-516508eb2ddf"},"source":["metrics.roc_auc_score(\n"," y_true=test_df[\"label\"],\n"," y_score=grid_search.predict(test_df),\n"," average=\"weighted\"\n",")"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.5009555705544877"]},"metadata":{},"execution_count":23}]},{"cell_type":"markdown","metadata":{"id":"F9ChyNSLjb8T"},"source":["## Export notebook as HTML"]},{"cell_type":"code","metadata":{"id":"_7nuQJ2GaxyM","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1631998376733,"user_tz":240,"elapsed":1680,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"49819423-c1b4-45b4-a87e-fc619b86eed9"},"source":["%%shell\n","jupyter nbconvert --to html '/content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb'"],"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["[NbConvertApp] Converting notebook /content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb to html\n","[NbConvertApp] Writing 338767 bytes to /content/drive/My Drive/Colab 
Notebooks/custom_scikit_pipeline/sklearn_pipe.html\n"]},{"output_type":"execute_result","data":{"text/plain":[""]},"metadata":{},"execution_count":1}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jKfbk43ueWB_","executionInfo":{"status":"ok","timestamp":1631998488759,"user_tz":240,"elapsed":1388,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"2e937b6f-9d60-463b-d268-edf63192ab0e"},"source":["%%shell\n","# ### html with outputs\n","jupyter nbconvert --to html --no-input --no-prompt '/content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb' --output sklearn_pipe_no_code.html"],"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["[NbConvertApp] Converting notebook /content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb to html\n","[NbConvertApp] Writing 299986 bytes to /content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe_no_code.html\n"]},{"output_type":"execute_result","data":{"text/plain":[""]},"metadata":{},"execution_count":2}]},{"cell_type":"code","metadata":{"id":"3th4ahEqe3Wd"},"source":[""],"execution_count":null,"outputs":[]}]} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment