Skip to content

Instantly share code, notes, and snippets.

@adamnovotnycom
Last active April 2, 2022 15:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save adamnovotnycom/a09294f179d8e483d5411eb5c8c4e00f to your computer and use it in GitHub Desktop.
Save adamnovotnycom/a09294f179d8e483d5411eb5c8c4e00f to your computer and use it in GitHub Desktop.
sklearn_pipe.ipynb
Display the source blob
Display the rendered blob
Raw
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"sklearn_pipe.ipynb","provenance":[],"collapsed_sections":["Px8yvYANQkV_"],"toc_visible":true,"mount_file_id":"1dzgBGllszE7-0j9cu-WI_i8HPvlDhAhr","authorship_tag":"ABX9TyMnY1Al295O8NuFyE9rdMde"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","metadata":{"id":"jJShBSUIWBHY"},"source":["# Sklearn pipeline"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"0L2bqfaHOMBn","executionInfo":{"status":"ok","timestamp":1631992634080,"user_tz":240,"elapsed":1089,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"1ed1ced4-9240-4572-e99a-5c8cbc8236d3"},"source":["import datetime\n","import multiprocessing\n","import pandas as pd\n","from sklearn.base import BaseEstimator, TransformerMixin\n","from sklearn.ensemble import RandomForestClassifier\n","from sklearn.impute import SimpleImputer\n","from sklearn.linear_model import LogisticRegression\n","from sklearn import metrics\n","from sklearn.pipeline import FeatureUnion, Pipeline \n","from sklearn.preprocessing import StandardScaler, OneHotEncoder\n","from sklearn.model_selection import GridSearchCV, TimeSeriesSplit\n","import sys\n","print(sys.version)\n","print(pd.__version__)\n","print(multiprocessing.cpu_count())"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["3.7.12 (default, Sep 10 2021, 00:21:48) \n","[GCC 7.5.0]\n","1.1.5\n","2\n"]}]},{"cell_type":"markdown","metadata":{"id":"Px8yvYANQkV_"},"source":["## Load raw stock price data"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":203},"id":"tX-hbcJ1Ou_5","executionInfo":{"status":"ok","timestamp":1631992634679,"user_tz":240,"elapsed":460,"user":{"displayName":"Adam 
Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"8dcb1e31-ec86-4003-80c6-b7969b773d26"},"source":["df = pd.read_csv(\n"," \"/content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/SPY_yahoo_finance.csv\",\n"," header=0\n",")\n","df.columns = [x.lower().replace(\" \", \"_\") for x in df.columns]\n","df.head(5)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," <th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1993-01-29</td>\n"," <td>43.96875</td>\n"," <td>43.96875</td>\n"," <td>43.75000</td>\n"," <td>43.93750</td>\n"," <td>25.799770</td>\n"," <td>1003200</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>1993-02-01</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>25.983273</td>\n"," <td>480500</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>1993-02-02</td>\n"," <td>44.21875</td>\n"," <td>44.37500</td>\n"," <td>44.12500</td>\n"," <td>44.34375</td>\n"," <td>26.038315</td>\n"," <td>201300</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>1993-02-03</td>\n"," <td>44.40625</td>\n"," <td>44.84375</td>\n"," <td>44.37500</td>\n"," <td>44.81250</td>\n"," <td>26.313566</td>\n"," <td>529400</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>1993-02-04</td>\n"," <td>44.96875</td>\n"," <td>45.09375</td>\n"," <td>44.46875</td>\n"," <td>45.00000</td>\n"," 
<td>26.423655</td>\n"," <td>531500</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" date open high low close adj_close volume\n","0 1993-01-29 43.96875 43.96875 43.75000 43.93750 25.799770 1003200\n","1 1993-02-01 43.96875 44.25000 43.96875 44.25000 25.983273 480500\n","2 1993-02-02 44.21875 44.37500 44.12500 44.34375 26.038315 201300\n","3 1993-02-03 44.40625 44.84375 44.37500 44.81250 26.313566 529400\n","4 1993-02-04 44.96875 45.09375 44.46875 45.00000 26.423655 531500"]},"metadata":{},"execution_count":2}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":388},"id":"u7xYwOCcWZFE","executionInfo":{"status":"ok","timestamp":1631992634682,"user_tz":240,"elapsed":6,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"ade1ef11-952e-44db-ecca-f156361a9ac3"},"source":["df.describe(include=\"all\")"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," <th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>count</th>\n"," <td>7193</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7.193000e+03</td>\n"," </tr>\n"," <tr>\n"," <th>unique</th>\n"," <td>7193</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," 
<td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>top</th>\n"," <td>2000-10-09</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>freq</th>\n"," <td>1</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>mean</th>\n"," <td>NaN</td>\n"," <td>149.569786</td>\n"," <td>150.446365</td>\n"," <td>148.596184</td>\n"," <td>149.573008</td>\n"," <td>124.130425</td>\n"," <td>8.432958e+07</td>\n"," </tr>\n"," <tr>\n"," <th>std</th>\n"," <td>NaN</td>\n"," <td>80.710651</td>\n"," <td>81.049916</td>\n"," <td>80.339373</td>\n"," <td>80.732359</td>\n"," <td>86.543832</td>\n"," <td>9.571367e+07</td>\n"," </tr>\n"," <tr>\n"," <th>min</th>\n"," <td>NaN</td>\n"," <td>43.343750</td>\n"," <td>43.531250</td>\n"," <td>42.812500</td>\n"," <td>43.406250</td>\n"," <td>25.487831</td>\n"," <td>5.200000e+03</td>\n"," </tr>\n"," <tr>\n"," <th>25%</th>\n"," <td>NaN</td>\n"," <td>100.739998</td>\n"," <td>101.593750</td>\n"," <td>99.790001</td>\n"," <td>100.699997</td>\n"," <td>71.142609</td>\n"," <td>8.162800e+06</td>\n"," </tr>\n"," <tr>\n"," <th>50%</th>\n"," <td>NaN</td>\n"," <td>128.125000</td>\n"," <td>128.860001</td>\n"," <td>127.269997</td>\n"," <td>128.187500</td>\n"," <td>93.903046</td>\n"," <td>5.864900e+07</td>\n"," </tr>\n"," <tr>\n"," <th>75%</th>\n"," <td>NaN</td>\n"," <td>190.369995</td>\n"," <td>191.820007</td>\n"," <td>188.789993</td>\n"," <td>190.300003</td>\n"," <td>168.407654</td>\n"," <td>1.195754e+08</td>\n"," </tr>\n"," <tr>\n"," <th>max</th>\n"," <td>NaN</td>\n"," <td>445.589996</td>\n"," <td>447.109985</td>\n"," <td>445.070007</td>\n"," <td>446.970001</td>\n"," <td>446.970001</td>\n"," <td>8.710263e+08</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" date open ... adj_close volume\n","count 7193 7193.000000 ... 7193.000000 7.193000e+03\n","unique 7193 NaN ... 
NaN NaN\n","top 2000-10-09 NaN ... NaN NaN\n","freq 1 NaN ... NaN NaN\n","mean NaN 149.569786 ... 124.130425 8.432958e+07\n","std NaN 80.710651 ... 86.543832 9.571367e+07\n","min NaN 43.343750 ... 25.487831 5.200000e+03\n","25% NaN 100.739998 ... 71.142609 8.162800e+06\n","50% NaN 128.125000 ... 93.903046 5.864900e+07\n","75% NaN 190.369995 ... 168.407654 1.195754e+08\n","max NaN 445.589996 ... 446.970001 8.710263e+08\n","\n","[11 rows x 7 columns]"]},"metadata":{},"execution_count":3}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"x4Bjkda4Xzj2","executionInfo":{"status":"ok","timestamp":1631992634683,"user_tz":240,"elapsed":6,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"f9965006-be46-4772-c355-8674a21ce72e"},"source":["df.dtypes"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["date object\n","open float64\n","high float64\n","low float64\n","close float64\n","adj_close float64\n","volume int64\n","dtype: object"]},"metadata":{},"execution_count":4}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":554},"id":"dd6TIeOsX5hm","executionInfo":{"status":"ok","timestamp":1631992634822,"user_tz":240,"elapsed":144,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"e61c86cd-9581-424a-8094-0d0ab3fec22f"},"source":["df[\"date\"] = pd.to_datetime(df[\"date\"])\n","df = df.sort_values(by=\"date\", ascending=True)\n","df.describe(include=\"all\")"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and 
will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n"," This is separate from the ipykernel package so we can avoid doing imports until\n"]},{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," <th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>count</th>\n"," <td>7193</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7193.000000</td>\n"," <td>7.193000e+03</td>\n"," </tr>\n"," <tr>\n"," <th>unique</th>\n"," <td>7193</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>top</th>\n"," <td>2007-09-10 00:00:00</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>freq</th>\n"," <td>1</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>first</th>\n"," <td>1993-01-29 00:00:00</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>last</th>\n"," <td>2021-08-20 00:00:00</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," </tr>\n"," <tr>\n"," <th>mean</th>\n"," <td>NaN</td>\n"," <td>149.569786</td>\n"," 
<td>150.446365</td>\n"," <td>148.596184</td>\n"," <td>149.573008</td>\n"," <td>124.130425</td>\n"," <td>8.432958e+07</td>\n"," </tr>\n"," <tr>\n"," <th>std</th>\n"," <td>NaN</td>\n"," <td>80.710651</td>\n"," <td>81.049916</td>\n"," <td>80.339373</td>\n"," <td>80.732359</td>\n"," <td>86.543832</td>\n"," <td>9.571367e+07</td>\n"," </tr>\n"," <tr>\n"," <th>min</th>\n"," <td>NaN</td>\n"," <td>43.343750</td>\n"," <td>43.531250</td>\n"," <td>42.812500</td>\n"," <td>43.406250</td>\n"," <td>25.487831</td>\n"," <td>5.200000e+03</td>\n"," </tr>\n"," <tr>\n"," <th>25%</th>\n"," <td>NaN</td>\n"," <td>100.739998</td>\n"," <td>101.593750</td>\n"," <td>99.790001</td>\n"," <td>100.699997</td>\n"," <td>71.142609</td>\n"," <td>8.162800e+06</td>\n"," </tr>\n"," <tr>\n"," <th>50%</th>\n"," <td>NaN</td>\n"," <td>128.125000</td>\n"," <td>128.860001</td>\n"," <td>127.269997</td>\n"," <td>128.187500</td>\n"," <td>93.903046</td>\n"," <td>5.864900e+07</td>\n"," </tr>\n"," <tr>\n"," <th>75%</th>\n"," <td>NaN</td>\n"," <td>190.369995</td>\n"," <td>191.820007</td>\n"," <td>188.789993</td>\n"," <td>190.300003</td>\n"," <td>168.407654</td>\n"," <td>1.195754e+08</td>\n"," </tr>\n"," <tr>\n"," <th>max</th>\n"," <td>NaN</td>\n"," <td>445.589996</td>\n"," <td>447.109985</td>\n"," <td>445.070007</td>\n"," <td>446.970001</td>\n"," <td>446.970001</td>\n"," <td>8.710263e+08</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" date open ... adj_close volume\n","count 7193 7193.000000 ... 7193.000000 7.193000e+03\n","unique 7193 NaN ... NaN NaN\n","top 2007-09-10 00:00:00 NaN ... NaN NaN\n","freq 1 NaN ... NaN NaN\n","first 1993-01-29 00:00:00 NaN ... NaN NaN\n","last 2021-08-20 00:00:00 NaN ... NaN NaN\n","mean NaN 149.569786 ... 124.130425 8.432958e+07\n","std NaN 80.710651 ... 86.543832 9.571367e+07\n","min NaN 43.343750 ... 25.487831 5.200000e+03\n","25% NaN 100.739998 ... 71.142609 8.162800e+06\n","50% NaN 128.125000 ... 93.903046 5.864900e+07\n","75% NaN 190.369995 ... 
168.407654 1.195754e+08\n","max NaN 445.589996 ... 446.970001 8.710263e+08\n","\n","[13 rows x 7 columns]"]},"metadata":{},"execution_count":5}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vcaCtBgfx7ml","executionInfo":{"status":"ok","timestamp":1631992634823,"user_tz":240,"elapsed":10,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"c00840d0-5c8d-4e30-d165-60e4642c654d"},"source":["df.dtypes"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["date datetime64[ns]\n","open float64\n","high float64\n","low float64\n","close float64\n","adj_close float64\n","volume int64\n","dtype: object"]},"metadata":{},"execution_count":6}]},{"cell_type":"markdown","metadata":{"id":"64qZxzKqQqMT"},"source":["## Illustrative feature engineering"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":203},"id":"TLou4u3dQpJp","executionInfo":{"status":"ok","timestamp":1631992634823,"user_tz":240,"elapsed":10,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"3ceada9d-a4e9-480e-e1e7-94641c27147b"},"source":["df[\"open_close_delta\"] = df[\"close\"] / df[\"open\"]\n","df.head(5)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," 
<th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," <th>open_close_delta</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1993-01-29</td>\n"," <td>43.96875</td>\n"," <td>43.96875</td>\n"," <td>43.75000</td>\n"," <td>43.93750</td>\n"," <td>25.799770</td>\n"," <td>1003200</td>\n"," <td>0.999289</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>1993-02-01</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>25.983273</td>\n"," <td>480500</td>\n"," <td>1.006397</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>1993-02-02</td>\n"," <td>44.21875</td>\n"," <td>44.37500</td>\n"," <td>44.12500</td>\n"," <td>44.34375</td>\n"," <td>26.038315</td>\n"," <td>201300</td>\n"," <td>1.002827</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>1993-02-03</td>\n"," <td>44.40625</td>\n"," <td>44.84375</td>\n"," <td>44.37500</td>\n"," <td>44.81250</td>\n"," <td>26.313566</td>\n"," <td>529400</td>\n"," <td>1.009148</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>1993-02-04</td>\n"," <td>44.96875</td>\n"," <td>45.09375</td>\n"," <td>44.46875</td>\n"," <td>45.00000</td>\n"," <td>26.423655</td>\n"," <td>531500</td>\n"," <td>1.000695</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" date open high ... adj_close volume open_close_delta\n","0 1993-01-29 43.96875 43.96875 ... 25.799770 1003200 0.999289\n","1 1993-02-01 43.96875 44.25000 ... 25.983273 480500 1.006397\n","2 1993-02-02 44.21875 44.37500 ... 26.038315 201300 1.002827\n","3 1993-02-03 44.40625 44.84375 ... 26.313566 529400 1.009148\n","4 1993-02-04 44.96875 45.09375 ... 
26.423655 531500 1.000695\n","\n","[5 rows x 8 columns]"]},"metadata":{},"execution_count":7}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":307},"id":"xmP32uclbuLx","executionInfo":{"status":"ok","timestamp":1631992634824,"user_tz":240,"elapsed":9,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"5807a038-d61b-4717-b62d-ae455b6a10e4"},"source":["df[\"day_of_week\"] = df[\"date\"].dt.dayofweek\n","df[\"day_of_week\"] = df[\"day_of_week\"].apply(lambda x: \"monday\" if x == 0 else x)\n","df.head(5)"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," <th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," <th>open_close_delta</th>\n"," <th>day_of_week</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1993-01-29</td>\n"," <td>43.96875</td>\n"," <td>43.96875</td>\n"," <td>43.75000</td>\n"," <td>43.93750</td>\n"," <td>25.799770</td>\n"," <td>1003200</td>\n"," <td>0.999289</td>\n"," <td>4</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>1993-02-01</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>43.96875</td>\n"," <td>44.25000</td>\n"," <td>25.983273</td>\n"," <td>480500</td>\n"," <td>1.006397</td>\n"," <td>monday</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>1993-02-02</td>\n"," <td>44.21875</td>\n"," <td>44.37500</td>\n"," <td>44.12500</td>\n"," <td>44.34375</td>\n"," 
<td>26.038315</td>\n"," <td>201300</td>\n"," <td>1.002827</td>\n"," <td>1</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>1993-02-03</td>\n"," <td>44.40625</td>\n"," <td>44.84375</td>\n"," <td>44.37500</td>\n"," <td>44.81250</td>\n"," <td>26.313566</td>\n"," <td>529400</td>\n"," <td>1.009148</td>\n"," <td>2</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>1993-02-04</td>\n"," <td>44.96875</td>\n"," <td>45.09375</td>\n"," <td>44.46875</td>\n"," <td>45.00000</td>\n"," <td>26.423655</td>\n"," <td>531500</td>\n"," <td>1.000695</td>\n"," <td>3</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" date open high ... volume open_close_delta day_of_week\n","0 1993-01-29 43.96875 43.96875 ... 1003200 0.999289 4\n","1 1993-02-01 43.96875 44.25000 ... 480500 1.006397 monday\n","2 1993-02-02 44.21875 44.37500 ... 201300 1.002827 1\n","3 1993-02-03 44.40625 44.84375 ... 529400 1.009148 2\n","4 1993-02-04 44.96875 45.09375 ... 531500 1.000695 3\n","\n","[5 rows x 9 columns]"]},"metadata":{},"execution_count":8}]},{"cell_type":"markdown","metadata":{"id":"44Pr1QsabryC"},"source":["## Define Label"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"J6CiP2GpReQ2","executionInfo":{"status":"ok","timestamp":1631992634824,"user_tz":240,"elapsed":8,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"79c44bda-0d97-4e01-9358-d1c599f31706"},"source":["df[\"return\"] = df[\"adj_close\"] / df[\"adj_close\"].shift(1)\n","df[\"label\"] = df[\"return\"].shift(-1) # today's features are used to forecast tomorrow's return\n","# setup label as a classification problem {0, 1}\n","df[\"label\"] = df[\"label\"].apply(lambda x: 1.0 if x > 1.005 else 0.0)\n","print(df.loc[:, [\"date\", \"adj_close\", \"return\", 
\"label\"]].head(5))\n","print(df[\"label\"].value_counts(ascending=False))"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":[" date adj_close return label\n","0 1993-01-29 25.799770 NaN 1.0\n","1 1993-02-01 25.983273 1.007113 0.0\n","2 1993-02-02 26.038315 1.002118 1.0\n","3 1993-02-03 26.313566 1.010571 0.0\n","4 1993-02-04 26.423655 1.004184 0.0\n","0.0 5144\n","1.0 2049\n","Name: label, dtype: int64\n"]}]},{"cell_type":"markdown","metadata":{"id":"xj5NGSs9Zp1i"},"source":["## Train/test split\n","Time series dataset: Train test split by date to avoid leakage"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"_KiNBDEtY7VY","executionInfo":{"status":"ok","timestamp":1631992634960,"user_tz":240,"elapsed":7,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"f3e00f4e-f5fb-4afd-8b0f-72464aa376cc"},"source":["train_df = df.loc[pd.Timestamp(\"2016-12-31\") >= df[\"date\"], :]\n","print(len(train_df))\n","print(train_df[\"date\"].describe())\n","test_df = df.loc[pd.Timestamp(\"2016-12-31\") < df[\"date\"], :]\n","print(len(test_df))\n","print(test_df[\"date\"].describe())"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["6026\n","count 6026\n","unique 6026\n","top 1999-08-23 00:00:00\n","freq 1\n","first 1993-01-29 00:00:00\n","last 2016-12-30 00:00:00\n","Name: date, dtype: object\n","1167\n","count 1167\n","unique 1167\n","top 2017-10-30 00:00:00\n","freq 1\n","first 2017-01-03 00:00:00\n","last 2021-08-20 00:00:00\n","Name: date, dtype: object\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. 
Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n"," This is separate from the ipykernel package so we can avoid doing imports until\n","/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:6: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now.\n"," \n"]}]},{"cell_type":"markdown","metadata":{"id":"AlfnH-gwwAUX"},"source":["## Feature transformation pipeline"]},{"cell_type":"code","metadata":{"id":"WGX99oBTXkqj"},"source":["numerical_features = [\"volume\", \"open_close_delta\"]\n","categorical_features = [\"day_of_week\"]"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"DQsWyNcFW5_s"},"source":["class FeatureSelector(BaseEstimator, TransformerMixin):\n"," def __init__(self, feature_names):\n"," self.feature_names = feature_names \n"," def fit( self, X, y = None ):\n"," return self\n"," def transform(self, X, y=None):\n"," return X.loc[:, self.feature_names].copy(deep=True)"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"cojolSPBXXmu"},"source":["numerical_pipeline = Pipeline(steps = [ \n"," (\"num_selector\", FeatureSelector(numerical_features)),\n"," (\"imputer\", SimpleImputer(strategy=\"median\")),\n"," (\"std_scaler\", StandardScaler()) \n","])"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"id":"MwPqBp3EYeGQ"},"source":["categorical_pipeline = Pipeline(steps = [ \n"," (\"num_selector\", FeatureSelector(categorical_features)),\n"," (\"ohe\", OneHotEncoder(\n"," handle_unknown=\"ignore\", \n"," sparse=False,\n"," categories=[\n"," df[\"day_of_week\"].unique()\n"," ])\n"," ) \n","])"],"execution_count":null,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"ssLPKpHRwMg9"},"source":["### Example feature engineering inside 
pipeline"]},{"cell_type":"code","metadata":{"id":"1saG5wjPyhcg"},"source":["class DailyTrendFeature(BaseEstimator, TransformerMixin):\n"," def __init__(self):\n"," pass\n"," def fit( self, X, y = None ):\n"," return self\n"," def transform(self, X, y=None):\n"," X.loc[:, \"open_close_delta\"] = X[\"close\"] / X[\"open\"]\n"," def daily_trend(row):\n"," if 0.99 > row[\"open_close_delta\"]: # assume 'down' day when prices fall > 1% from open\n"," row[\"daily_trend\"] = \"down\"\n"," elif 1.01 < row[\"open_close_delta\"]: # assume 'up' day when prices rise > 1% from open\n"," row[\"daily_trend\"] = \"up\"\n"," else:\n"," row[\"daily_trend\"] = \"flat\"\n"," return row\n"," X = X.apply(daily_trend, axis=1)\n"," return X\n","\n","daily_trend_feature_pipeline = Pipeline(steps = [ \n"," (\"selector\", FeatureSelector([\"open\", \"close\"])),\n"," (\"feature_engineering\", DailyTrendFeature()),\n"," (\"selector_new\", FeatureSelector([\"daily_trend\"])),\n"," (\"ohe\", OneHotEncoder(\n"," handle_unknown=\"ignore\", \n"," sparse=False,\n"," categories=[\n"," [\"up\", \"down\", \"flat\"],\n"," ])\n"," ) \n","])"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"USuhkKMumkCv","executionInfo":{"status":"ok","timestamp":1631992635108,"user_tz":240,"elapsed":151,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"abb02a0e-a6fe-4897-9a0d-f94f1bab63be"},"source":["def test_new_feature_pipeline():\n"," test_df = train_df.sample(5).copy(deep=True).reset_index()\n"," print(test_df.loc[:, [\"return\"]])\n"," sample_transforms = daily_trend_feature_pipeline.fit_transform(\n"," test_df, \n"," test_df[\"label\"]\n"," )\n"," print(pd.DataFrame(\n"," sample_transforms, \n"," columns=daily_trend_feature_pipeline.named_steps[\"ohe\"].get_feature_names()\n"," 
))\n","test_new_feature_pipeline()"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":[" return\n","0 0.991621\n","1 0.997277\n","2 0.981978\n","3 0.996962\n","4 1.012702\n"," x0_up x0_down x0_flat\n","0 0.0 0.0 1.0\n","1 0.0 0.0 1.0\n","2 0.0 0.0 1.0\n","3 0.0 1.0 0.0\n","4 0.0 0.0 1.0\n"]}]},{"cell_type":"code","metadata":{"id":"GFN-s6e0Z-_I"},"source":["feature_pipeline = FeatureUnion(\n"," n_jobs=-1, \n"," transformer_list=[ \n"," (\"numerical_pipeline\", numerical_pipeline),\n"," (\"categorical_pipeline\", categorical_pipeline),\n"," (\"daily_trend_feature_pipeline\", daily_trend_feature_pipeline),\n"," ]\n",")"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/","height":493},"id":"WA0POsiZoGEC","executionInfo":{"status":"ok","timestamp":1631992636373,"user_tz":240,"elapsed":1268,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"37d83718-6f8d-498a-f871-0205ca24895b"},"source":["def test_feature_pipeline():\n"," test_df = train_df.sample(5).copy(deep=True).reset_index()\n"," display(test_df)\n"," feature_pipeline.fit(test_df, test_df[\"label\"])\n"," display(pd.DataFrame(feature_pipeline.transform(test_df),\n"," columns = (\n"," numerical_features \n"," + list(feature_pipeline.transformer_list[1][1][\"ohe\"].get_feature_names())\n"," + list(feature_pipeline.transformer_list[2][1][\"ohe\"].get_feature_names())\n"," )\n"," )\n"," )\n","test_feature_pipeline()"],"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," 
<thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>index</th>\n"," <th>date</th>\n"," <th>open</th>\n"," <th>high</th>\n"," <th>low</th>\n"," <th>close</th>\n"," <th>adj_close</th>\n"," <th>volume</th>\n"," <th>open_close_delta</th>\n"," <th>day_of_week</th>\n"," <th>return</th>\n"," <th>label</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>5215</td>\n"," <td>2013-10-14</td>\n"," <td>169.210007</td>\n"," <td>171.080002</td>\n"," <td>169.080002</td>\n"," <td>170.940002</td>\n"," <td>147.378723</td>\n"," <td>112106000</td>\n"," <td>1.010224</td>\n"," <td>monday</td>\n"," <td>1.003994</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>5924</td>\n"," <td>2016-08-08</td>\n"," <td>218.399994</td>\n"," <td>218.520004</td>\n"," <td>217.740005</td>\n"," <td>218.050003</td>\n"," <td>198.728592</td>\n"," <td>39906500</td>\n"," <td>0.998397</td>\n"," <td>monday</td>\n"," <td>0.999404</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>821</td>\n"," <td>1996-04-30</td>\n"," <td>65.437500</td>\n"," <td>65.562500</td>\n"," <td>65.125000</td>\n"," <td>65.390625</td>\n"," <td>41.525272</td>\n"," <td>184400</td>\n"," <td>0.999284</td>\n"," <td>1</td>\n"," <td>0.999284</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>973</td>\n"," <td>1996-12-04</td>\n"," <td>74.875000</td>\n"," <td>75.062500</td>\n"," <td>74.093750</td>\n"," <td>74.953125</td>\n"," <td>48.096256</td>\n"," <td>2365100</td>\n"," <td>1.001043</td>\n"," <td>2</td>\n"," <td>1.002717</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>2043</td>\n"," <td>2001-03-05</td>\n"," <td>124.150002</td>\n"," <td>124.779999</td>\n"," <td>123.809998</td>\n"," <td>124.739998</td>\n"," <td>84.509819</td>\n"," <td>5293200</td>\n"," <td>1.004752</td>\n"," <td>monday</td>\n"," <td>1.009142</td>\n"," <td>1.0</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" index date open ... 
day_of_week return label\n","0 5215 2013-10-14 169.210007 ... monday 1.003994 0.0\n","1 5924 2016-08-08 218.399994 ... monday 0.999404 0.0\n","2 821 1996-04-30 65.437500 ... 1 0.999284 0.0\n","3 973 1996-12-04 74.875000 ... 2 1.002717 0.0\n","4 2043 2001-03-05 124.150002 ... monday 1.009142 1.0\n","\n","[5 rows x 12 columns]"]},"metadata":{}},{"output_type":"display_data","data":{"text/html":["<div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>volume</th>\n"," <th>open_close_delta</th>\n"," <th>x0_4</th>\n"," <th>x0_monday</th>\n"," <th>x0_1</th>\n"," <th>x0_2</th>\n"," <th>x0_3</th>\n"," <th>x0_up</th>\n"," <th>x0_down</th>\n"," <th>x0_flat</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>1.880130</td>\n"," <td>1.728391</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>0.186182</td>\n"," <td>-1.002947</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>-0.745779</td>\n"," <td>-0.798280</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>3</th>\n"," <td>-0.694616</td>\n"," <td>-0.391867</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," </tr>\n"," <tr>\n"," <th>4</th>\n"," <td>-0.625917</td>\n"," 
<td>0.464703</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>0.0</td>\n"," <td>1.0</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>"],"text/plain":[" volume open_close_delta x0_4 x0_monday ... x0_3 x0_up x0_down x0_flat\n","0 1.880130 1.728391 0.0 1.0 ... 0.0 1.0 0.0 0.0\n","1 0.186182 -1.002947 0.0 1.0 ... 0.0 0.0 0.0 1.0\n","2 -0.745779 -0.798280 0.0 0.0 ... 0.0 0.0 0.0 1.0\n","3 -0.694616 -0.391867 0.0 0.0 ... 0.0 0.0 0.0 1.0\n","4 -0.625917 0.464703 0.0 1.0 ... 0.0 0.0 0.0 1.0\n","\n","[5 rows x 10 columns]"]},"metadata":{}}]},{"cell_type":"markdown","metadata":{"id":"ANUdza9f2cY6"},"source":["## Model"]},{"cell_type":"code","metadata":{"id":"FARmX_UJ2eyk"},"source":["# Feature engineering followed by a classifier; the model step is a placeholder that param_grid replaces.\n","model_pipeline = Pipeline(steps=[\n"," (\"feature_pipeline\", feature_pipeline),\n"," (\"model\", LogisticRegression())\n","])\n","# One grid per estimator so each only receives hyperparameters it understands.\n","param_grid = [\n"," {\n"," \"feature_pipeline__numerical_pipeline__imputer__strategy\": [\"mean\", \"median\"],\n"," \"model\": [LogisticRegression()],\n"," \"model__C\": [0.1, 1.0, 10],\n"," },\n"," {\n"," \"feature_pipeline__numerical_pipeline__imputer__strategy\": [\"mean\", \"median\"],\n"," \"model\": [RandomForestClassifier()],\n"," # max_depth is documented as int or None; scikit-learn releases with parameter validation reject floats.\n"," \"model__max_depth\": [3, 5, 7],\n"," }\n","]\n","# NOTE(review): TimeSeriesSplit assumes rows are in chronological order; the frame previewed above shows unsorted dates, so confirm train_df is sorted by date.\n","grid_search = GridSearchCV(\n"," model_pipeline, \n"," param_grid, \n"," cv=TimeSeriesSplit(n_splits=5),\n"," scoring=\"roc_auc\",\n"," refit=True,\n"," n_jobs=-1\n",")\n","# grid_search"],"execution_count":null,"outputs":[]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"J6CeX92j3g5Y","executionInfo":{"status":"ok","timestamp":1631992807216,"user_tz":240,"elapsed":170846,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"c913d8ab-f4eb-4c39-963e-3c8bb11b4207"},"source":["now = datetime.datetime.now()\n","grid_search.fit(train_df, 
train_df[\"label\"])\n","print(datetime.datetime.now() - now)"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["0:02:50.686868\n"]}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9nWIva1Rpfsk","executionInfo":{"status":"ok","timestamp":1631992807217,"user_tz":240,"elapsed":14,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"c5b65bac-43d7-45ff-d6d2-8c4a15a207c1"},"source":["print(f\"Best params: {grid_search.best_params_}\")\n","print(f\"Best score: {grid_search.best_score_}\")"],"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Best params: {'feature_pipeline__numerical_pipeline__imputer__strategy': 'mean', 'model': LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,\n"," intercept_scaling=1, l1_ratio=None, max_iter=100,\n"," multi_class='auto', n_jobs=None, penalty='l2',\n"," random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n"," warm_start=False), 'model__C': 0.1}\n","Best score: 0.5765670811118563\n"]}]},{"cell_type":"markdown","metadata":{"id":"Aj3up6IG5sey"},"source":["## Metrics"]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"19LMO2kBrx6I","executionInfo":{"status":"ok","timestamp":1631992811877,"user_tz":240,"elapsed":4666,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"659139eb-f9e6-4b91-bb86-375204a53228"},"source":["# Train-set ROC AUC. ROC AUC ranks continuous scores; hard labels from predict() reduce the curve to a single point,\n","# so score with the positive-class probability (column 1 of predict_proba; labels here are 0.0/1.0).\n","# NOTE(review): the stored output predates this change; re-run the cell to refresh it.\n","metrics.roc_auc_score(\n"," y_true=train_df[\"label\"],\n"," y_score=grid_search.predict_proba(train_df)[:, 1],\n"," 
average=\"weighted\"\n",")"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.5041638147080223"]},"metadata":{},"execution_count":22}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"FG6oYfoFpoYF","executionInfo":{"status":"ok","timestamp":1631992812704,"user_tz":240,"elapsed":847,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"2c43416d-28c2-43d1-b4d2-516508eb2ddf"},"source":["# Held-out ROC AUC, scored the same way as the train set: positive-class probability, not hard labels.\n","# NOTE(review): the stored output predates this change; re-run the cell to refresh it.\n","metrics.roc_auc_score(\n"," y_true=test_df[\"label\"],\n"," y_score=grid_search.predict_proba(test_df)[:, 1],\n"," average=\"weighted\"\n",")"],"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0.5009555705544877"]},"metadata":{},"execution_count":23}]},{"cell_type":"markdown","metadata":{"id":"F9ChyNSLjb8T"},"source":["## Export notebook as HTML"]},{"cell_type":"code","metadata":{"id":"_7nuQJ2GaxyM","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1631998376733,"user_tz":240,"elapsed":1680,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"49819423-c1b4-45b4-a87e-fc619b86eed9"},"source":["%%shell\n","jupyter nbconvert --to html '/content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb'"],"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["[NbConvertApp] Converting notebook /content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb to html\n","[NbConvertApp] Writing 338767 bytes to /content/drive/My Drive/Colab 
Notebooks/custom_scikit_pipeline/sklearn_pipe.html\n"]},{"output_type":"execute_result","data":{"text/plain":[""]},"metadata":{},"execution_count":1}]},{"cell_type":"code","metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"jKfbk43ueWB_","executionInfo":{"status":"ok","timestamp":1631998488759,"user_tz":240,"elapsed":1388,"user":{"displayName":"Adam Novotny","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14GgMOZAxPPEuLDBB3LpK2USLaQVH2rbEb08f2sV-=s64","userId":"10515788909603796811"}},"outputId":"2e937b6f-9d60-463b-d268-edf63192ab0e"},"source":["%%shell\n","# ### html with outputs\n","jupyter nbconvert --to html --no-input --no-prompt '/content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb' --output sklearn_pipe_no_code.html"],"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["[NbConvertApp] Converting notebook /content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe.ipynb to html\n","[NbConvertApp] Writing 299986 bytes to /content/drive/My Drive/Colab Notebooks/custom_scikit_pipeline/sklearn_pipe_no_code.html\n"]},{"output_type":"execute_result","data":{"text/plain":[""]},"metadata":{},"execution_count":2}]},{"cell_type":"code","metadata":{"id":"3th4ahEqe3Wd"},"source":[""],"execution_count":null,"outputs":[]}]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment