Skip to content

Instantly share code, notes, and snippets.

@markpvoice
Last active October 4, 2020 21:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save markpvoice/f08717068baa46852a88bc5078d43e2e to your computer and use it in GitHub Desktop.
Save markpvoice/f08717068baa46852a88bc5078d43e2e to your computer and use it in GitHub Desktop.
notebooks/titanic-ml.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import fastai; fastai.__version__",
"execution_count": 1,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 1,
"data": {
"text/plain": "'2.0.10'"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import numpy as np \nimport pandas as pd ",
"execution_count": 2,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from fastcore import *",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from fastai.tabular.all import *",
"execution_count": 4,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from sklearn.ensemble import RandomForestClassifier",
"execution_count": 5,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#import kaggle",
"execution_count": 6,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#!kaggle competitions download -c titanic",
"execution_count": 7,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "import os",
"execution_count": 8,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#print('getcwd: ', os.getcwd())",
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "cwd = os.getcwd()",
"execution_count": 10,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "path = Path(cwd)",
"execution_count": 11,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "path = path/'data/titanic'",
"execution_count": 12,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#path.ls()",
"execution_count": 13,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "train_data = pd.read_csv(path/'train.csv')",
"execution_count": 14,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "train_data.head()",
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 15,
"data": {
"text/plain": " PassengerId Survived Pclass \\\n0 1 0 3 \n1 2 1 1 \n2 3 1 3 \n3 4 1 1 \n4 5 0 3 \n\n Name Sex Age SibSp \\\n0 Braund, Mr. Owen Harris male 22.0 1 \n1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38.0 1 \n2 Heikkinen, Miss. Laina female 26.0 0 \n3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n4 Allen, Mr. William Henry male 35.0 0 \n\n Parch Ticket Fare Cabin Embarked \n0 0 A/5 21171 7.2500 NaN S \n1 0 PC 17599 71.2833 C85 C \n2 0 STON/O2. 3101282 7.9250 NaN S \n3 0 113803 53.1000 C123 S \n4 0 373450 8.0500 NaN S ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PassengerId</th>\n <th>Survived</th>\n <th>Pclass</th>\n <th>Name</th>\n <th>Sex</th>\n <th>Age</th>\n <th>SibSp</th>\n <th>Parch</th>\n <th>Ticket</th>\n <th>Fare</th>\n <th>Cabin</th>\n <th>Embarked</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>Braund, Mr. Owen Harris</td>\n <td>male</td>\n <td>22.0</td>\n <td>1</td>\n <td>0</td>\n <td>A/5 21171</td>\n <td>7.2500</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>Cumings, Mrs. John Bradley (Florence Briggs Thayer)</td>\n <td>female</td>\n <td>38.0</td>\n <td>1</td>\n <td>0</td>\n <td>PC 17599</td>\n <td>71.2833</td>\n <td>C85</td>\n <td>C</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>1</td>\n <td>3</td>\n <td>Heikkinen, Miss. Laina</td>\n <td>female</td>\n <td>26.0</td>\n <td>0</td>\n <td>0</td>\n <td>STON/O2. 3101282</td>\n <td>7.9250</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>1</td>\n <td>1</td>\n <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n <td>female</td>\n <td>35.0</td>\n <td>1</td>\n <td>0</td>\n <td>113803</td>\n <td>53.1000</td>\n <td>C123</td>\n <td>S</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>0</td>\n <td>3</td>\n <td>Allen, Mr. William Henry</td>\n <td>male</td>\n <td>35.0</td>\n <td>0</td>\n <td>0</td>\n <td>373450</td>\n <td>8.0500</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#from pandas_profiling import ProfileReport\n#prof = ProfileReport(train_data)\n#prof.to_file(output_file='output.html')",
"execution_count": 16,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "test_data = pd.read_csv(path/'test.csv')",
"execution_count": 17,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "test_data.head()",
"execution_count": 18,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 18,
"data": {
"text/plain": " PassengerId Pclass Name Sex \\\n0 892 3 Kelly, Mr. James male \n1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n2 894 2 Myles, Mr. Thomas Francis male \n3 895 3 Wirz, Mr. Albert male \n4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n\n Age SibSp Parch Ticket Fare Cabin Embarked \n0 34.5 0 0 330911 7.8292 NaN Q \n1 47.0 1 0 363272 7.0000 NaN S \n2 62.0 0 0 240276 9.6875 NaN Q \n3 27.0 0 0 315154 8.6625 NaN S \n4 22.0 1 1 3101298 12.2875 NaN S ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PassengerId</th>\n <th>Pclass</th>\n <th>Name</th>\n <th>Sex</th>\n <th>Age</th>\n <th>SibSp</th>\n <th>Parch</th>\n <th>Ticket</th>\n <th>Fare</th>\n <th>Cabin</th>\n <th>Embarked</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>892</td>\n <td>3</td>\n <td>Kelly, Mr. James</td>\n <td>male</td>\n <td>34.5</td>\n <td>0</td>\n <td>0</td>\n <td>330911</td>\n <td>7.8292</td>\n <td>NaN</td>\n <td>Q</td>\n </tr>\n <tr>\n <th>1</th>\n <td>893</td>\n <td>3</td>\n <td>Wilkes, Mrs. James (Ellen Needs)</td>\n <td>female</td>\n <td>47.0</td>\n <td>1</td>\n <td>0</td>\n <td>363272</td>\n <td>7.0000</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n <tr>\n <th>2</th>\n <td>894</td>\n <td>2</td>\n <td>Myles, Mr. Thomas Francis</td>\n <td>male</td>\n <td>62.0</td>\n <td>0</td>\n <td>0</td>\n <td>240276</td>\n <td>9.6875</td>\n <td>NaN</td>\n <td>Q</td>\n </tr>\n <tr>\n <th>3</th>\n <td>895</td>\n <td>3</td>\n <td>Wirz, Mr. Albert</td>\n <td>male</td>\n <td>27.0</td>\n <td>0</td>\n <td>0</td>\n <td>315154</td>\n <td>8.6625</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n <tr>\n <th>4</th>\n <td>896</td>\n <td>3</td>\n <td>Hirvonen, Mrs. Alexander (Helga E Lindqvist)</td>\n <td>female</td>\n <td>22.0</td>\n <td>1</td>\n <td>1</td>\n <td>3101298</td>\n <td>12.2875</td>\n <td>NaN</td>\n <td>S</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "dep_var = 'Survived'",
"execution_count": 19,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "cat_vars = ['PassengerId', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']",
"execution_count": 20,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "cont_vars = ['Age', 'Fare']",
"execution_count": 21,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#cont,cat = cont_cat_split(train_data, 1, dep_var=dep_var)",
"execution_count": 22,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "procs = [Categorify, FillMissing, Normalize]",
"execution_count": 23,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "len(train_data)",
"execution_count": 24,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 24,
"data": {
"text/plain": "891"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "splits = IndexSplitter(list(range(710,891)))(range_of(train_data))",
"execution_count": 25,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#splits = RandomSplitter(valid_pct=0.2)(range_of(train_data))",
"execution_count": 26,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "#splits = RandomSplitter()(range_of(train_data))",
"execution_count": 27,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "to = TabularPandas(train_data, procs, cat_vars, cont_vars, y_names=dep_var, splits=splits)",
"execution_count": 29,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "to.show()",
"execution_count": 30,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PassengerId</th>\n <th>Pclass</th>\n <th>Sex</th>\n <th>SibSp</th>\n <th>Parch</th>\n <th>Cabin</th>\n <th>Embarked</th>\n <th>Age_na</th>\n <th>Age</th>\n <th>Fare</th>\n <th>Survived</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>3</td>\n <td>male</td>\n <td>1</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>22.0</td>\n <td>7.250000</td>\n <td>0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>1</td>\n <td>female</td>\n <td>1</td>\n <td>0</td>\n <td>C85</td>\n <td>C</td>\n <td>False</td>\n <td>38.0</td>\n <td>71.283302</td>\n <td>1</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>3</td>\n <td>female</td>\n <td>0</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>26.0</td>\n <td>7.925000</td>\n <td>1</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>1</td>\n <td>female</td>\n <td>1</td>\n <td>0</td>\n <td>C123</td>\n <td>S</td>\n <td>False</td>\n <td>35.0</td>\n <td>53.099998</td>\n <td>1</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>3</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>35.0</td>\n <td>8.050000</td>\n <td>0</td>\n </tr>\n <tr>\n <th>5</th>\n <td>6</td>\n <td>3</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>#na#</td>\n <td>Q</td>\n <td>True</td>\n <td>28.0</td>\n <td>8.458300</td>\n <td>0</td>\n </tr>\n <tr>\n <th>6</th>\n <td>7</td>\n <td>1</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>E46</td>\n <td>S</td>\n <td>False</td>\n <td>54.0</td>\n <td>51.862499</td>\n <td>0</td>\n </tr>\n <tr>\n <th>7</th>\n <td>8</td>\n <td>3</td>\n <td>male</td>\n <td>3</td>\n <td>1</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>2.0</td>\n <td>21.075001</td>\n <td>0</td>\n </tr>\n <tr>\n <th>8</th>\n <td>9</td>\n <td>3</td>\n <td>female</td>\n <td>0</td>\n <td>2</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>27.0</td>\n <td>11.133300</td>\n <td>1</td>\n </tr>\n <tr>\n <th>9</th>\n <td>10</td>\n <td>2</td>\n <td>female</td>\n <td>1</td>\n <td>0</td>\n <td>#na#</td>\n <td>C</td>\n <td>False</td>\n <td>14.0</td>\n <td>30.070801</td>\n <td>1</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "to.valid.xs #I believe PassengerId is set to 0/#na# by the Categorify proc",
"execution_count": 31,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 31,
"data": {
"text/plain": " PassengerId Pclass Sex SibSp Parch Cabin Embarked Age_na \\\n710 0 1 1 1 1 85 1 1 \n711 0 1 2 1 1 57 3 2 \n712 0 1 2 2 1 59 3 1 \n713 0 3 2 1 1 0 3 1 \n714 0 2 2 1 1 0 3 1 \n.. ... ... ... ... ... ... ... ... \n886 0 2 2 1 1 0 3 1 \n887 0 1 1 1 1 31 3 1 \n888 0 3 1 2 3 0 3 2 \n889 0 1 2 1 1 61 1 1 \n890 0 3 2 1 1 0 2 1 \n\n Age Fare \n710 -0.431126 0.349269 \n711 -0.124355 -0.122048 \n712 1.409496 0.400515 \n713 -0.047663 -0.472477 \n714 1.716266 -0.400269 \n.. ... ... \n886 -0.201048 -0.400269 \n887 -0.814588 -0.051210 \n888 -0.124355 -0.185700 \n889 -0.277740 -0.051210 \n890 0.182415 -0.508067 \n\n[181 rows x 10 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PassengerId</th>\n <th>Pclass</th>\n <th>Sex</th>\n <th>SibSp</th>\n <th>Parch</th>\n <th>Cabin</th>\n <th>Embarked</th>\n <th>Age_na</th>\n <th>Age</th>\n <th>Fare</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>710</th>\n <td>0</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>85</td>\n <td>1</td>\n <td>1</td>\n <td>-0.431126</td>\n <td>0.349269</td>\n </tr>\n <tr>\n <th>711</th>\n <td>0</td>\n <td>1</td>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>57</td>\n <td>3</td>\n <td>2</td>\n <td>-0.124355</td>\n <td>-0.122048</td>\n </tr>\n <tr>\n <th>712</th>\n <td>0</td>\n <td>1</td>\n <td>2</td>\n <td>2</td>\n <td>1</td>\n <td>59</td>\n <td>3</td>\n <td>1</td>\n <td>1.409496</td>\n <td>0.400515</td>\n </tr>\n <tr>\n <th>713</th>\n <td>0</td>\n <td>3</td>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>1</td>\n <td>-0.047663</td>\n <td>-0.472477</td>\n </tr>\n <tr>\n <th>714</th>\n <td>0</td>\n <td>2</td>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>1</td>\n <td>1.716266</td>\n <td>-0.400269</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>886</th>\n <td>0</td>\n <td>2</td>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>1</td>\n <td>-0.201048</td>\n <td>-0.400269</td>\n </tr>\n <tr>\n <th>887</th>\n <td>0</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>31</td>\n <td>3</td>\n <td>1</td>\n <td>-0.814588</td>\n <td>-0.051210</td>\n </tr>\n <tr>\n <th>888</th>\n <td>0</td>\n <td>3</td>\n <td>1</td>\n <td>2</td>\n <td>3</td>\n <td>0</td>\n <td>3</td>\n <td>2</td>\n <td>-0.124355</td>\n <td>-0.185700</td>\n </tr>\n <tr>\n <th>889</th>\n <td>0</td>\n <td>1</td>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>61</td>\n <td>1</td>\n <td>1</td>\n <td>-0.277740</td>\n <td>-0.051210</td>\n </tr>\n <tr>\n <th>890</th>\n <td>0</td>\n <td>3</td>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>2</td>\n <td>1</td>\n <td>0.182415</td>\n <td>-0.508067</td>\n </tr>\n </tbody>\n</table>\n<p>181 rows × 10 columns</p>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "dls = to.dataloaders(bs=64)",
"execution_count": 32,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "dls.train.xs",
"execution_count": 33,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 33,
"data": {
"text/plain": " PassengerId Pclass Sex SibSp Parch Cabin Embarked Age_na \\\n0 1 3 2 2 1 0 3 1 \n1 2 1 1 2 1 82 1 1 \n2 3 3 1 1 1 0 3 1 \n3 4 1 1 2 1 56 3 1 \n4 5 3 2 1 1 0 3 1 \n.. ... ... ... ... ... ... ... ... \n705 706 2 2 1 1 0 3 1 \n706 707 2 1 1 1 0 3 1 \n707 708 1 2 1 1 121 3 1 \n708 709 1 1 1 1 0 3 1 \n709 710 3 2 2 2 0 1 2 \n\n Age Fare \n0 -0.584511 -0.518334 \n1 0.642570 0.796457 \n2 -0.277740 -0.504474 \n3 0.412492 0.423101 \n4 0.412492 -0.501907 \n.. ... ... \n705 0.719263 -0.133341 \n706 1.179418 -0.390003 \n707 0.949340 -0.127438 \n708 -0.584511 2.444567 \n709 -0.124355 -0.354156 \n\n[710 rows x 10 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PassengerId</th>\n <th>Pclass</th>\n <th>Sex</th>\n <th>SibSp</th>\n <th>Parch</th>\n <th>Cabin</th>\n <th>Embarked</th>\n <th>Age_na</th>\n <th>Age</th>\n <th>Fare</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>1</td>\n <td>3</td>\n <td>2</td>\n <td>2</td>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>1</td>\n <td>-0.584511</td>\n <td>-0.518334</td>\n </tr>\n <tr>\n <th>1</th>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>2</td>\n <td>1</td>\n <td>82</td>\n <td>1</td>\n <td>1</td>\n <td>0.642570</td>\n <td>0.796457</td>\n </tr>\n <tr>\n <th>2</th>\n <td>3</td>\n <td>3</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>1</td>\n <td>-0.277740</td>\n <td>-0.504474</td>\n </tr>\n <tr>\n <th>3</th>\n <td>4</td>\n <td>1</td>\n <td>1</td>\n <td>2</td>\n <td>1</td>\n <td>56</td>\n <td>3</td>\n <td>1</td>\n <td>0.412492</td>\n <td>0.423101</td>\n </tr>\n <tr>\n <th>4</th>\n <td>5</td>\n <td>3</td>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>1</td>\n <td>0.412492</td>\n <td>-0.501907</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>705</th>\n <td>706</td>\n <td>2</td>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>1</td>\n <td>0.719263</td>\n <td>-0.133341</td>\n </tr>\n <tr>\n <th>706</th>\n <td>707</td>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>1</td>\n <td>1.179418</td>\n <td>-0.390003</td>\n </tr>\n <tr>\n <th>707</th>\n <td>708</td>\n <td>1</td>\n <td>2</td>\n <td>1</td>\n <td>1</td>\n <td>121</td>\n <td>3</td>\n <td>1</td>\n <td>0.949340</td>\n <td>-0.127438</td>\n </tr>\n <tr>\n <th>708</th>\n <td>709</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>1</td>\n <td>0</td>\n <td>3</td>\n <td>1</td>\n <td>-0.584511</td>\n <td>2.444567</td>\n </tr>\n <tr>\n <th>709</th>\n <td>710</td>\n <td>3</td>\n <td>2</td>\n <td>2</td>\n <td>2</td>\n <td>0</td>\n <td>1</td>\n <td>2</td>\n <td>-0.124355</td>\n <td>-0.354156</td>\n </tr>\n </tbody>\n</table>\n<p>710 rows × 10 columns</p>\n</div>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "dls.valid.show_batch() #I believe PassengerId is set to 0/#na# by the Categorify proc in TabularPandas",
"execution_count": 34,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PassengerId</th>\n <th>Pclass</th>\n <th>Sex</th>\n <th>SibSp</th>\n <th>Parch</th>\n <th>Cabin</th>\n <th>Embarked</th>\n <th>Age_na</th>\n <th>Age</th>\n <th>Fare</th>\n <th>Survived</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>#na#</td>\n <td>1</td>\n <td>female</td>\n <td>0</td>\n <td>0</td>\n <td>C90</td>\n <td>C</td>\n <td>False</td>\n <td>24.000000</td>\n <td>49.504200</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>#na#</td>\n <td>1</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>C124</td>\n <td>S</td>\n <td>True</td>\n <td>28.000000</td>\n <td>26.549999</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>#na#</td>\n <td>1</td>\n <td>male</td>\n <td>1</td>\n <td>0</td>\n <td>C126</td>\n <td>S</td>\n <td>False</td>\n <td>48.000000</td>\n <td>52.000000</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>#na#</td>\n <td>3</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>29.000000</td>\n <td>9.483300</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>#na#</td>\n <td>2</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>52.000001</td>\n <td>12.999999</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>5</th>\n <td>#na#</td>\n <td>3</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>F G73</td>\n <td>S</td>\n <td>False</td>\n <td>19.000000</td>\n <td>7.649999</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>6</th>\n <td>#na#</td>\n <td>1</td>\n <td>female</td>\n <td>0</td>\n <td>0</td>\n <td>C45</td>\n <td>C</td>\n <td>False</td>\n <td>38.000000</td>\n <td>227.525004</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>7</th>\n <td>#na#</td>\n <td>2</td>\n <td>female</td>\n <td>0</td>\n <td>0</td>\n <td>E101</td>\n <td>S</td>\n <td>False</td>\n <td>27.000000</td>\n <td>10.499999</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>8</th>\n <td>#na#</td>\n <td>3</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>#na#</td>\n <td>Q</td>\n <td>True</td>\n <td>28.000000</td>\n <td>15.500001</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>9</th>\n <td>#na#</td>\n <td>3</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>33.000000</td>\n <td>7.775000</td>\n <td>0.0</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "dls.show_batch()",
"execution_count": 35,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>PassengerId</th>\n <th>Pclass</th>\n <th>Sex</th>\n <th>SibSp</th>\n <th>Parch</th>\n <th>Cabin</th>\n <th>Embarked</th>\n <th>Age_na</th>\n <th>Age</th>\n <th>Fare</th>\n <th>Survived</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>453</td>\n <td>1</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>C111</td>\n <td>C</td>\n <td>False</td>\n <td>30.000000</td>\n <td>27.750000</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>1</th>\n <td>168</td>\n <td>3</td>\n <td>female</td>\n <td>1</td>\n <td>4</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>45.000001</td>\n <td>27.900000</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>2</th>\n <td>41</td>\n <td>3</td>\n <td>female</td>\n <td>1</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>40.000000</td>\n <td>9.475000</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>3</th>\n <td>105</td>\n <td>3</td>\n <td>male</td>\n <td>2</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>37.000000</td>\n <td>7.925001</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>4</th>\n <td>50</td>\n <td>3</td>\n <td>female</td>\n <td>1</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>18.000000</td>\n <td>17.799999</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>5</th>\n <td>21</td>\n <td>2</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>35.000000</td>\n <td>26.000000</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>6</th>\n <td>305</td>\n <td>3</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>True</td>\n <td>28.000000</td>\n <td>8.049999</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>7</th>\n <td>208</td>\n <td>3</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>#na#</td>\n <td>C</td>\n <td>False</td>\n <td>26.000000</td>\n <td>18.787500</td>\n <td>1.0</td>\n </tr>\n <tr>\n <th>8</th>\n <td>214</td>\n <td>2</td>\n <td>male</td>\n <td>0</td>\n <td>0</td>\n <td>#na#</td>\n <td>S</td>\n <td>False</td>\n <td>30.000000</td>\n <td>12.999999</td>\n <td>0.0</td>\n </tr>\n <tr>\n <th>9</th>\n <td>40</td>\n <td>3</td>\n <td>female</td>\n <td>1</td>\n <td>0</td>\n <td>#na#</td>\n <td>C</td>\n <td>False</td>\n <td>14.000000</td>\n <td>11.241700</td>\n <td>1.0</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "learn = tabular_learner(dls, layers=[200,100], metrics=accuracy)",
"execution_count": 36,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "learn.lr_find()",
"execution_count": 37,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": ""
},
"metadata": {}
},
{
"output_type": "execute_result",
"execution_count": 37,
"data": {
"text/plain": "SuggestedLRs(lr_min=0.010000000149011612, lr_steep=0.0010000000474974513)"
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": "<Figure size 432x288 with 1 Axes>",
"image/png": "\n"
},
"metadata": {
"needs_background": "light"
}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "learn.fit(15, 1e-2/2)",
"execution_count": 38,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>epoch</th>\n <th>train_loss</th>\n <th>valid_loss</th>\n <th>accuracy</th>\n <th>time</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <td>0</td>\n <td>0.269739</td>\n <td>0.266663</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>1</td>\n <td>0.191999</td>\n <td>0.249106</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>2</td>\n <td>0.133725</td>\n <td>0.237784</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>3</td>\n <td>0.094445</td>\n <td>0.180118</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>4</td>\n <td>0.068655</td>\n <td>0.146765</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>5</td>\n <td>0.052655</td>\n <td>0.131560</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>6</td>\n <td>0.040959</td>\n <td>0.130345</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>7</td>\n <td>0.032211</td>\n <td>0.133559</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>8</td>\n <td>0.025696</td>\n <td>0.133039</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>9</td>\n <td>0.020668</td>\n <td>0.134102</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>10</td>\n <td>0.016920</td>\n <td>0.141724</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>11</td>\n <td>0.014150</td>\n <td>0.139222</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>12</td>\n <td>0.011672</td>\n <td>0.143290</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>13</td>\n <td>0.010044</td>\n <td>0.145464</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n <tr>\n <td>14</td>\n <td>0.008688</td>\n <td>0.144896</td>\n <td>0.640884</td>\n <td>00:00</td>\n </tr>\n </tbody>\n</table>"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "X_train, y_train = to.train.xs, to.train.y",
"execution_count": 39,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "X_test, y_test = to.valid.xs, to.valid.y",
"execution_count": 40,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "model = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=1)",
"execution_count": 41,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "model.fit(X_train, y_train)",
"execution_count": 42,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 42,
"data": {
"text/plain": "RandomForestClassifier(max_depth=10, n_estimators=500, random_state=1)"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "model.score(X_train, y_train)",
"execution_count": 43,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 43,
"data": {
"text/plain": "0.9535211267605633"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "predictions = model.predict(X_test)",
"execution_count": 44,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "predictions.shape",
"execution_count": 45,
"outputs": [
{
"output_type": "execute_result",
"execution_count": 45,
"data": {
"text/plain": "(181,)"
},
"metadata": {}
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "",
"execution_count": null,
"outputs": []
}
],
"metadata": {
"_draft": {
"nbviewer_url": "https://gist.github.com/f08717068baa46852a88bc5078d43e2e"
},
"gist": {
"id": "f08717068baa46852a88bc5078d43e2e",
"data": {
"description": "notebooks/titanic-ml.ipynb",
"public": true
}
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.7.9",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment