Skip to content

Instantly share code, notes, and snippets.

@EKami
Created January 4, 2017 11:33
Show Gist options
  • Save EKami/593e26dbd79225ed30acf2df0fc5c4f7 to your computer and use it in GitHub Desktop.
Save EKami/593e26dbd79225ed30acf2df0fc5c4f7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false,
"outputExpanded": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>PassengerId</th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Name</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Ticket</th>\n",
" <th>Fare</th>\n",
" <th>Cabin</th>\n",
" <th>Embarked</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Braund, Mr. Owen Harris</td>\n",
" <td>male</td>\n",
" <td>22.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>A/5 21171</td>\n",
" <td>7.2500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
" <td>female</td>\n",
" <td>38.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>PC 17599</td>\n",
" <td>71.2833</td>\n",
" <td>C85</td>\n",
" <td>C</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>Heikkinen, Miss. Laina</td>\n",
" <td>female</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>STON/O2. 3101282</td>\n",
" <td>7.9250</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
" <td>female</td>\n",
" <td>35.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>113803</td>\n",
" <td>53.1000</td>\n",
" <td>C123</td>\n",
" <td>S</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>Allen, Mr. William Henry</td>\n",
" <td>male</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>373450</td>\n",
" <td>8.0500</td>\n",
" <td>NaN</td>\n",
" <td>S</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
"4 Allen, Mr. William Henry male 35.0 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
"3 0 113803 53.1000 C123 S \n",
"4 0 373450 8.0500 NaN S "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"%matplotlib inline\n",
"\n",
"# Load the training and test passenger tables from the input directory\n",
"df_train, df_test = (\n",
"    pd.read_csv(csv_path) for csv_path in ('../input/train.csv', '../input/test.csv')\n",
")\n",
"df_train.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def get_median_age(df):\n",
"    \"\"\"Compute the median age of every (Gender, Pclass) group.\n",
"\n",
"    Reads the 'Gender', 'Pclass' and 'Age' columns of `df` and returns a\n",
"    2x3 array whose entry [g, c] is the median of the non-null ages of the\n",
"    rows with Gender == g and Pclass == c + 1.\n",
"    \"\"\"\n",
"    medians = np.zeros((2, 3))\n",
"    for gender in range(2):\n",
"        same_gender = df['Gender'] == gender\n",
"        for class_idx in range(3):\n",
"            in_group = same_gender & (df['Pclass'] == class_idx + 1)\n",
"            medians[gender, class_idx] = df.loc[in_group, 'Age'].dropna().median()\n",
"    return medians"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"outputExpanded": false
},
"outputs": [],
"source": [
"def get_feature_engineered_df(df):\n",
"    \"\"\"Return a numeric, model-ready copy of a raw passenger frame.\n",
"\n",
"    Adds Gender (female=0, male=1), AgeFill (Age with gaps replaced by the\n",
"    median age of the same Gender/Pclass group), AgeIsNull, FamilySize\n",
"    (SibSp + Parch) and Age*Class, then drops the raw text columns.\n",
"\n",
"    The raw Age column is dropped as well: it still holds the NaNs that\n",
"    AgeFill replaces, so keeping it makes a later dropna() discard every\n",
"    passenger with a missing age and leaves AgeFill/AgeIsNull without effect.\n",
"    The input frame is not modified.\n",
"    \"\"\"\n",
"    df = df.copy()\n",
"    df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)\n",
"\n",
"    # Embarked is not encoded here because the column is dropped below anyway\n",
"    median_ages = get_median_age(df)\n",
"    df['AgeFill'] = df['Age']\n",
"    for i in range(0, 2):\n",
"        for j in range(0, 3):\n",
"            # Only rows with a missing age receive their group's median\n",
"            df.loc[(df.Age.isnull()) & (df.Gender == i) & (df.Pclass == j+1), 'AgeFill'] = median_ages[i, j]\n",
"\n",
"    df['AgeIsNull'] = df['Age'].isnull().astype(int)\n",
"    df['FamilySize'] = df['SibSp'] + df['Parch']\n",
"    df['Age*Class'] = df.AgeFill * df.Pclass\n",
"    return df.drop(['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Clean the test and training data"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false,
"outputExpanded": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Fare</th>\n",
" <th>Gender</th>\n",
" <th>AgeFill</th>\n",
" <th>AgeIsNull</th>\n",
" <th>FamilySize</th>\n",
" <th>Age*Class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>22.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>7.2500</td>\n",
" <td>1</td>\n",
" <td>22.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>66.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>38.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>71.2833</td>\n",
" <td>0</td>\n",
" <td>38.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>38.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>7.9250</td>\n",
" <td>0</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>78.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>35.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>53.1000</td>\n",
" <td>0</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>35.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8.0500</td>\n",
" <td>1</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>105.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Age SibSp Parch Fare Gender AgeFill AgeIsNull \\\n",
"0 0 3 22.0 1 0 7.2500 1 22.0 0 \n",
"1 1 1 38.0 1 0 71.2833 0 38.0 0 \n",
"2 1 3 26.0 0 0 7.9250 0 26.0 0 \n",
"3 1 1 35.0 1 0 53.1000 0 35.0 0 \n",
"4 0 3 35.0 0 0 8.0500 1 35.0 0 \n",
"\n",
" FamilySize Age*Class \n",
"0 1 66.0 \n",
"1 1 38.0 \n",
"2 0 78.0 \n",
"3 1 35.0 \n",
"4 0 105.0 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Engineer features on copies so the raw frames stay untouched\n",
"test_data = get_feature_engineered_df(df_test.copy())\n",
"# Training rows that still contain a missing value are discarded\n",
"train_data = get_feature_engineered_df(df_train.copy()).dropna()\n",
"train_data.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Check that our training data does not contain any values (NaN or infinite) that our machine learning algorithm cannot fit"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False\n",
"True\n"
]
}
],
"source": [
"# Expect False (no NaN anywhere) followed by True (every value is finite)\n",
"print(np.isnan(train_data.values).any())\n",
"print(np.isfinite(train_data.values).all())"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Import the random forest package\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"# Create the random forest object which will include all the parameters\n",
"# for the fit; the fixed seed makes a re-run reproduce the same forest\n",
"forest = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"\n",
"# Every column except the label is a feature (`.loc` replaces the removed `.ix` indexer)\n",
"X_train = train_data.loc[:, train_data.columns != 'Survived']\n",
"y_train = train_data['Survived']\n",
"\n",
"# Keep the test features in the training column order and fill the remaining\n",
"# gaps with the training medians instead of an out-of-range sentinel\n",
"X_test = test_data[X_train.columns].fillna(X_train.median())\n",
"\n",
"# Fit the training data to the Survived labels and create the decision trees\n",
"forest = forest.fit(X_train, y_train)\n",
"\n",
"# Predict for every test row; no row is dropped, so the predictions stay\n",
"# aligned one-to-one with the rows of df_test\n",
"y_pred = forest.predict(X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create our final dataframe of predictions"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" </tr>\n",
" <tr>\n",
" <th>PassengerId</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>892</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>893</th>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>894</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>895</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>896</th>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived\n",
"PassengerId \n",
"892 0\n",
"893 0\n",
"894 1\n",
"895 1\n",
"896 0"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per test passenger: PassengerId as the index, the prediction as the only column\n",
"df_ret = (\n",
"    df_test[['PassengerId']]\n",
"    .assign(Survived=y_pred)\n",
"    .set_index('PassengerId')\n",
")\n",
"df_ret.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now write our predictions to a CSV file so that it can be uploaded to Kaggle"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# PassengerId is the index, so it is written out as the first CSV column\n",
"df_ret.to_csv('predictions.csv', index=True)"
]
}
],
"metadata": {
"kernel_info": {
"name": "ipytensorflow"
},
"kernelspec": {
"display_name": "Python (tensorflow)",
"language": "python",
"name": "ipytensorflow"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment