h5li/TitanicSolutionNotebook.ipynb

## TitanicSolutionNotebook.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Titanic Predictor\n",
    "\n",
    "This code takes csv data from the Kaggle website, cleans it, and runs it through a random forest classifier."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn as sk\n",
    "import matplotlib.pyplot as plt\n",
    "# Load scikit's random forest classifier library\n",
    "from sklearn.ensemble import RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read in the training and testing CSVs. The paths are handed straight to pandas so it\n",
    "# opens and closes the files itself (bare open() calls left the file handles unclosed).\n",
    "train_df = pd.read_csv('train.csv')\n",
    "# NOTE(review): the test frame is also read from 'train.csv', so every later 'test' step\n",
    "# runs on the training rows. Kaggle's held-out file is presumably 'test.csv' -- confirm,\n",
    "# and re-check which columns are fed to the classifier before switching the path.\n",
    "test_df = pd.read_csv('train.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Clean the train dataset\n",
    "# Drop the name, ticket, cabin, and embarked columns (PassengerId is dropped in a later cell)\n",
    "train_df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)\n",
    "# One-hot encode gender as two boolean columns\n",
    "train_df['m'] = (train_df['Sex'] == 'male')\n",
    "train_df['f'] = (train_df['Sex'] == 'female')\n",
    "train_df.drop(['Sex'], axis=1, inplace=True)\n",
    "# Replace NaN values in the Age column only with the average age; a frame-wide\n",
    "# fillna would also write the mean age into any other column that has gaps\n",
    "train_mean_age = train_df['Age'].mean()\n",
    "train_df['Age'] = train_df['Age'].fillna(train_mean_age)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Clean the test dataset in the same way as the train dataset\n",
    "# Drop the name, ticket, cabin, and embarked columns (PassengerId is kept for the output file)\n",
    "test_df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)\n",
    "# One-hot encode gender as two boolean columns\n",
    "test_df['m'] = (test_df['Sex'] == 'male')\n",
    "test_df['f'] = (test_df['Sex'] == 'female')\n",
    "test_df.drop(['Sex'], axis=1, inplace=True)\n",
    "# Replace NaN values in the Age column only with the average age; a frame-wide\n",
    "# fillna would also write the mean age into other columns with gaps (e.g. Fare)\n",
    "test_mean_age = test_df['Age'].mean()\n",
    "test_df['Age'] = test_df['Age'].fillna(test_mean_age)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Fill NaN values in the Fare column with the mean fare; only the test dataset needs this\n",
    "# as it has some missing values. The result is assigned back rather than calling\n",
    "# fillna(inplace=True) on the selected column, which can act on a temporary copy and\n",
    "# leave test_df unchanged.\n",
    "test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>m</th>\n",
       "      <th>f</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass   Age  SibSp  Parch     Fare      m      f\n",
       "0         0       3  22.0      1      0   7.2500   True  False\n",
       "1         1       1  38.0      1      0  71.2833  False   True\n",
       "2         1       3  26.0      0      0   7.9250  False   True\n",
       "3         1       1  35.0      1      0  53.1000  False   True\n",
       "4         0       3  35.0      0      0   8.0500   True  False"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop the PassengerId column from the training dataset as we will not need it for training\n",
    "# (Note: keep it in the test dataset though -- it labels the rows of the output file).\n",
    "# errors='ignore' keeps this cell safe to re-run once the column is already gone.\n",
    "train_df.drop(['PassengerId'], axis=1, inplace=True, errors='ignore')\n",
    "train_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Build the random forest model; clf is the conventional short name for a classifier.\n",
    "# random_state fixes the seed and n_jobs is the number of parallel jobs requested.\n",
    "clf = RandomForestClassifier(\n",
    "    random_state=0,\n",
    "    n_jobs=2,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# y holds the survival data; this is what we want to predict -- the label\n",
    "y = train_df['Survived']\n",
    "# x holds every remaining column. Dropping the label by name (rather than slicing off\n",
    "# column 0) keeps Survived out of the features even if the column order changes.\n",
    "x = train_df.drop(['Survived'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Train the Classifier to take the training features and learn how they relate\n",
    "# to the training y (the Survived label)\n",
    "clf.fit(x, y);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Apply the Classifier we trained to the test data. The feature columns are selected by\n",
    "# the names the model was fitted on, so the input always matches x regardless of which\n",
    "# extra columns (PassengerId, Survived) test_df carries or where they sit.\n",
    "survive = clf.predict(test_df[x.columns])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Pair each PassengerId with its prediction. Wrapping zip() in list() materialises the\n",
    "# pairs so they can be consumed more than once (on Python 3, zip() is a one-shot iterator).\n",
    "final = list(zip(test_df['PassengerId'], survive))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Turn the (PassengerId, prediction) pairs from the previous cell into a two-column dataframe\n",
    "final_prediction = pd.DataFrame(\n",
    "    data=final,\n",
    "    columns=['PassengerId', 'Survived'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write the predictions to disk as CSV, leaving out the dataframe index column\n",
    "final_prediction.to_csv(\n",
    "    path_or_buf='final_prediction8.csv',\n",
    "    index=False,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Titanic Predictor\n",
	"\n",
	"This code takes csv data from the Kaggle website, cleans it, and runs it through a random forest classifier."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 23,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"import sklearn as sk\n",
	"import matplotlib.pyplot as plt\n",
	"# Load scikit's random forest classifier library\n",
	"from sklearn.ensemble import RandomForestClassifier"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 24,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Read in the training and testing CSVs. The paths are handed straight to pandas so it\n",
	"# opens and closes the files itself (bare open() calls left the file handles unclosed).\n",
	"train_df = pd.read_csv('train.csv')\n",
	"# NOTE(review): the test frame is also read from 'train.csv', so every later 'test' step\n",
	"# runs on the training rows. Kaggle's held-out file is presumably 'test.csv' -- confirm,\n",
	"# and re-check which columns are fed to the classifier before switching the path.\n",
	"test_df = pd.read_csv('train.csv')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 25,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Clean the train dataset\n",
	"# Drop the name, ticket, cabin, and embarked columns (PassengerId is dropped in a later cell)\n",
	"train_df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)\n",
	"# One-hot encode gender as two boolean columns\n",
	"train_df['m'] = (train_df['Sex'] == 'male')\n",
	"train_df['f'] = (train_df['Sex'] == 'female')\n",
	"train_df.drop(['Sex'], axis=1, inplace=True)\n",
	"# Replace NaN values in the Age column only with the average age; a frame-wide\n",
	"# fillna would also write the mean age into any other column that has gaps\n",
	"train_mean_age = train_df['Age'].mean()\n",
	"train_df['Age'] = train_df['Age'].fillna(train_mean_age)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 26,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Clean the test dataset in the same way as the train dataset\n",
	"# Drop the name, ticket, cabin, and embarked columns (PassengerId is kept for the output file)\n",
	"test_df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)\n",
	"# One-hot encode gender as two boolean columns\n",
	"test_df['m'] = (test_df['Sex'] == 'male')\n",
	"test_df['f'] = (test_df['Sex'] == 'female')\n",
	"test_df.drop(['Sex'], axis=1, inplace=True)\n",
	"# Replace NaN values in the Age column only with the average age; a frame-wide\n",
	"# fillna would also write the mean age into other columns with gaps (e.g. Fare)\n",
	"test_mean_age = test_df['Age'].mean()\n",
	"test_df['Age'] = test_df['Age'].fillna(test_mean_age)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 27,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Fill NaN values in the Fare column with the mean fare; only the test dataset needs this\n",
	"# as it has some missing values. The result is assigned back rather than calling\n",
	"# fillna(inplace=True) on the selected column, which can act on a temporary copy and\n",
	"# leave test_df unchanged.\n",
	"test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 28,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>Survived</th>\n",
	" <th>Pclass</th>\n",
	" <th>Age</th>\n",
	" <th>SibSp</th>\n",
	" <th>Parch</th>\n",
	" <th>Fare</th>\n",
	" <th>m</th>\n",
	" <th>f</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0</td>\n",
	" <td>3</td>\n",
	" <td>22.0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>7.2500</td>\n",
	" <td>True</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>38.0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>71.2833</td>\n",
	" <td>False</td>\n",
	" <td>True</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>1</td>\n",
	" <td>3</td>\n",
	" <td>26.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>7.9250</td>\n",
	" <td>False</td>\n",
	" <td>True</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>1</td>\n",
	" <td>1</td>\n",
	" <td>35.0</td>\n",
	" <td>1</td>\n",
	" <td>0</td>\n",
	" <td>53.1000</td>\n",
	" <td>False</td>\n",
	" <td>True</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>0</td>\n",
	" <td>3</td>\n",
	" <td>35.0</td>\n",
	" <td>0</td>\n",
	" <td>0</td>\n",
	" <td>8.0500</td>\n",
	" <td>True</td>\n",
	" <td>False</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" Survived Pclass Age SibSp Parch Fare m f\n",
	"0 0 3 22.0 1 0 7.2500 True False\n",
	"1 1 1 38.0 1 0 71.2833 False True\n",
	"2 1 3 26.0 0 0 7.9250 False True\n",
	"3 1 1 35.0 1 0 53.1000 False True\n",
	"4 0 3 35.0 0 0 8.0500 True False"
	]
	},
	"execution_count": 28,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Drop the PassengerId column from the training dataset as we will not need it for training\n",
	"# (Note: keep it in the test dataset though -- it labels the rows of the output file).\n",
	"# errors='ignore' keeps this cell safe to re-run once the column is already gone.\n",
	"train_df.drop(['PassengerId'], axis=1, inplace=True, errors='ignore')\n",
	"train_df.head()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Build the random forest model; clf is the conventional short name for a classifier.\n",
	"# random_state fixes the seed and n_jobs is the number of parallel jobs requested.\n",
	"clf = RandomForestClassifier(\n",
	"    random_state=0,\n",
	"    n_jobs=2,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 43,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# y holds the survival data; this is what we want to predict -- the label\n",
	"y = train_df['Survived']\n",
	"# x holds every remaining column. Dropping the label by name (rather than slicing off\n",
	"# column 0) keeps Survived out of the features even if the column order changes.\n",
	"x = train_df.drop(['Survived'], axis=1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 45,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Train the Classifier to take the training features and learn how they relate\n",
	"# to the training y (the Survived label)\n",
	"clf.fit(x, y);"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 46,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Apply the Classifier we trained to the test data. The feature columns are selected by\n",
	"# the names the model was fitted on, so the input always matches x regardless of which\n",
	"# extra columns (PassengerId, Survived) test_df carries or where they sit.\n",
	"survive = clf.predict(test_df[x.columns])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 63,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Pair each PassengerId with its prediction. Wrapping zip() in list() materialises the\n",
	"# pairs so they can be consumed more than once (on Python 3, zip() is a one-shot iterator).\n",
	"final = list(zip(test_df['PassengerId'], survive))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 66,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Turn the (PassengerId, prediction) pairs from the previous cell into a two-column dataframe\n",
	"final_prediction = pd.DataFrame(\n",
	"    data=final,\n",
	"    columns=['PassengerId', 'Survived'],\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 101,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Write the predictions to disk as CSV, leaving out the dataframe index column\n",
	"final_prediction.to_csv(\n",
	"    path_or_buf='final_prediction8.csv',\n",
	"    index=False,\n",
	")"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.13"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}