Skip to content

Instantly share code, notes, and snippets.

@h5li
Created October 17, 2018 17:16
Show Gist options
  • Save h5li/cf84fdbe5d0e6faaecc8a396fac6ffaf to your computer and use it in GitHub Desktop.
Save h5li/cf84fdbe5d0e6faaecc8a396fac6ffaf to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Titanic Predictor\n",
"\n",
"This notebook reads the Titanic CSV data from the Kaggle website, cleans it, and uses it to train a random forest classifier that predicts passenger survival."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import sklearn as sk\n",
"import matplotlib.pyplot as plt\n",
"# Load scikit's random forest classifier library\n",
"from sklearn.ensemble import RandomForestClassifier"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Read in the training and testing CSVs.\n",
"# The path is passed straight to read_csv so pandas opens and closes the file itself,\n",
"# instead of leaving an unclosed handle behind from a bare open() call.\n",
"train_df = pd.read_csv('train.csv')\n",
"# NOTE(review): test_df is loaded from 'train.csv', the same file as train_df, so the\n",
"# predictions further down are made on the training rows. This looks like it should be\n",
"# the Kaggle test file -- confirm, and re-check the predict cell's column selection\n",
"# when changing it, since the test file has no Survived column.\n",
"test_df = pd.read_csv('train.csv')"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Clean the train dataset\n",
"# Drop the name, ticket, cabin, and embarked columns (PassengerId is dropped in a later cell)\n",
"train_df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)\n",
"# One-hot encode gender as two boolean columns, then remove the original text column\n",
"train_df['m'] = (train_df['Sex'] == 'male')\n",
"train_df['f'] = (train_df['Sex'] == 'female')\n",
"train_df.drop(['Sex'], axis=1, inplace=True)\n",
"# Replace NaN values in the Age column with the average age.\n",
"# Only Age is filled: a frame-wide fillna would also write the mean age into\n",
"# any other column that happens to contain NaN.\n",
"train_mean_age = train_df['Age'].mean()\n",
"train_df['Age'] = train_df['Age'].fillna(train_mean_age)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Clean the test dataset in the same way as the train dataset\n",
"# Drop the name, ticket, cabin, and embarked columns (PassengerId is kept for the output)\n",
"test_df.drop(['Name', 'Ticket', 'Cabin', 'Embarked'], axis=1, inplace=True)\n",
"# One-hot encode gender as two boolean columns, then remove the original text column\n",
"test_df['m'] = (test_df['Sex'] == 'male')\n",
"test_df['f'] = (test_df['Sex'] == 'female')\n",
"test_df.drop(['Sex'], axis=1, inplace=True)\n",
"# Replace NaN values in the Age column with the average age.\n",
"# Only Age is filled: a frame-wide fillna would also overwrite a missing Fare\n",
"# with the mean age before the fare-specific fill in the next cell can run.\n",
"test_mean_age = test_df['Age'].mean()\n",
"test_df['Age'] = test_df['Age'].fillna(test_mean_age)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Fill NaN values in the Fare column with the mean fare; this is done only for the test dataset as it has some missing values.\n",
"# The result is assigned back to the column: calling fillna(inplace=True) on a selected\n",
"# column can act on a temporary copy (e.g. under pandas copy-on-write) and leave test_df unchanged.\n",
"test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Parch</th>\n",
" <th>Fare</th>\n",
" <th>m</th>\n",
" <th>f</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>22.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>7.2500</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>38.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>71.2833</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>7.9250</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>35.0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>53.1000</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>35.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8.0500</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Age SibSp Parch Fare m f\n",
"0 0 3 22.0 1 0 7.2500 True False\n",
"1 1 1 38.0 1 0 71.2833 False True\n",
"2 1 3 26.0 0 0 7.9250 False True\n",
"3 1 1 35.0 1 0 53.1000 False True\n",
"4 0 3 35.0 0 0 8.0500 True False"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# PassengerId is not needed for training, so remove it from the training frame only\n",
"# (Note: the test frame keeps its PassengerId column)\n",
"train_df.drop(labels=['PassengerId'], axis='columns', inplace=True)\n",
"train_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Build the random forest model; the name clf is the usual shorthand for 'classifier'.\n",
"# random_state is pinned to 0 and n_jobs asks for two parallel jobs.\n",
"clf = RandomForestClassifier(\n",
"    random_state=0,\n",
"    n_jobs=2\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# y holds the survival data; this is what we want to predict -- the label\n",
"y = train_df['Survived']\n",
"# x holds the features: every remaining column. Dropping the label by name (rather than\n",
"# slicing by position with iloc[:, 1:]) keeps Survived out of x whatever the column order is.\n",
"x = train_df.drop('Survived', axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Fit the classifier so it learns how the training features in x\n",
"# relate to the survival labels in y\n",
"clf.fit(X=x, y=y);"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Apply the classifier we trained to the test data.\n",
"# The feature columns are selected by name, in the same order used for training, so that\n",
"# PassengerId (and any label column) is excluded no matter where it sits in test_df.\n",
"# A positional slice such as iloc[:, 2:] only matches the training features when\n",
"# test_df happens to have exactly two leading non-feature columns.\n",
"survive = clf.predict(test_df[x.columns])"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Pair each value of the PassengerId column with the prediction made for it in the previous step.\n",
"# The pairs are materialised as a list: under Python 3 a bare zip() is a one-shot iterator,\n",
"# so the next cell could only consume it once before this cell had to be re-run.\n",
"final = list(zip(test_df['PassengerId'], survive))"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Turn the (PassengerId, prediction) pairs from the last cell into a two-column dataframe\n",
"final_prediction = pd.DataFrame(\n",
"    data=final,\n",
"    columns=['PassengerId', 'Survived']\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Write the predictions to a CSV file, leaving out the dataframe index\n",
"final_prediction.to_csv(path_or_buf='final_prediction8.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment