Skip to content

Instantly share code, notes, and snippets.

@MaximePawlakFr
Created June 27, 2017 15:59
Show Gist options
  • Save MaximePawlakFr/d9a62eb291eb44e4c4e30831e61d5709 to your computer and use it in GitHub Desktop.
Save MaximePawlakFr/d9a62eb291eb44e4c4e30831e61d5709 to your computer and use it in GitHub Desktop.
# Libraries
import pandas as pd
import numpy as np
import random as rnd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn as sk
# Acquire Data
# Load the training and test sets from CSV files in the current working
# directory, then keep both frames in one list so later steps can loop
# over them together.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
combine = [train_df, test_df]
# Preview data
# Column names of the training frame.
print(train_df.columns.values)
# First and last rows of the training frame.
train_df.head()
train_df.tail()
# Per-column dtype / non-null summary for both frames (info() prints directly).
for _frame in (train_df, test_df):
    _frame.info()
# Summary statistics: default column selection first, then every column.
train_df.describe()
train_df.describe(include="all")
# Analyze
# Each statement below computes the mean of "Survived" per group and sorts the
# result in ascending order of that mean.
# NOTE(review): "Survived" is presumably a 0/1 flag, which would make the mean
# a survival rate -- confirm against the dataset.

# By passenger class, with the grouping key kept as a regular column.
# (The original line was missing the closing "]" of the key list, which was a
# syntax error.)
train_df[["Pclass", "Survived"]].groupby(["Pclass"], as_index=False).mean().sort_values(by="Survived")
# Same aggregation with the default as_index=True: "Pclass" becomes the index.
train_df[["Pclass", "Survived"]].groupby(["Pclass"]).mean().sort_values(by="Survived")

# By a single comparison variable; change comp_var to look at another column.
comp_var = "Sex"
train_df[[comp_var, "Survived"]].groupby([comp_var]).mean().sort_values(by="Survived")
comp_var = "SibSp"
train_df[[comp_var, "Survived"]].groupby([comp_var]).mean().sort_values(by="Survived")

# By the comparison variable combined with passenger class (two grouping keys).
comp_var = "Sex"
train_df[[comp_var, "Survived", "Pclass"]].groupby([comp_var, "Pclass"]).mean().sort_values(by="Survived")
# Visualizing data
# Histogram of "Age" (20 bins), one panel per value of "Survived".
g = sns.FacetGrid(train_df, col="Survived")
g.map(plt.hist, "Age", bins=20)

# Same histogram, split into rows by "Pclass" and columns by "Survived".
# `height` is the current name of the per-facet size keyword; seaborn renamed
# it from `size` in 0.9 and newer releases no longer accept `size`.
grid = sns.FacetGrid(train_df, col="Survived", row="Pclass", height=2.2)
grid.map(plt.hist, "Age", bins=20)
grid.add_legend()

# Bar plot of "Fare" by "Sex", split into rows by "Embarked" and columns by
# "Survived".
grid = sns.FacetGrid(train_df, col="Survived", row="Embarked", height=2.2)
# Pin the category order: without `order`, each facet orders the bars from its
# own subset of rows, so the same bar position can mean different categories
# in different panels.
grid.map(sns.barplot, "Sex", "Fare", order=["female", "male"])
grid.add_legend()
# Wrangle data
# Drop the "Ticket" and "Cabin" columns from both frames.
train_df = train_df.drop(["Ticket", "Cabin"], axis=1)
test_df = test_df.drop(["Ticket", "Cabin"], axis=1)
# drop() returned new frames, so rebuild the list to point at them.
combine = [train_df, test_df]

# Add a numeric encoding of "Sex" to both frames (female -> 1, male -> 0).
# Values outside the mapping become NaN.
# (The loop body was not indented in the original, which was an
# IndentationError.)
for df in combine:
    df["SexBin"] = df.Sex.map({'female':1, 'male':0})

# Bucket "Age" into 5 equal-width intervals (training frame only) and show the
# mean of "Survived" per bucket.
train_df["AgeBand"] = pd.cut(train_df.Age, 5)
train_df[["AgeBand", "Survived"]].groupby(["AgeBand"]).mean()
# Model
# Split the training frame into the target column ("Survived") and the
# remaining columns; the test frame is copied so later changes to X_test do
# not touch test_df.
Y_train = train_df["Survived"]
X_train = train_df.drop(labels=["Survived"], axis="columns")
X_test = test_df.copy()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment