import numpy as np
import pandas as pd

# Read in dataset from CSV
titanic_train = pd.read_csv('/home/matt/datasets/titanic/train.csv'); titanic_train.name='titanic_train'
titanic_test = pd.read_csv('/home/matt/datasets/titanic/test.csv'); titanic_test.name='titanic_test'
titanic_full = titanic_train.append(titanic_test); titanic_full.name='titanic_full'

def df_info(df):
    """Basic info on a dataframe, useful for checking incremental preprocessing steps"""
    # Minimal body (assumed; the original body was cut off): name, shape, missing values per column
    print(df.name)
    print(df.shape)
    print(df.isnull().sum())

def get_title(x):
    """Map a passenger's Name to a coarse title category"""
    if 'Mr.' in x:
        return 'Mr'
    elif 'Mrs.' in x:
        return 'Mrs'
    elif 'Master' in x:
        return 'Master'
    elif 'Miss.' in x:
        return 'Miss'
    else:
        return 'Other'  # catch-all (assumed; the original else branch was truncated)

# Derive a Title column before dropping Name (assumed step; the groupbys below rely on 'Title')
titanic_full['Title'] = titanic_full['Name'].apply(get_title)

# Drop name
titanic_full.drop(['Name'], axis=1, inplace=True)

# Get mean age of passenger by sex and class
titanic_full.groupby(['Pclass', 'Sex'])['Age'].mean()

# Higher granularity == better accuracy???
titanic_full.groupby(['Pclass', 'Sex', 'Title'])['Age'].mean()

# Median may better represent our passengers
titanic_full.groupby(['Pclass', 'Sex', 'Title'])['Age'].median()
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import joblib  # sklearn.externals.joblib has been removed from recent scikit-learn releases
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
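
# The snippet ends after its imports; a minimal sketch of the pipeline they point to
# (estimator choice, grid values and output filename are assumptions, and
# RandomForestClassifier could be swapped in as the final step instead):
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('clf', LogisticRegression()),
])
param_grid = {
    'pca__n_components': [2, 3],
    'clf__C': [0.1, 1.0, 10.0],
}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print(accuracy_score(y_test, grid.predict(X_test)))
joblib.dump(grid.best_estimator_, 'iris_pipeline.pkl')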
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import PolynomialFeatures, RobustScaler
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier
# NOTE: Make sure that the class is labeled 'target' in the data file
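# The exported pipeline body is missing here; a sketch of the shape TPOT's export()
# produces with these imports (data path, separator and all hyperparameters are
# placeholders/assumptions, not the author's actual exported values):
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

exported_pipeline = make_pipeline(
    make_union(PolynomialFeatures(degree=2, include_bias=False), RobustScaler()),
    StackingEstimator(estimator=LogisticRegression(C=1.0)),
    XGBClassifier(learning_rate=0.1, max_depth=3, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
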
from tpot import TPOTClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated
from sklearn.datasets import load_iris
import time
# Load and split the data
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)
# Construct and fit TPOT classifier
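# The construction itself is missing here; a minimal sketch (the generation and
# population sizes are assumptions, not the author's settings):
start = time.time()
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
print('Elapsed: %.1fs' % (time.time() - start))
print('Test score: %.4f' % tpot.score(X_test, y_test))
tpot.export('tpot_iris_pipeline.py')
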
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search is deprecated
from sklearn import tree
# Load and split the data
iris = load_iris()
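
# The snippet stops after loading the data; a minimal sketch of the scaler/PCA/decision-tree
# pipeline the imports point to (grid values are assumptions):
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=42)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('tree', tree.DecisionTreeClassifier(random_state=42)),
])
param_grid = {
    'pca__n_components': [2, 3],
    'tree__max_depth': [2, 3, 5, None],
}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.score(X_test, y_test))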