Skip to content

Instantly share code, notes, and snippets.

@joaopcnogueira
Last active July 11, 2019 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save joaopcnogueira/7f3b71856b84a27b36f02016d143b400 to your computer and use it in GitHub Desktop.
Save joaopcnogueira/7f3b71856b84a27b36f02016d143b400 to your computer and use it in GitHub Desktop.
Refactored Titanic code with pipelines
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
# Load the training data from the local CSV file.
df = pd.read_csv("train.csv")

# Discard the name, ticket and cabin columns before modelling.
df = df.drop(columns=["Name", "Ticket", "Cabin"])

# Separate the predictors from the 'Survived' target, then hold out 20% of
# the rows for evaluation; the fixed seed keeps the split reproducible.
features = df.drop(columns=["Survived"])
target = df["Survived"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Chain the preprocessing and the classifier into one estimator. The steps
# run in the listed order: encoder, mean imputation, then a decision tree
# capped at depth 3.
pipeline_steps = [
    ("one-hot encoder", OneHotEncoder()),
    ("imputer", SimpleImputer(strategy="mean")),
    ("tree", DecisionTreeClassifier(max_depth=3, random_state=0)),
]
model = Pipeline(steps=pipeline_steps)

# Fit the whole pipeline on the training split only.
model.fit(X_train, y_train)

# Score the fitted pipeline on the training split and on the held-out split.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

# Report both scores.
train_report = "Train score: {}".format(train_score)
test_report = "Test score: {}".format(test_score)
print(train_report)
print(test_report)

# Output recorded by the original author:
# Train score: 0.8342696629213483
# Test score: 0.7988826815642458
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment