Created
September 23, 2019 14:59
-
-
Save yabyzq/90757cccef65f367a210bb013932cfb4 to your computer and use it in GitHub Desktop.
SKLearn pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from warnings import filterwarnings

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from category_encoders import OneHotEncoder
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
# --- Load data --------------------------------------------------------------
train = pd.read_csv("../input/titanic/train.csv")
print("train shape", train.shape)
test = pd.read_csv("../input/titanic/test.csv")
print("test shape", test.shape)

# --- Target / feature bookkeeping -------------------------------------------
target_column = "Survived"
id_column = "PassengerId"

# np.object / np.float / np.int are deprecated aliases that were removed in
# NumPy 1.24 — selecting by pandas dtype groups is the supported equivalent.
categorical_cols = list(test.select_dtypes(include=["object"]).columns)
numerical_cols = [
    c
    for c in test.select_dtypes(include=["number"]).columns
    if c not in (target_column, id_column)
]
print("Number of features", len(categorical_cols) + len(numerical_cols))
# --- Preprocessing + baseline model ------------------------------------------
# Numeric features: median-impute missing values, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)
# Categorical features: fill missing with a sentinel string, then one-hot.
# NOTE(review): OneHotEncoder here is category_encoders' version, whose
# handle_unknown options differ from sklearn's — confirm 'ignore' is accepted
# by the installed category_encoders release.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Baseline: random forest on the preprocessed features.
# (RandomForestClassifier is imported at the top of the file; the original
# script only imported it further down, which raised NameError here.)
pipe = make_pipeline(
    ColumnTransformer(
        [
            ("num", numeric_transformer, numerical_cols),
            ("cat", categorical_transformer, ["Embarked", "Sex"]),
        ]
    ),
    RandomForestClassifier(),
)

# Out-of-fold probabilities give an unbiased cross-validated AUC estimate.
prediction = cross_val_predict(
    pipe, train, train[target_column], cv=5, method="predict_proba"
)
print("Cross validation AUC {:.4f}".format(
    roc_auc_score(train[target_column], prediction[:, 1])
))
# --- Candidate classifiers ----------------------------------------------------
# (Imports kept here to preserve the original notebook-style flow.)
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Models to compare, each run through the same preprocessing pipeline.
# The original list contained RandomForestClassifier twice; the duplicate
# only re-ran an identical-in-expectation evaluation and was removed.
classifiers = [
    RandomForestClassifier(),
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),  # NOTE: very small C — heavily regularized
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    XGBClassifier(),
]
# Score every candidate with identical preprocessing: 5-fold out-of-fold
# probabilities -> cross-validated AUC.  `pipe` and `prediction` are rebound
# each iteration, matching the original script's module-level names.
for model in classifiers:
    pipe = make_pipeline(
        ColumnTransformer(
            [
                ("num", numeric_transformer, numerical_cols),
                ("cat", categorical_transformer, ["Embarked", "Sex"]),
            ]
        ),
        model,
    )
    prediction = cross_val_predict(
        pipe, train, train[target_column], cv=5, method="predict_proba"
    )
    print(model)
    print("Cross validation AUC {:.4f}".format(
        roc_auc_score(train[target_column], prediction[:, 1])
    ))
# --- Hyperparameter tuning for the random forest ------------------------------
from sklearn.model_selection import GridSearchCV

rf = Pipeline(
    steps=[
        (
            "preprocessor",
            ColumnTransformer(
                [
                    ("num", numeric_transformer, numerical_cols),
                    ("cat", categorical_transformer, ["Embarked", "Sex"]),
                ]
            ),
        ),
        ("classifier", RandomForestClassifier()),
    ]
)

# max_features='auto' was an alias of 'sqrt' for classifiers and was removed
# in scikit-learn 1.3, so it is dropped from the grid — no coverage is lost.
param_grid = {
    "classifier__n_estimators": [200, 300],
    "classifier__max_features": ["sqrt", "log2"],
    "classifier__max_depth": [4, 6, 8],
    "classifier__criterion": ["gini", "entropy"],
}

# Exhaustive 3-fold search over the grid, using all cores.
CV = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1, verbose=1)
CV.fit(train, train[target_column])
print(CV.best_params_)
print(CV.best_score_)
# --- Inspect feature weights ---------------------------------------------------
import eli5

# NOTE(review): `pipe` is whichever pipeline was bound most recently above —
# presumably the intent was the best/last model; confirm before trusting the
# displayed weights.
target = train[target_column]
pipe.fit(train, target)
eli5.show_weights(pipe, top=10)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment