Skip to content

Instantly share code, notes, and snippets.

@DMTSource
Last active March 24, 2021 21:33
Show Gist options
  • Save DMTSource/2b38b473270a50e71025dd6cb1c03521 to your computer and use it in GitHub Desktop.
Save DMTSource/2b38b473270a50e71025dd6cb1c03521 to your computer and use it in GitHub Desktop.
Modified version of the readme_long_example from baikal. Attempting to make it work for multiple inputs: https://github.com/alegonz/baikal/issues/50
import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import Stack
### ADDED to readme_long_example
from sklearn.model_selection import GridSearchCV, StratifiedKFold
###
# 1. Define the steps
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)
PCA = make_step(sklearn.decomposition.PCA)
SVC = make_step(sklearn.svm.SVC)
PowerTransformer = make_step(sklearn.preprocessing.PowerTransformer)
# 2. Build the model
x1 = Input(name="x1")
x2 = Input(name="x2")
y_t = Input(name="y_t")
y1 = ExtraTreesClassifier()(x1, y_t)
y2 = RandomForestClassifier()(x2, y_t)
z = PowerTransformer()(x2)
z = PCA()(z)
y3 = LogisticRegression()(z, y_t)
stacked_features = Stack()([y1, y2, y3])
y_p = SVC()(stacked_features, y_t)
model = Model([x1, x2], y_p, y_t)
plot_model(model, filename="multiple_input_nonlinear_pipeline_example_plot.png")
# 3. Train the model
dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
dataset.data, dataset.target, random_state=0
)
# Let's suppose the dataset is originally split in two
X1_train, X2_train = X_train[:, :15], X_train[:, 15:]
X1_test, X2_test = X_test[:, :15], X_test[:, 15:]
### ADDED to readme_long_example
#model.fit([X1_train, X2_train], y_train)
param_grid = [
{
"LogisticRegression_0": [
LogisticRegression(
random_state=0, solver="lbfgs", multi_class="multinomial"
)
],
"LogisticRegression_0__C": [0.01, 0.1, 1],
"PCA_0__n_components": [1, 2, 3, 4],
},
{
"RandomForestClassifier_0": [RandomForestClassifier(random_state=0)],
"RandomForestClassifier_0__n_estimators": [10, 50, 100],
},
]
cv = StratifiedKFold(n_splits=3)#, random_state=0) #shuffle is false so no random
gscv_baikal = GridSearchCV(
model,
param_grid,
cv=cv,
scoring="accuracy",
return_train_score=True,
verbose=1,
)
# THIS WILL FAIL, it does NOT like the inputs?
# ValueError: Found input variables with inconsistent numbers of samples: [2, 426]
gscv_baikal.fit([X1_train, X2_train], y_train)
print("Best score:", gscv_baikal.best_score_)
print("Best parameters", gscv_baikal.best_params_)
model = gscv_baikal.best_estimator_.model
###
# 4. Use the model
y_test_pred = model.predict([X1_test, X2_test])
# This also works:
# y_test_pred = model.predict({x1: X1_test, x2: X2_test})
# We can also query any intermediate outputs:
outs = model.predict(
[X1_test, X2_test], output_names=["ExtraTreesClassifier_0:0/0", "PCA_0:0/0"]
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment