Skip to content

Instantly share code, notes, and snippets.

@liangfu
Last active September 14, 2022 01:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save liangfu/28f814655abbc2cc89c575d69605141b to your computer and use it in GitHub Desktop.
Save liangfu/28f814655abbc2cc89c575d69605141b to your computer and use it in GitHub Desktop.
""" Example script for defining and using custom models in AutoGluon Tabular """
from autogluon.core.utils import infer_problem_type
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config
from autogluon.core.data import LabelCleaner
from autogluon.core.models import AbstractModel
from skl2onnx import convert_sklearn, get_model_alias
from skl2onnx.common._registration import get_shape_calculator, get_converter
from onnxconverter_common.data_types import Int64TensorType, FloatTensorType
from sklearn.pipeline import Pipeline
from skl2onnx import update_registered_converter
from skl2onnx._parse import _parse_sklearn
from sklearn.base import is_classifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import onnxruntime as rt
import numpy as np
import pandas as pd
import time
#########################
# Create a custom model #
#########################
# In this example, we create a custom Naive Bayes model for use in AutoGluon
class NaiveBayesModel(AbstractModel):
    """Gaussian Naive Bayes exposed as an AutoGluon custom model.

    Category and object columns are removed on every `_preprocess` call,
    since NaiveBayes can't handle these dtypes.
    """

    # `_preprocess` is invoked by `preprocess`, which is used both while
    # fitting and at inference time.
    def _preprocess(self, X, **kwargs):
        """Return `X` without category/object columns and with NaNs set to 0."""
        non_numeric = X.select_dtypes(['category', 'object']).columns
        numeric_only = X.drop(columns=non_numeric)
        preprocessed = super()._preprocess(numeric_only, **kwargs)
        # Missing values are replaced by 0 after the parent preprocessing.
        return preprocessed.fillna(0)

    def _fit(self, X, y, **kwargs):
        """Train a GaussianNB built from `self.params` on the training data.

        Validation data, if supplied through `kwargs`, is not used.
        """
        from sklearn.naive_bayes import GaussianNB

        # Go through `preprocess` here so training sees exactly what
        # inference will see.
        features = self.preprocess(X)
        self.model = GaussianNB(**self.params)
        self.model.fit(features, y)
# Example of a more optimized implementation that drops the invalid features
# earlier on to avoid having to make repeated checks.
class AdvancedNaiveBayesModel(AbstractModel):
    """Gaussian Naive Bayes model that declares its unsupported dtypes up front.

    Instead of dropping category/object columns in `_preprocess`, the dtypes
    are listed in the auxiliary params under `ignored_type_group_raw`.
    """

    def _preprocess(self, X, **kwargs):
        """Run the parent preprocessing, then replace missing values with 0."""
        preprocessed = super()._preprocess(X, **kwargs)
        return preprocessed.fillna(0)

    def _fit(self, X, y, **kwargs):
        """Train a GaussianNB built from `self.params` on the training data."""
        from sklearn.naive_bayes import GaussianNB

        features = self.preprocess(X)
        self.model = GaussianNB(**self.params)
        self.model.fit(features, y)

    # `_get_default_auxiliary_params` defines model-agnostic parameters such as
    # maximum memory usage and valid input column dtypes. Most custom models
    # only need to specify their valid/invalid dtypes here.
    def _get_default_auxiliary_params(self) -> dict:
        """Return the parent's auxiliary params plus the ignored raw dtypes."""
        aux_params = super()._get_default_auxiliary_params()
        # Category and object columns are ignored, since NaiveBayes can't
        # handle these dtypes.
        aux_params.update(ignored_type_group_raw=['category', 'object'])
        return aux_params
# In this example, we wrap a selection of scikit-learn classifiers in a single
# custom model for use in AutoGluon.
class GenericClassifierModel(AbstractModel):
    """AutoGluon custom model backed by one of several scikit-learn classifiers.

    The classifier is selected by display name (see `names`); `names` and
    `classifiers` are parallel lists, so entry *i* of one describes entry *i*
    of the other.
    """

    names = [
        "Nearest Neighbors",
        "Linear SVM",
        "RBF SVM",
        "Gaussian Process",
        "Decision Tree",
        # --
        "Random Forest",
        "Neural Net",
        "AdaBoost",
        "Naive Bayes",
        "QDA",
    ]
    classifiers = [
        KNeighborsClassifier,
        SVC,
        SVC,
        GaussianProcessClassifier,
        DecisionTreeClassifier,
        # --
        RandomForestClassifier,
        MLPClassifier,
        AdaBoostClassifier,
        GaussianNB,
        QuadraticDiscriminantAnalysis,
    ]

    # Constructor arguments forced onto `self.params` for specific classifiers.
    # Both SVC variants need `probability=True`; without it SVC does not
    # provide `predict_proba`.
    _preset_params = {
        "Linear SVM": dict(kernel="linear", C=0.025, probability=True),
        "RBF SVM": dict(gamma=2, C=1, probability=True),
        "Neural Net": dict(hidden_layer_sizes=(200,)),
    }

    def __init__(self, classifier_name, **kwargs):
        """Create the model.

        :param classifier_name: one of the display names listed in `names`
        :param kwargs: forwarded unchanged to `AbstractModel.__init__`
        """
        super().__init__(**kwargs)
        self.classifier_name = classifier_name

    # `_preprocess` is invoked by `preprocess`, which is used both while
    # fitting and at inference time.
    def _preprocess(self, X, **kwargs):
        """Return `X` without category/object columns and with NaNs set to 0."""
        cat_columns = X.select_dtypes(['category', 'object']).columns
        X = X.drop(cat_columns, axis=1)
        # Add a fillna call to handle missing values.
        return super()._preprocess(X, **kwargs).fillna(0)

    def _fit(self, X, y, **kwargs):
        """Instantiate the selected classifier and train it on the training data.

        Validation data, if supplied through `kwargs`, is not used.

        :raises KeyError: if `self.classifier_name` is not one of `names`
        """
        classifier_map = dict(zip(self.names, self.classifiers))
        try:
            classifier = classifier_map[self.classifier_name]
        except KeyError:
            raise KeyError(
                f"Unknown classifier_name {self.classifier_name!r}; "
                f"expected one of {self.names}"
            ) from None

        # It is important to call `preprocess(X)` in `_fit` to replicate what
        # will occur during inference.
        X = self.preprocess(X)

        # Presets are written into `self.params`, so they take precedence over
        # any value already stored there under the same key.
        self.params.update(self._preset_params.get(self.classifier_name, {}))
        self.model = classifier(**self.params)
        self.model.fit(X, y)
def advanced_naive_bayes_shape_calculator(operator):
    """Shape calculator that does nothing.

    `operator` is left untouched and `None` is returned.
    """
    return None
def advanced_naive_bayes_converter(scope, operator, container):
    """Convert a fitted custom AutoGluon model into ONNX nodes.

    The estimator stored on the AutoGluon model (`operator.raw_operator.model`)
    is parsed with skl2onnx's `_parse_sklearn`, and every output produced for
    it is connected to the matching operator output with an `Identity` node.

    :param scope: name space, where to keep node names, get unused new names
    :param operator: operator to convert; its `raw_operator` is the AutoGluon
        model, which must already be fitted
    :param container: contains the ONNX graph
    :raises RuntimeError: if the number of outputs produced for the wrapped
        estimator differs from the number of operator outputs
    """
    model = operator.raw_operator.model  # wrapped scikit-learn estimator
    inputs = operator.inputs

    # Replace the declared input shape with the feature count the estimator
    # was fitted on.
    operator.inputs[0].type.shape = (None, model.n_features_in_)

    # Declare an extra int64 output and put it first in the operator outputs.
    val_label = scope.declare_local_variable('val_label', Int64TensorType())
    operator.outputs.insert(0, val_label)
    # NOTE(review): presumably cast so the class labels match the int64
    # output declared above -- confirm. This mutates the fitted estimator.
    model.classes_ = model.classes_.astype(np.int64)

    if is_classifier(model):
        # Turn the 'zipmap' option off for this estimator on both the scope
        # and the container.
        scope.add_options(id(model), options={'zipmap': False})
        container.add_options(id(model), options={'zipmap': False})
    outputs = _parse_sklearn(scope, model, inputs, custom_parsers=None)

    if len(outputs) != len(operator.outputs):
        raise RuntimeError(
            f"Mismatch between pipeline output {len(outputs)} and "
            f"last step outputs {len(operator.outputs)}.")

    # Forward each estimator output to the corresponding operator output.
    for fr, to in zip(outputs, operator.outputs):
        container.add_node(
            'Identity', fr.full_name, to.full_name,
            name=scope.get_unique_operator_name("Id" + operator.onnx_name))
# Register the same shape calculator / converter pair for every custom model
# class, using the class name as its skl2onnx alias.
for _model_cls in (NaiveBayesModel, AdvancedNaiveBayesModel, GenericClassifierModel):
    update_registered_converter(
        _model_cls,
        _model_cls.__name__,
        advanced_naive_bayes_shape_calculator,
        advanced_naive_bayes_converter,
    )
del _model_cls  # do not leave the loop variable in the module namespace
def main():
    """Load the demo dataset and profile each custom model on it.

    The first 1000 training rows are used for fitting; labels of both the
    training and the test split are converted with a `LabelCleaner` before
    being handed to `profile`.
    """
    ################
    # Loading Data #
    ################
    label = 'class'  # specifies which column do we want to predict
    save_path = 'ag_models/'  # where to save trained models
    # Can be a local CSV file as well; returns a Pandas DataFrame.
    # Only the first 1000 rows are kept for a faster demo.
    train_data = TabularDataset(
        'https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv'
    ).head(1000)

    #####################################################
    # Training custom model outside of TabularPredictor #
    #####################################################
    # Separate features and labels
    y = train_data[label]
    X = train_data.drop(columns=[label])

    # The LabelCleaner converts labels to float/integers for model
    # training/inference; it can also inverse_transform back to the original.
    problem_type = infer_problem_type(y=y)  # or else specify directly
    label_cleaner = LabelCleaner.construct(problem_type=problem_type, y=y)
    y_clean = label_cleaner.transform(y)

    # Prepare test data (another Pandas DataFrame)
    test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
    X_test = test_data.drop(columns=[label])
    y_test_clean = label_cleaner.transform(test_data[label])

    rule = "======================================================="

    def run(title, build_model):
        # Print the banner first, then build and profile the model.
        print(rule)
        print(f"> Profiling {title}")
        print(rule)
        profile(X, y_clean, X_test, y_test_clean, build_model())

    run("AdvancedNaiveBayesModel", AdvancedNaiveBayesModel)
    run("NaiveBayesModel", NaiveBayesModel)

    names = [
        "Nearest Neighbors",  # Slow in onnxruntime (10x slower)
        "Decision Tree",
        "Random Forest",
        "Neural Net",
        "AdaBoost",
        "Naive Bayes",
        "RBF SVM",
        # "Linear SVM",  # Too slow, not responsive
        # "Gaussian Process",  # com.microsoft.Solve operator not supported in onnxruntime
        # "QDA",  # Unable to find a shape calculator
    ]
    for name in names:
        run(f"{name} Model", lambda: GenericClassifierModel(name))
def profile(X, y_clean, X_test, y_test_clean, model):
    """Fit `model`, score it, then time scikit-learn vs. ONNX Runtime inference.

    Steps: fit on the training data, print test probabilities and score, time
    `predict_proba` through a one-step scikit-learn `Pipeline`, convert that
    pipeline with skl2onnx (text dump written to ``onnx.txt``), and time the
    same prediction with onnxruntime.

    :param X: training features
    :param y_clean: training labels, already transformed by the label cleaner
    :param X_test: test features
    :param y_test_clean: test labels, already transformed by the label cleaner
    :param model: unfitted custom AutoGluon model to profile
    """
    model.fit(X=X, y=y_clean)  # Fit custom model
    # To save to disk and reload: call `model.save()` and then
    # `type(model).load(path=model.path)`.

    # The predictions themselves are not used; the call is kept so the plain
    # `predict` path still runs before anything is timed.
    model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    print(np.array(y_pred_proba).astype(np.float32))
    score = model.score(X_test, y_test_clean)
    print(f'>>>>>>>>>>>>>>>>>>>>>> test score ({model.eval_metric.name}) = {score} <<<<<<<<<<<<<<<<<<<<<<')

    #####################################
    # Conversion to onnx using skl2onnx #
    #####################################
    pipe = Pipeline(steps=[('model', model)])

    # `perf_counter` is used for timing: it is monotonic and has the highest
    # available resolution, unlike `time.time()`.
    # NOTE(review): this timing goes through the model's own prediction path
    # on the raw test frame, while the ONNX timing below starts from
    # already-preprocessed features -- keep in mind when comparing.
    print("skl predict_proba")
    tic = time.perf_counter()
    skl_pred = pipe.predict_proba(X_test)
    toc = time.perf_counter()
    print(skl_pred)
    print(f">>>>>>>>>>>>>>>>>>>>>> skl elapsed: {(toc-tic)*1000.0:.3f} ms <<<<<<<<<<<<<<<<<<<<<<")

    initial_types = [('input', FloatTensorType((None, X_test.shape[1])))]
    model_onnx = convert_sklearn(pipe, initial_types=initial_types,
                                 target_opset=12, verbose=0)
    onnx_features = model.preprocess(X_test).to_numpy()

    # Text dump of the converted model; the `with` block flushes and closes
    # the file.
    with open("onnx.txt", 'wt', encoding="utf-8") as fp:
        fp.write(str(model_onnx))

    print("onnx predict_proba")
    sess = rt.InferenceSession(model_onnx.SerializeToString())
    tic = time.perf_counter()
    onx_pred = sess.run(None, {'input': onnx_features.astype(np.float32)})[0]
    toc = time.perf_counter()
    print(onx_pred[:, 1])
    print(f">>>>>>>>>>>>>>>>>>>>>> onx elapsed: {(toc-tic)*1000.0:.3f} ms <<<<<<<<<<<<<<<<<<<<<<")
# Run the profiling demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment