Skip to content

Instantly share code, notes, and snippets.

@victornoel
Created February 14, 2020 09:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save victornoel/06d6231f6276719ddba53cb381dfd468 to your computer and use it in GitHub Desktop.
Save victornoel/06d6231f6276719ddba53cb381dfd468 to your computer and use it in GitHub Desktop.
import os
import warnings
from typing import List, Tuple
import onnxruntime as rt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from scipy.stats import randint as sp_randint, uniform as sp_uniform
from xgboost import XGBRegressor
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
import onnxmltools.convert.common.data_types as onnxtypes
from skl2onnx import update_registered_converter, convert_sklearn
from skl2onnx.common.shape_calculator import calculate_linear_regressor_output_shapes
# Export the "ignore" policy through the environment as well; presumably this is
# meant for interpreters started later (e.g. n_jobs=-1 workers) -- TODO confirm.
os.environ['PYTHONWARNINGS'] = 'ignore'
# Drop FutureWarning messages raised inside this process.
warnings.simplefilter('ignore', FutureWarning)
def test():
    """Compare scikit-learn and ONNX Runtime predictions of an XGBoost pipeline.

    Downloads the Titanic CSV, fills missing values, tunes an ``XGBRegressor``
    with a randomized search, wraps the fitted preprocessor and regressor in a
    ``Pipeline``, converts that pipeline to ONNX and prints the five largest
    absolute differences between the two prediction paths, followed by the
    minimum and maximum of the test labels.

    Needs network access for the CSV download. The train/test split and the
    hyper-parameter search are not seeded, so printed values vary between runs.
    """
    data = pd.read_csv(
        'https://raw.githubusercontent.com/amueller/scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv'
    )
    # Age is handled as a string column from here on; presumably this is to
    # exercise string inputs in the converted model -- TODO confirm.
    data['age'] = data['age'].astype(str)
    # .copy() gives an independent frame, so the fills below are applied to
    # `data` itself rather than to a temporary view of the downloaded table.
    data = data[['age', 'fare', 'sex', 'embarked', 'pclass', 'survived']].copy()
    for col in data:
        dtype = data[col].dtype
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on `data[col]`: the chained in-place form can silently leave `data`
        # unchanged (it is a no-op under pandas copy-on-write).
        if dtype in ['float64', 'float32']:
            data[col] = data[col].fillna(0.)
        elif dtype in ['int64']:
            data[col] = data[col].fillna(0)
        elif dtype in ['O']:
            data[col] = data[col].fillna('N/A')

    full_df = data.drop('survived', axis=1)
    full_labels = data['survived']
    train_df, test_df, train_labels, test_labels = train_test_split(
        full_df, full_labels, test_size=.2
    )

    # The preprocessor is fitted on every row (train and test); presumably so
    # the one-hot encoders know all categories before predicting on test_df.
    col_transformer = _column_tranformer_fitted_from_df(full_df)
    # Transform the training rows once and reuse them for search and final fit.
    train_features = col_transformer.transform(train_df)

    search = RandomizedSearchCV(
        # `verbosity` is the XGBRegressor constructor parameter that controls
        # logging; `verbose` is not a constructor parameter.
        XGBRegressor(verbosity=0, objective='reg:squarederror'),
        param_distributions={
            # scipy's uniform(loc, scale) samples from [loc, loc + scale].
            "colsample_bytree": sp_uniform(),
            "gamma": sp_uniform(.1, 1),
            'learning_rate': sp_uniform(.1, .6),
            'max_depth': sp_randint(10, 30),
            'min_child_weight': sp_uniform(0, 3),
            'n_estimators': range(10, 75),  # number of boosting rounds tried per candidate
        },
        cv=5, n_iter=10, n_jobs=-1
    )
    search.fit(train_features, train_labels)

    # Train the final regressor with the best parameters found by the search.
    regressor = XGBRegressor(verbosity=0, objective='reg:squarederror', **search.best_params_)
    regressor.fit(train_features, train_labels)

    model = Pipeline(
        steps=[('preprocessor', col_transformer),
               ('regressor', regressor)]
    )

    # Register the XGBoost converter so convert_sklearn can handle the
    # pipeline's final step.
    update_registered_converter(
        XGBRegressor, 'XGBRegressor',
        calculate_linear_regressor_output_shapes,
        convert_xgboost
    )
    onnx_model = convert_sklearn(
        model,
        initial_types=_convert_dataframe_schema(full_df)
    )
    session = rt.InferenceSession(onnx_model.SerializeToString())

    pred_skl = model.predict(test_df)
    pred_onx = _predict(session, test_df)

    # Five largest absolute differences between the two prediction paths.
    diff = np.sort(np.abs(np.squeeze(pred_skl) - np.squeeze(pred_onx)))[-5:]
    print(diff)
    print('min(Y)-max(Y):', min(test_labels), max(test_labels))
def _column_tranformer_fitted_from_df(data: pd.DataFrame) -> ColumnTransformer:
    """Return a ColumnTransformer fitted on ``data`` with one step per column.

    The step for each column is chosen from its dtype:

    * float64 / float32 / int64 -> ``MinMaxScaler``
    * bool -> passed through unchanged
    * object -> ``OneHotEncoder``

    Raises:
        ValueError: if a column has any other dtype.
    """
    steps = []
    for name in data.columns:
        series = data[name]
        kind = series.dtype
        if kind in ['float64', 'float32', 'int64']:
            step = MinMaxScaler()
        elif kind in ['bool']:
            step = 'passthrough'
        elif kind in ['O']:
            step = OneHotEncoder()
        else:
            raise ValueError(f'Unexpected column dtype for {series.name}:{series.dtype}')
        # Each step is named after its column and applied to that column only.
        steps.append((name, step, [name]))

    transformer = ColumnTransformer(steps, remainder='drop')
    return transformer.fit(data)
def _convert_dataframe_schema(data: pd.DataFrame) -> List[Tuple[str, onnxtypes.DataType]]:
    """Describe every column of ``data`` as a ``(name, tensor type)`` pair.

    Each column is declared with shape ``[None, 1]``. The result is what
    ``test`` passes to ``convert_sklearn`` as ``initial_types``.

    Raises:
        ValueError: if a column dtype is not float64, float32, int64, bool or object.
    """
    # Ordered (dtype names, tensor type name) pairs. The tensor type is looked
    # up on `onnxtypes` only when a column actually matches.
    tensor_type_names = (
        # onnx does not really support float64 (DoubleTensorType does not work
        # with TreeEnsembleRegressor), so both float widths map to FloatTensorType.
        (('float64', 'float32'), 'FloatTensorType'),
        (('int64',), 'Int64TensorType'),
        (('bool',), 'BooleanTensorType'),
        (('O',), 'StringTensorType'),
    )

    schema = []
    for name in data.columns:
        series = data[name]
        for dtype_names, type_name in tensor_type_names:
            if series.dtype in dtype_names:
                tensor_type = getattr(onnxtypes, type_name)
                schema.append((name, tensor_type([None, 1])))
                break
        else:
            # No entry matched this column's dtype.
            raise ValueError(f'Unexpected column dtype for {series.name}:{series.dtype}')
    return schema
def _predict(session: rt.InferenceSession, data: pd.DataFrame) -> pd.Series:
    """Run ``session`` on ``data`` and return its first output as a Series.

    Every column is fed as a separate input named after the column and
    reshaped to ``(n_rows, 1)``. The first output is flattened and indexed
    like ``data``.
    """
    feeds = {}
    for name in data.columns:
        series = data[name]
        # float64 columns are narrowed to float32; the schema built by
        # _convert_dataframe_schema declares float columns as FloatTensorType.
        if series.dtype in ['float64']:
            series = series.astype(np.float32)
        values = series.values
        feeds[name] = values.reshape((values.shape[0], 1))

    outputs = session.run(None, feeds)
    return pd.Series(outputs[0].reshape(-1), index=data.index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment