Skip to content

Instantly share code, notes, and snippets.

View xiaowei1234's full-sized avatar

Xiao Wei xiaowei1234

View GitHub Profile
@xiaowei1234
xiaowei1234 / sklearn2pmml features.py
Last active November 8, 2018 00:18
sklearn2pmml feature engineering and creation
# Feature-engineering stage: two DataFrameMapper branches combined by a
# FeatureUnion.
featureU = FeatureUnion([
    (
        'transformations',
        DataFrameMapper([
            # Derived feature: column 0 minus column 1 of the columns
            # listed in `trend_vars`.
            (trend_vars, ExpressionTransformer("X[:, 0] - X[:, 1]")),
            # Columns listed in `to_logs`: median-impute, then log(1 + x).
            (
                to_logs,
                make_pipeline(
                    Imputer(strategy='median'),
                    FunctionTransformer(np.log1p),
                ),
            ),
        ]),
    ),
    # Columns listed in `non_trans_vars` are only decorated with a
    # ContinuousDomain; no further transformation is applied here.
    (
        'identity',
        DataFrameMapper([
            (non_trans_vars, ContinuousDomain()),
        ]),
    ),
])
@xiaowei1234
xiaowei1234 / sklearn2pmml_pipe.py
Created November 8, 2018 00:24
sklearn2pmml pipe
# PMML pipeline: feature union -> median imputation -> standardization ->
# L2-penalized logistic regression.
pl = PMMLPipeline([
    ('featureUnion', featureU),
    ('impute', Imputer(strategy='median')),
    ('standardize', StandardScaler()),
    # Alternatives that were tried and left disabled:
    # , ('interactions', PolynomialFeatures(include_bias=False))
    # , ('clf', SGDClassifier(alpha=0.008, l1_ratio=0.13, max_iter=450,loss='log'
    #                         ,penalty='elasticnet', n_iter=None, tol=None))# alpha = 0.8
    ('clf', LogisticRegression(penalty='l2', max_iter=500, C=0.8)),
])

# NOTE(review): this rebinds `pl`, so the PMMLPipeline built above is no
# longer reachable through that name once this statement has run.
pl = Pipeline([
    ('impute', Imputer(strategy='median')),
    ('standardize', StandardScaler()),
    # , ('interactions', PolynomialFeatures(include_bias=False))
    ('clf', SGDClassifier()),
])

# Presumably candidate regularization strengths for a parameter search over
# the SGD classifier -- the code that consumes this list is not visible here.
alpha = [0.0001, 0.001, 0.1]
@xiaowei1234
xiaowei1234 / pdf_pipe_decorator.py
Last active November 9, 2018 00:37
pipe decorator example
def cell_wrapper(df, func, field, drop=True, new_name=None):
"""
decorator function for pandas pipe api
takes func which applies function to one value in field
returns modified dataframe
df (pandas dataframe): the dataframe to apply transformation on
func (function): function to apply to each value of field
field (str): name of column in df
drop (boolean): whether to drop 'field' after transformation
new_name (str): whether to rename transformed 'field' column to new_name
@xiaowei1234
xiaowei1234 / scorers.py
Last active October 1, 2020 12:58
binary classification scorers
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.metrics import make_scorer, roc_auc_score, log_loss
from sklearn.model_selection import GridSearchCV
def ks_stat(y, yhat):
    """Return the two-sample Kolmogorov-Smirnov statistic of the scores.

    The scores in ``yhat`` are split into the ones whose label equals 1 and
    the ones whose label does not, and the KS statistic between those two
    samples is returned.

    Args:
        y: 1-D array-like of labels; entries equal to 1 form the positive
            group, everything else the negative group.
        yhat: 1-D array-like of scores, positionally matched with ``y``.

    Returns:
        The ``statistic`` field of ``scipy.stats.ks_2samp`` for the two
        groups of scores.

    Raises:
        ValueError: propagated from ``ks_2samp`` when one of the two groups
            is empty.
    """
    # Coerce to ndarrays so the boolean masks below are always elementwise
    # and positional. Without this, plain lists are indexed with a scalar
    # bool (``yhat[False]`` is ``yhat[0]``) and pandas objects are aligned
    # by index label rather than by position.
    y = np.asarray(y)
    yhat = np.asarray(yhat)
    return ks_2samp(yhat[y == 1], yhat[y != 1]).statistic
@xiaowei1234
xiaowei1234 / ContinuousDomain.py
Last active April 18, 2019 20:58
ContinuousDomain example
from sklearn2pmml.decoration import ContinuousDomain
from sklearn.impute import SimpleImputer
# Domain decorator for a continuous feature, configured so that:
#   * NaN and -1 are both declared as missing-value markers and replaced
#     with 350 ('as_value' treatment);
#   * outliers are handled with 'as_extreme_values' against the bounds
#     low_value=300 and high_value=1500.
cont_d = ContinuousDomain(
    missing_values=[float("NaN"), -1],
    missing_value_treatment='as_value',
    missing_value_replacement=350,
    outlier_treatment='as_extreme_values',
    low_value=300,
    high_value=1500,
)
wrap = DataFrameMapper([
('amount', [cont_d, SimpleImputer(), FunctionTransformer(np.log1p, validate=False)])
@xiaowei1234
xiaowei1234 / CutTransformer.py
Last active April 18, 2019 21:04
Binning using CutTransformer
from sklearn2pmml.preprocessing import CutTransformer
from sklearn.impute import SimpleImputer
# Binning step: edges 0 / 250 / 2200 define two bins, labelled 0.3 and 0.4.
bins = CutTransformer(
    bins=[0, 250, 2200],
    labels=[0.3, 0.4],
)

# Apply default imputation to the 'amount' column, then bin it.
wrap = DataFrameMapper([
    (
        'amount',
        [SimpleImputer(), bins],
    ),
])
@xiaowei1234
xiaowei1234 / LookupTransformer.py
Last active July 16, 2019 00:53
LookupTransformer example
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn_pandas import DataFrameMapper
# Drop the column at position 0 and pass every other column through
# unchanged; sparse_threshold=0.0 is passed explicitly.
col_trans2 = ColumnTransformer(
    transformers=[
        ('first', 'drop', [0]),
    ],
    remainder='passthrough',
    sparse_threshold=0.0,
)
mapper = DataFrameMapper([
('code1', [CategoricalDomain(missing_value_treatment = "as_value", missing_value_replacement = '!')
,LookupTransformer({'a': 'b', 'b': 'd', 'c': 'd'}, 'a')
@xiaowei1234
xiaowei1234 / status_codes.py
Last active April 24, 2019 15:53
Status Code Transformation
from sklearn.pipeline import make_pipeline
# Each status column gets its own LookupTransformer that maps code 203 to 1
# and uses 0 as the default value.
wrap2 = DataFrameMapper([
    (status_column, LookupTransformer({203: 1}, 0))
    for status_column in ('Status 1', 'Status 2', 'Status 3')
])

# Sum of the three input columns X[0], X[1] and X[2].
union = ExpressionTransformer("X[0]+X[1]+X[2]")
@xiaowei1234
xiaowei1234 / Alias.py
Last active June 1, 2019 01:37
Status Code aliases
from sklearn.pipeline import make_pipeline
from sklearn2pmml.decoration import Alias
from sklearn.pipeline import FeatureUnion
# Each status column is passed through a LookupTransformer (202 -> 1,
# default 0) wrapped in an Alias carrying the given name, with prefit=True.
wrap1 = DataFrameMapper([
    (
        status_column,
        Alias(LookupTransformer({202: 1}, 0), alias_name, prefit=True),
    )
    for status_column, alias_name in (
        ('Status 1', 'status_1_202'),
        ('Status 2', 'status_2_202'),
        ('Status 3', 'status_3_202'),
    )
])