@shadiakiki1986 · Last active June 28, 2017
Reconcile sklearn with MLJAR
"""
Reconciling results from MLJAR and sklearn.
Random forest classifier
X==y
Usage
pew new --python=python2 MLJAR_TEST
pip install mljar sklearn numpy scipy pandas
export MLJAR_TOKEN=exampleexampleexample
Author: Shadi Akiki
"""
print(__doc__)
MAX_VERSION_INFO = 3, 0
import sys
if not sys.version_info < MAX_VERSION_INFO:
    exit("Python < {}.{} is required (this script targets Python 2).".format(*MAX_VERSION_INFO))
#################################
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from mljar import Mljar
#################################
# Data
n_samples = 100
# n_samples = 20 yields an error during MLJAR fitting.
# n_samples = 50 yields a constant MLJAR prediction of 0.4 for all points when
#   the best model uses min_samples_split = 50.
# n_samples = 100 yields MLJAR predictions of 0.45 or 0.55 for all points,
#   but coincidentally the best model uses min_samples_split = 4.
half = n_samples // 2  # explicit integer division (unchanged behavior under Python 2)
# Very simple
# Binary classification
# input == output
X = np.zeros((n_samples, 1))
X[0:half, 0] = 2
# X[half:n_samples, 1] = 1
y = X[:, 0]
print("X", X.transpose())
print("y", y)
########################
print("MLJAR")
clf_mlj = Mljar(
    project='Recon mljar-sklearn',
    # experiment='Experiment 1',  # for n_samples = 50
    # experiment='Experiment 2',  # for n_samples = 100, and 2 classes: 0, 1
    experiment='Experiment 3',    # for n_samples = 100, and 2 classes: 0, 2
    metric='auc',
    algorithms=['rfc'],           # random forest classifier only
    validation_kfolds=None,
    validation_shuffle=False,
    validation_stratify=True,
    validation_train_split=0.95,  # 95 train / 5 validation samples at n_samples = 100
    tuning_mode='Normal',         # used 'Sport' for experiments 1-3
    create_ensemble=False,
    single_algorithm_time_limit='1'
)
print("fit")
clf_mlj.fit(X, y)  # ,dataset_title="Ones and zeros")
print("predict")
pred_proba_mlj = clf_mlj.predict(X)
pred_proba_mlj = pred_proba_mlj.squeeze().values  # pandas result -> flat numpy array
print("pred_proba_mlj", pred_proba_mlj)  # shows values = 0 or 1
pred_mlj = [2 if x == 1 else 0 for x in pred_proba_mlj]  # map probability 1 back to class 2
print("same", (pred_mlj == y).all())  # returns True
# mljar_fit_params = {'max_features': 0.5, 'min_samples_split': 50, 'criterion': "gini", 'min_samples_leaf': 1}
# mljar_fit_params = {'max_features': 0.7, 'min_samples_split': 4, 'criterion': "entropy", 'min_samples_leaf': 2}
mljar_fit_params = clf_mlj.selected_algorithm.params['model_params']['fit_params']
print("mljar_fit_params", mljar_fit_params)
########################
print("Random forest with same params")
clf_skl = RandomForestClassifier(
    max_features=mljar_fit_params['max_features'],
    min_samples_split=mljar_fit_params['min_samples_split'],
    criterion=mljar_fit_params['criterion'],
    min_samples_leaf=mljar_fit_params['min_samples_leaf']
)
clf_skl.fit(X, y)
pred_skl = clf_skl.predict(X)
print("pred_skl", pred_skl)  # shows values = 0 or 2
print("same", (pred_skl == y).all())  # returns True
pred_proba_skl = clf_skl.predict_proba(X)
pred_proba_skl = pred_proba_skl[:, -1]  # last column = P(class 2), per clf_skl.classes_ order
print("pred_proba_skl", pred_proba_skl)  # shows values = 0 or 1
print("probability from mlj = skl: ",(pred_proba_mlj==pred_proba_skl).all()) # returns True
print(np.matrix([pred_proba_mlj, pred_proba_skl, y]).transpose())  # side-by-side comparison