Skip to content

Instantly share code, notes, and snippets.

@Shihab-Shahriar
Last active September 8, 2019 12:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Shihab-Shahriar/f865c28357e4a5dfd99bc318e21bd00f to your computer and use it in GitHub Desktop.
Save Shihab-Shahriar/f865c28357e4a5dfd99bc318e21bd00f to your computer and use it in GitHub Desktop.
Performance of InstanceHardnessThreshold: compares coarse-grained parallelism (parallelizing cross_val_predict across folds) with fine-grained parallelism (parallelizing the estimator itself).
from collections import Counter
from time import perf_counter
import numpy as np
from sklearn.base import ClassifierMixin, clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.utils import safe_indexing
from imblearn.under_sampling import InstanceHardnessThreshold
from imblearn.utils.deprecation import deprecate_parameter
class FineGrainedIH(InstanceHardnessThreshold):
    """InstanceHardnessThreshold variant that parallelizes the estimator.

    Only estimator construction is overridden: the sampler's ``n_jobs`` is
    forwarded to the wrapped classifier whenever the classifier exposes an
    ``n_jobs`` parameter. Resampling itself is inherited unchanged.
    """

    def _validate_estimator(self):
        """Build ``self.estimator_`` from ``self.estimator``.

        * ``estimator is None``: a 100-tree ``RandomForestClassifier`` is
          created with the sampler's ``random_state`` and ``n_jobs``.
        * ``estimator`` is a classifier with ``predict_proba``: it is cloned,
          then ``random_state`` and ``n_jobs`` are overwritten with the
          sampler's values, each only if the clone has that parameter.

        Raises
        ------
        ValueError
            If ``estimator`` is neither ``None`` nor a ``ClassifierMixin``
            instance providing ``predict_proba``.
        """
        if self.estimator is None:
            self.estimator_ = RandomForestClassifier(
                n_estimators=100, random_state=self.random_state,
                n_jobs=self.n_jobs)
            return

        if not (isinstance(self.estimator, ClassifierMixin)
                and hasattr(self.estimator, 'predict_proba')):
            raise ValueError('Invalid parameter `estimator`. Got {}.'.format(
                type(self.estimator)))

        self.estimator_ = clone(self.estimator)
        supported = self.estimator_.get_params()
        # set_params raises ValueError for unknown names, so only forward
        # the settings this particular classifier actually accepts.
        if 'random_state' in supported:
            self.estimator_.set_params(random_state=self.random_state)
        if 'n_jobs' in supported:
            self.estimator_.set_params(n_jobs=self.n_jobs)
class CoarseGrainedIH(InstanceHardnessThreshold):
    """InstanceHardnessThreshold variant that parallelizes across CV folds.

    The wrapped classifier is forced to run single-threaded (``n_jobs=1``)
    and the sampler's ``n_jobs`` is instead given to ``cross_val_predict``,
    so the parallelism happens at the fold level.
    """

    def _validate_estimator(self):
        """Build ``self.estimator_`` from ``self.estimator``.

        * ``estimator is None``: a 100-tree ``RandomForestClassifier`` is
          created with the sampler's ``random_state`` and ``n_jobs=1``.
        * ``estimator`` is a classifier with ``predict_proba``: it is cloned,
          ``random_state`` is overwritten with the sampler's value and
          ``n_jobs`` is set to 1, each only if the clone has that parameter.

        Raises
        ------
        ValueError
            If ``estimator`` is neither ``None`` nor a ``ClassifierMixin``
            instance providing ``predict_proba``.
        """
        if self.estimator is None:
            # n_jobs=1 keeps the default estimator consistent with the
            # user-supplied case below: folds are parallel, the model is not.
            self.estimator_ = RandomForestClassifier(
                n_estimators=100, random_state=self.random_state,
                n_jobs=1)
            return

        if not (isinstance(self.estimator, ClassifierMixin)
                and hasattr(self.estimator, 'predict_proba')):
            raise ValueError('Invalid parameter `estimator`. Got {}.'.format(
                type(self.estimator)))

        self.estimator_ = clone(self.estimator)
        supported = self.estimator_.get_params()
        # set_params raises ValueError for unknown names, so only forward
        # the settings this particular classifier actually accepts.
        if 'random_state' in supported:
            self.estimator_.set_params(random_state=self.random_state)
        if 'n_jobs' in supported:
            self.estimator_.set_params(n_jobs=1)

    def _fit_resample(self, X, y):
        """Under-sample ``X``/``y`` using out-of-fold class probabilities.

        For every class listed in ``self.sampling_strategy_`` only the
        samples whose cross-validated probability of their own label is at
        or above a percentile threshold are kept; other classes are kept
        whole. The selected positions are stored in ``self.sample_indices_``.

        Returns
        -------
        tuple
            ``(X_resampled, y_resampled)``, plus the selected indices as a
            third element when ``self.return_indices`` is truthy.
        """
        if self.return_indices:
            deprecate_parameter(self, '0.4', 'return_indices',
                                'sample_indices_')
        self._validate_estimator()

        target_stats = Counter(y)
        # No random_state here: without shuffling it has no effect, and
        # recent scikit-learn releases reject the combination outright.
        skf = StratifiedKFold(n_splits=self.cv, shuffle=False)
        probabilities = cross_val_predict(
            self.estimator_, X, y, cv=skf,
            n_jobs=self.n_jobs, method='predict_proba')
        # Pick, per sample, the probability column indexed by its own label.
        # NOTE(review): this assumes y holds integer labels that line up with
        # the predict_proba column order (0..n_classes-1) - confirm.
        probabilities = probabilities[range(len(y)), y]

        idx_under = np.empty((0,), dtype=int)
        for target_class in np.unique(y):
            class_mask = y == target_class
            class_positions = np.flatnonzero(class_mask)
            if target_class in self.sampling_strategy_:
                n_samples = self.sampling_strategy_[target_class]
                class_proba = probabilities[class_mask]
                # Percentile chosen so that roughly n_samples of this class
                # sit at or above the threshold.
                threshold = np.percentile(
                    class_proba,
                    (1. - (n_samples / target_stats[target_class])) * 100.)
                class_positions = class_positions[
                    np.flatnonzero(class_proba >= threshold)]
            idx_under = np.concatenate((idx_under, class_positions), axis=0)

        self.sample_indices_ = idx_under

        if self.return_indices:
            return (safe_indexing(X, idx_under), safe_indexing(y, idx_under),
                    idx_under)
        return safe_indexing(X, idx_under), safe_indexing(y, idx_under)
def avg_time(est, X, y, n_runs=10):
    """Return the mean wall-clock seconds of ``est.fit_resample(X, y)``.

    Parameters
    ----------
    est : object
        Anything exposing ``fit_resample(X, y)``.
    X, y
        Passed through to ``fit_resample`` untouched on every run.
    n_runs : int, default 10
        How many times ``fit_resample`` is called; the total elapsed time
        is divided by this number.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (the average would be undefined).
    """
    if n_runs < 1:
        raise ValueError(
            '`n_runs` must be at least 1. Got {}.'.format(n_runs))
    start = perf_counter()
    for _ in range(n_runs):
        est.fit_resample(X, y)
    return (perf_counter() - start) / n_runs
if __name__ == '__main__':
    from sklearn.datasets import load_digits, load_iris, load_breast_cancer

    SEED = 42

    # One 1000-tree forest serves as the template for both samplers; each
    # sampler class in this file clones it before adjusting n_jobs.
    rf = RandomForestClassifier(n_estimators=1000, random_state=SEED)
    coarse_ih = CoarseGrainedIH(estimator=rf, n_jobs=4, random_state=SEED)
    fine_ih = FineGrainedIH(estimator=rf, n_jobs=4, random_state=SEED)

    # (label, loader) pairs, benchmarked in this order.
    benchmarks = (
        ('digits', load_digits),
        ('iris', load_iris),
        ('cancer', load_breast_cancer),
    )
    for name, dataset in benchmarks:
        X, y = dataset(return_X_y=True)
        print(name)
        print("Coarse:", avg_time(coarse_ih, X, y))
        print("Fine:", avg_time(fine_ih, X, y))
@Shihab-Shahriar
Copy link
Author

The output:

digits
Coarse: 3.8638752630000455
Fine: 5.370231804899959
iris
Coarse: 0.8960094903999561
Fine: 3.781246332900082
cancer
Coarse: 1.6516197527000258
Fine: 4.387305887999901

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment