Skip to content

Instantly share code, notes, and snippets.

@MattEding
Created November 19, 2019 19:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MattEding/fedeef26c79b5d8bfe3836be9627fc80 to your computer and use it in GitHub Desktop.
Save MattEding/fedeef26c79b5d8bfe3836be9627fc80 to your computer and use it in GitHub Desktop.
ADASYN Vectorize vs Loop Benchmark
Namespace(file=None, n_jobs=4, n_neighbors=5, sampling_strategy='auto', trials=3)
1 ecoli
0.6515465679999999
2 optical_digits
1.081901593
3 satimage
0.757048266
4 pen_digits
0.7772778269999998
5 abalone
0.7216238229999998
6 sick_euthyroid
0.7105965379999999
7 spectrometer
0.6476733709999998
8 car_eval_34
0.672791621
9 isolet
6.778681332
10 us_crime
0.6839828560000001
11 yeast_ml8
0.7074405609999985
12 scene
0.8076008940000001
13 libras_move
0.6478258960000005
14 thyroid_sick
0.7199621090000008
15 coil_2000
0.9155116790000015
16 arrhythmia
0.6713255459999985
17 solar_flare_m0
0.6787288859999983
18 oil
0.657154803000001
19 car_eval_4
0.6858342119999996
20 wine_quality
0.7266068040000029
21 letter_img
1.0060752449999981
22 yeast_me2
0.6741237449999993
23 webpage
20.705942846
24 ozone_level
0.6946011910000038
25 mammography
0.8431376089999958
26 protein_homo
8.687401287
27 abalone_19
0.705801689999987
csr(1000, 10)-d0.01-p0.01
0.6483568239999897
arr(1000, 10)-d0.01-p0.01
0.6715544219999998
csr(1000, 10)-d0.1-p0.1
1.136277604
arr(1000, 10)-d0.1-p0.1
0.6596833239999995
csr(10000, 50)-d0.01-p0.01
4.162356662999997
arr(10000, 50)-d0.01-p0.01
1.2406726000000106
csr(10000, 50)-d0.1-p0.1
8.983309900999998
arr(10000, 50)-d0.1-p0.1
1.8265618060000008
csr(100000, 100)-d0.01-p0.01
69.527295614
arr(100000, 100)-d0.01-p0.01
133.846439331
csr(100000, 100)-d0.1-p0.1
230.76780414200005
arr(100000, 100)-d0.1-p0.1
292.81703859699996
Namespace(file=None, n_jobs=4, n_neighbors=5, sampling_strategy='auto', trials=3)
1 ecoli
0.6454916709999998
2 optical_digits
0.891774474
3 satimage
0.6803766329999998
4 pen_digits
0.6826901650000003
5 abalone
0.6517725180000005
6 sick_euthyroid
0.6531233289999996
7 spectrometer
0.6459990499999986
8 car_eval_34
0.6503424380000009
9 isolet
7.005787720000001
10 us_crime
0.6745569519999997
11 yeast_ml8
0.6682129420000003
12 scene
0.7355924750000007
13 libras_move
0.6388636200000022
14 thyroid_sick
0.6670594270000016
15 coil_2000
0.776676092999999
16 arrhythmia
0.6448950659999966
17 solar_flare_m0
0.6482231889999994
18 oil
0.6433028350000001
19 car_eval_4
0.6546346290000002
20 wine_quality
0.6537185050000005
21 letter_img
0.7824089070000007
22 yeast_me2
0.6519063629999984
23 webpage
20.003640710999996
24 ozone_level
0.6614500730000046
25 mammography
0.7093737689999955
26 protein_homo
6.829618027999999
27 abalone_19
0.6590629220000039
csr(1000, 10)-d0.01-p0.01
0.6387534919999993
arr(1000, 10)-d0.01-p0.01
0.6466937980000012
csr(1000, 10)-d0.1-p0.1
0.6638392989999957
arr(1000, 10)-d0.1-p0.1
0.6362426099999965
csr(10000, 50)-d0.01-p0.01
0.7112311320000089
arr(10000, 50)-d0.01-p0.01
1.104831180000005
csr(10000, 50)-d0.1-p0.1
1.3461644930000034
arr(10000, 50)-d0.1-p0.1
1.6487868080000112
csr(100000, 100)-d0.01-p0.01
11.638341877999991
arr(100000, 100)-d0.01-p0.01
132.322504057
csr(100000, 100)-d0.1-p0.1
153.642012748
arr(100000, 100)-d0.1-p0.1
290.590596469
import argparse
import textwrap
from timeit import timeit
import numpy as np
import pandas as pd
from scipy import sparse
from imblearn.datasets import fetch_datasets
import imblearn.datasets._zenodo as zenodo
from imblearn.over_sampling import ADASYN
def trial_zenodo(name, sampling_strategy, n_neighbors, n_jobs, trials):
    """Time ADASYN resampling on one of the imblearn benchmark datasets.

    Parameters
    ----------
    name : str
        Key of the dataset in the mapping returned by ``fetch_datasets()``.
    sampling_strategy : str
        Forwarded to ``ADASYN(sampling_strategy=...)``.
    n_neighbors : int
        Forwarded to ``ADASYN(n_neighbors=...)``.
    n_jobs : int or None
        Forwarded to ``ADASYN(n_jobs=...)``.
    trials : int
        Number of times ``fit_resample`` is executed by ``timeit``.

    Returns
    -------
    float
        Total wall-clock seconds for all ``trials`` executions (``timeit``
        reports the sum, not the per-run average).
    """
    # Build the objects directly instead of interpolating values into a
    # source-code string: a value containing a quote can no longer break the
    # generated code, and nothing has to round-trip through repr().
    dataset = fetch_datasets()[name]
    X, y = dataset.data, dataset.target

    # sampling_strategy is passed by keyword; newer imbalanced-learn releases
    # make the sampler parameters keyword-only, and the keyword form also works
    # on older ones.
    # NOTE(review): recent imbalanced-learn releases reportedly deprecate
    # ADASYN's n_jobs -- confirm against the installed version.
    adasyn = ADASYN(
        sampling_strategy=sampling_strategy,
        n_neighbors=n_neighbors,
        n_jobs=n_jobs,
        random_state=0,
    )

    # Only the fit_resample call is timed; dataset loading and sampler
    # construction above stay outside the measurement, as before.
    return timeit(
        'adasyn.fit_resample(X, y)',
        globals={'adasyn': adasyn, 'X': X, 'y': y},
        number=trials,
    )
def trial_sparse(fmt, shape, density, p, sampling_strategy, n_neighbors, n_jobs, trials):
    """Time ADASYN resampling on a randomly generated sparse-origin matrix.

    Parameters
    ----------
    fmt : str
        Suffix of the conversion method applied to the generated matrix,
        i.e. ``X.to<fmt>()`` (the caller in this file uses ``'csr'`` and
        ``'array'``).
    shape : tuple of int
        ``(n_samples, n_features)`` of the generated matrix.
    density : float
        Density passed to ``scipy.sparse.random``.
    p : float
        Probability of drawing label ``0``; label ``1`` gets ``1 - p``.
    sampling_strategy : str
        Forwarded to ``ADASYN(sampling_strategy=...)``.
    n_neighbors : int
        Forwarded to ``ADASYN(n_neighbors=...)``.
    n_jobs : int or None
        Forwarded to ``ADASYN(n_jobs=...)``.
    trials : int
        Number of times ``fit_resample`` is executed by ``timeit``.

    Returns
    -------
    float
        Total wall-clock seconds for all ``trials`` executions (``timeit``
        reports the sum, not the per-run average).
    """
    # Fixed seed so every invocation benchmarks the same data. The labels are
    # drawn before the matrix so the random stream is consumed in the same
    # order as the original string-based setup.
    rng = np.random.RandomState(seed=0)
    y = rng.choice([0, 1], size=shape[0], p=[p, 1 - p])
    X = sparse.random(*shape, density=density, random_state=rng)
    # Equivalent to the former interpolated ``X.to{fmt}()`` call.
    X = getattr(X, f'to{fmt}')()

    # sampling_strategy is passed by keyword; newer imbalanced-learn releases
    # make the sampler parameters keyword-only, and the keyword form also works
    # on older ones.
    # NOTE(review): recent imbalanced-learn releases reportedly deprecate
    # ADASYN's n_jobs -- confirm against the installed version.
    adasyn = ADASYN(
        sampling_strategy=sampling_strategy,
        n_neighbors=n_neighbors,
        n_jobs=n_jobs,
        random_state=0,
    )

    # Only the fit_resample call is timed; data generation and sampler
    # construction above stay outside the measurement, as before.
    return timeit(
        'adasyn.fit_resample(X, y)',
        globals={'adasyn': adasyn, 'X': X, 'y': y},
        number=trials,
    )
def all_trials(file, sampling_strategy, n_neighbors, n_jobs, trials):
    """Run every benchmark and append labels and timings to ``file``.

    First times the datasets with ids 1 through 27 from
    ``zenodo.MAP_ID_NAME``, then synthetic matrices of three shapes, each
    with two (density, p) settings and in both ``csr`` and ``array`` form.
    For each benchmark a label line is written before the trial runs and a
    tab-prefixed timing line after it finishes.
    """
    def record(*values):
        # The file is reopened in append mode for every line (presumably so
        # that results already written survive a failure in a later trial).
        with open(file, 'a') as out:
            print(*values, file=out)

    for dataset_id in range(1, 28):
        dataset_name = zenodo.MAP_ID_NAME[dataset_id]
        record(dataset_id, dataset_name)
        elapsed = trial_zenodo(dataset_name, sampling_strategy, n_neighbors, n_jobs, trials)
        record('\t', elapsed)

    # density and p are varied together, not as a full cross product.
    paired_settings = [(0.01, 0.01), (0.1, 0.1)]
    configurations = (
        (fmt, shape, density, p)
        for shape in [(1_000, 10), (10_000, 50), (100_000, 100)]
        for density, p in paired_settings
        for fmt in ('csr', 'array')
    )
    for fmt, shape, density, p in configurations:
        record(f'{fmt[:3]}{shape}-d{density}-p{p}')
        elapsed = trial_sparse(fmt, shape, density, p, sampling_strategy, n_neighbors, n_jobs, trials)
        record('\t', elapsed)
def main():
    """Parse command-line options and run the complete benchmark suite.

    Results are appended to the file given by ``--file``; when it is omitted
    the file name is derived from the other option values. The parsed
    arguments are written as the first line of each run.
    """
    # The first positional parameter of ArgumentParser is ``prog``; the module
    # docstring belongs in ``description`` so the usage line keeps the real
    # program name.
    parser = argparse.ArgumentParser(description=zenodo.__doc__)
    parser.add_argument('--trials', '-t', default=3, type=int, help='number of trials for timeit')
    parser.add_argument('--n_jobs', '-j', default=None, type=int, help='n_jobs for ADASYN')
    # 'not majority' was listed twice and 'not minority' was missing.
    # NOTE(review): 'none' is kept for backward compatibility -- confirm that
    # ADASYN actually accepts it.
    choices = ['minority', 'not minority', 'not majority', 'all', 'auto', 'none']
    parser.add_argument('--sampling_strategy', '-s', default='auto', choices=choices, help='sampling_strategy for ADASYN')
    parser.add_argument('--n_neighbors', '-k', default=5, type=int, help='n_neighbors for ADASYN')
    parser.add_argument('--file', '-f', default=None, help='file to save results to')
    args = parser.parse_args()

    if args.file is None:
        # Encode the run configuration in the default output file name.
        file = f'tr{args.trials}-nj{args.n_jobs}-ss{args.sampling_strategy}-nn{args.n_neighbors}.txt'
    else:
        file = args.file

    # Header line identifying the configuration of this run.
    with open(file, 'a') as f:
        print(args, file=f)

    all_trials(file, args.sampling_strategy, args.n_neighbors, args.n_jobs, args.trials)
# Run the benchmark only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment