-
-
Save beckernick/0472922fd6a8897318ba6d714a3acb2b to your computer and use it in GitHub Desktop.
Imbalanced-learn cuML benchmarks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Run each imbalanced-learn / cuML benchmark script in turn.
# A failing script does not stop the remaining ones from running.
for script in \
    svmsmote-benchmark.py \
    smote-adasyn-benchmarks.py \
    condensednearestneighbour-benchmark.py \
    editednearestneighbors-benchmark.py
do
    python "$script"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import json | |
import os | |
from imblearn.under_sampling import CondensedNearestNeighbour | |
from sklearn.datasets import make_classification | |
from sklearn.neighbors import NearestNeighbors | |
import cuml | |
import numpy as np | |
cuml.common.logger.set_level(1) | |
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Attributes:
        tick: clock reading taken when the block is entered.
        tock: clock reading taken when the block is left.
        elapsed: ``tock - tick`` in seconds; only set once the block exits.
    """

    def __enter__(self):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be skewed by system clock adjustments the way a
        # time.time() difference can.
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick
        # Implicitly returns None, so exceptions from the body propagate.
# Warmup: exercise cuML on a tiny random problem before any timing starts
# (presumably to absorb one-time GPU/library initialisation cost -- confirm).
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)

nn = cuml.neighbors.NearestNeighbors()
nn = nn.fit(X)
nn.kneighbors(X)

clf = cuml.svm.SVC()
clf = clf.fit(X, y)
# PARAMS
# Set SKIP_SKLEARN to True to benchmark only the cuML backend.
SKIP_SKLEARN = False
BACKENDS = ["cuml"] if SKIP_SKLEARN else ["cuml", "sklearn"]

# Name stored in the "technique" field of every result row.
TECHNIQUES = ["CondensedNearestNeighbour"]

# Dataset grid: sample counts and feature counts to sweep over.
NROWS = [1000, 5000, 10000, 50000]
NFEATURES = [5, 20, 100]

# Number of classes -> per-class weights handed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Results are appended line by line, so remove output from a previous run.
outpath = "imblearn-condensednearestneighbour-benchmark-results.jsonl"
if os.path.exists(outpath):
    os.remove(outpath)
# Benchmark sweep: technique x rows x features x class layout x backend.
# One JSON line per run is printed and appended to `outpath`.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Generate the (unseeded) dataset once per configuration so
                # that every backend is timed on exactly the same samples.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )

                for library in BACKENDS:
                    # Estimator construction stays inside the timed region,
                    # so the recorded time covers setup plus resampling.
                    with Timer() as resample_timer:
                        if library == "cuml":
                            knn = cuml.neighbors.KNeighborsClassifier(n_neighbors=1)
                            cnn = CondensedNearestNeighbour(n_neighbors=knn)
                            cnn.estimator_ = knn  # extra step for this estimator
                            cnn.fit_resample(X, y)
                        else:
                            CondensedNearestNeighbour().fit_resample(X, y)

                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)

                    # Append immediately so finished runs survive a crash.
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload) + "\n")

                    time.sleep(0.5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import json | |
import os | |
from imblearn.under_sampling import EditedNearestNeighbours | |
from sklearn.datasets import make_classification | |
from sklearn.neighbors import NearestNeighbors | |
import cuml | |
import numpy as np | |
cuml.common.logger.set_level(1) | |
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Attributes:
        tick: clock reading taken when the block is entered.
        tock: clock reading taken when the block is left.
        elapsed: ``tock - tick`` in seconds; only set once the block exits.
    """

    def __enter__(self):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be skewed by system clock adjustments the way a
        # time.time() difference can.
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick
        # Implicitly returns None, so exceptions from the body propagate.
# Warmup: exercise cuML on a tiny random problem before any timing starts
# (presumably to absorb one-time GPU/library initialisation cost -- confirm).
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)

nn = cuml.neighbors.NearestNeighbors()
nn = nn.fit(X)
nn.kneighbors(X)

clf = cuml.svm.SVC()
clf = clf.fit(X, y)
# PARAMS
# Set SKIP_SKLEARN to True to benchmark only the cuML backend.
SKIP_SKLEARN = False
BACKENDS = ["cuml"] if SKIP_SKLEARN else ["cuml", "sklearn"]

# Name stored in the "technique" field of every result row.
TECHNIQUES = ["EditedNearestNeighbours"]

# Dataset grid: sample counts and feature counts to sweep over.
NROWS = [100000, 500000, 1000000]
NFEATURES = [5, 20, 100]

# Number of classes -> per-class weights handed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Results are appended line by line, so remove output from a previous run.
outpath = "imblearn-editednearestneighbors-benchmark-results.jsonl"
if os.path.exists(outpath):
    os.remove(outpath)
# Benchmark sweep: technique x rows x features x class layout x backend.
# One JSON line per run is printed and appended to `outpath`.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Generate the (unseeded) dataset once per configuration so
                # that every backend is timed on exactly the same samples.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )

                for library in BACKENDS:
                    # Neighbour-search construction stays inside the timed
                    # region, so the recorded time covers setup + resampling.
                    with Timer() as resample_timer:
                        if library == "cuml":
                            nn = cuml.neighbors.NearestNeighbors(n_neighbors=4)
                        else:
                            nn = NearestNeighbors(n_neighbors=4, n_jobs=-1)
                        # The resampled data is not needed, only the timing.
                        EditedNearestNeighbours(n_neighbors=nn).fit_resample(X, y)

                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)

                    # Append immediately so finished runs survive a crash.
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload) + "\n")

                    time.sleep(0.5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import json | |
import os | |
from imblearn.over_sampling import SMOTE, ADASYN | |
from sklearn.datasets import make_classification | |
from sklearn.neighbors import NearestNeighbors | |
import cuml | |
import numpy as np | |
cuml.common.logger.set_level(1) | |
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Attributes:
        tick: clock reading taken when the block is entered.
        tock: clock reading taken when the block is left.
        elapsed: ``tock - tick`` in seconds; only set once the block exits.
    """

    def __enter__(self):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be skewed by system clock adjustments the way a
        # time.time() difference can.
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick
        # Implicitly returns None, so exceptions from the body propagate.
# Warmup: exercise cuML on a tiny random problem before any timing starts
# (presumably to absorb one-time GPU/library initialisation cost -- confirm).
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)

nn = cuml.neighbors.NearestNeighbors()
nn = nn.fit(X)
nn.kneighbors(X)
# PARAMS
# Set SKIP_SKLEARN to True to benchmark only the cuML backend.
SKIP_SKLEARN = False
BACKENDS = ["cuml"] if SKIP_SKLEARN else ["cuml", "sklearn"]

# Over-sampling techniques to time; also stored in each result row.
TECHNIQUES = ["SMOTE", "ADASYN"]

# Dataset grid: sample counts and feature counts to sweep over.
NROWS = [100000, 1000000, 10000000]
NFEATURES = [5, 20, 100]

# Number of classes -> per-class weights handed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Results are appended line by line, so remove output from a previous run.
outpath = "imblearn-smote-adasyn-cuml-benchmark-results.jsonl"
if os.path.exists(outpath):
    os.remove(outpath)
# Benchmark sweep: technique x rows x features x class layout x backend.
# One JSON line per run is printed and appended to `outpath`.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Generate the (unseeded) dataset once per configuration so
                # that every backend is timed on exactly the same samples.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )

                for library in BACKENDS:
                    # The neighbour-search object is built outside the timed
                    # region; only sampler construction + resampling is timed.
                    if library == "cuml":
                        nn = cuml.neighbors.NearestNeighbors(n_neighbors=6)
                    else:
                        nn = NearestNeighbors(n_neighbors=6, n_jobs=-1)

                    with Timer() as resample_timer:
                        if technique == "SMOTE":
                            sampler = SMOTE(k_neighbors=nn)
                        elif technique == "ADASYN":
                            sampler = ADASYN(n_neighbors=nn)
                        else:
                            # Fail loudly instead of recording a meaningless
                            # near-zero time for a technique that never ran.
                            raise ValueError(f"Unknown technique: {technique!r}")
                        sampler.fit_resample(X, y)

                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)

                    # Append immediately so finished runs survive a crash.
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload) + "\n")

                    time.sleep(0.5)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import json | |
import os | |
from imblearn.over_sampling import SVMSMOTE | |
from sklearn.datasets import make_classification | |
from sklearn.neighbors import NearestNeighbors | |
import cuml | |
import numpy as np | |
cuml.common.logger.set_level(1) | |
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Attributes:
        tick: clock reading taken when the block is entered.
        tock: clock reading taken when the block is left.
        elapsed: ``tock - tick`` in seconds; only set once the block exits.
    """

    def __enter__(self):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be skewed by system clock adjustments the way a
        # time.time() difference can.
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick
        # Implicitly returns None, so exceptions from the body propagate.
# Warmup: exercise cuML on a tiny random problem before any timing starts
# (presumably to absorb one-time GPU/library initialisation cost -- confirm).
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)

nn = cuml.neighbors.NearestNeighbors()
nn = nn.fit(X)
nn.kneighbors(X)

clf = cuml.svm.SVC()
clf = clf.fit(X, y)
# PARAMS
# Set SKIP_SKLEARN to True to benchmark only the cuML backend.
SKIP_SKLEARN = False
BACKENDS = ["cuml"] if SKIP_SKLEARN else ["cuml", "sklearn"]

# Name stored in the "technique" field of every result row.
TECHNIQUES = ["SVMSMOTE"]

# Dataset grid: sample counts and feature counts to sweep over.
NROWS = [100000, 500000, 1000000]
NFEATURES = [5, 20, 100]

# Number of classes -> per-class weights handed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Results are appended line by line, so remove output from a previous run.
outpath = "imblearn-svmsmote-benchmark-results.jsonl"
if os.path.exists(outpath):
    os.remove(outpath)
# Benchmark sweep: technique x rows x features x class layout x backend.
# One JSON line per run is printed and appended to `outpath`.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Generate the (unseeded) dataset once per configuration so
                # that every backend is timed on exactly the same samples.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )

                for library in BACKENDS:
                    # Estimators are built once, inside the timed region, so
                    # the recorded time covers setup plus resampling.
                    with Timer() as resample_timer:
                        if library == "cuml":
                            nn = cuml.neighbors.NearestNeighbors(n_neighbors=6)
                            svm = cuml.svm.SVC()
                            sampler = SVMSMOTE(
                                k_neighbors=nn,
                                m_neighbors=nn,
                                svm_estimator=svm,
                            )
                        else:
                            nn = NearestNeighbors(n_neighbors=6, n_jobs=-1)
                            # No svm_estimator: SVMSMOTE uses its default.
                            sampler = SVMSMOTE(k_neighbors=nn, m_neighbors=nn)
                        sampler.fit_resample(X, y)

                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)

                    # Append immediately so finished runs survive a crash.
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload) + "\n")

                    time.sleep(0.5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment