Skip to content

Instantly share code, notes, and snippets.

@beckernick
Last active February 13, 2023 21:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save beckernick/0472922fd6a8897318ba6d714a3acb2b to your computer and use it in GitHub Desktop.
Save beckernick/0472922fd6a8897318ba6d714a3acb2b to your computer and use it in GitHub Desktop.
Imbalanced-learn cuML benchmarks
#!/bin/bash
# Run every benchmark script one after another, in a fixed order.
for script in \
    svmsmote-benchmark.py \
    smote-adasyn-benchmarks.py \
    condensednearestneighbour-benchmark.py \
    editednearestneighbors-benchmark.py
do
    python "$script"
done
import time
import json
import os
from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
import cuml
import numpy as np
# Set the cuML logger level to 1 before any cuML estimator is used.
# NOTE(review): presumably this quiets cuML log output during the benchmark;
# confirm what level 1 maps to in the installed cuML version.
cuml.common.logger.set_level(1)
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Attributes set during use:
        tick: clock reading taken when the block is entered.
        tock: clock reading taken when the block is exited.
        elapsed: ``tock - tick``, in seconds.

    The readings come from ``time.perf_counter()``, a monotonic
    high-resolution clock, so ``elapsed`` cannot be skewed by system clock
    adjustments. ``tick``/``tock`` are therefore only meaningful relative to
    each other, not as wall-clock timestamps.
    """

    def __enter__(self):
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        # Returns None, so an exception raised in the body still propagates.
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick
# Warmup: fit and query small cuML models on a tiny random dataset.
# NOTE(review): presumably this keeps one-time GPU/library start-up cost out
# of the first timed benchmark run; confirm.
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)
nn = cuml.neighbors.NearestNeighbors()
nn.fit(X)
nn.kneighbors(X)
clf = cuml.svm.SVC()
clf.fit(X, y)
# PARAMS
# Set to True to drop the scikit-learn backend from the sweep.
SKIP_SKLEARN = False

# Backends to benchmark, in run order.
BACKENDS = [
    "cuml",
    "sklearn",
]
if SKIP_SKLEARN:
    BACKENDS.remove("sklearn")

TECHNIQUES = [
    "CondensedNearestNeighbour",
]
NROWS = [
    1000,
    5000,
    10000,
    50000,
]
NFEATURES = [
    5,
    20,
    100,
]
# Number of classes -> per-class weights handed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Plain string literal: the name has no placeholders, so no f-prefix.
outpath = "imblearn-condensednearestneighbour-benchmark-results.jsonl"

# Results are appended line by line, so start from a clean file. Removing
# unconditionally and tolerating "not found" avoids an exists/remove race.
try:
    os.remove(outpath)
except FileNotFoundError:
    pass
# Benchmark sweep: one timed fit_resample per (technique, rows, features,
# classes, backend) combination; each result is appended to `outpath` as one
# JSON line.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Build the dataset once per configuration so that every
                # backend is timed on identical data. make_classification is
                # called without a seed, so regenerating it per backend would
                # compare the backends on different random datasets.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )
                for library in BACKENDS:
                    with Timer() as resample_timer:
                        if library == "cuml":
                            knn = cuml.neighbors.KNeighborsClassifier(n_neighbors=1)
                            sampler = CondensedNearestNeighbour(n_neighbors=knn)
                            sampler.estimator_ = knn  # extra step for this estimator
                        else:
                            sampler = CondensedNearestNeighbour()
                        X_res, y_res = sampler.fit_resample(X, y)

                    # elapsed is only available after the Timer block exits.
                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload))
                        fh.write("\n")
                    time.sleep(0.5)
import time
import json
import os
from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
import cuml
import numpy as np
# Set the cuML logger level to 1 before any cuML estimator is used.
# NOTE(review): presumably this quiets cuML log output during the benchmark;
# confirm what level 1 maps to in the installed cuML version.
cuml.common.logger.set_level(1)
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Attributes set during use:
        tick: clock reading taken when the block is entered.
        tock: clock reading taken when the block is exited.
        elapsed: ``tock - tick``, in seconds.

    The readings come from ``time.perf_counter()``, a monotonic
    high-resolution clock, so ``elapsed`` cannot be skewed by system clock
    adjustments. ``tick``/``tock`` are therefore only meaningful relative to
    each other, not as wall-clock timestamps.
    """

    def __enter__(self):
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        # Returns None, so an exception raised in the body still propagates.
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick
# Warmup: fit and query small cuML models on a tiny random dataset.
# NOTE(review): presumably this keeps one-time GPU/library start-up cost out
# of the first timed benchmark run; confirm.
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)
nn = cuml.neighbors.NearestNeighbors()
nn.fit(X)
nn.kneighbors(X)
clf = cuml.svm.SVC()
clf.fit(X, y)
# PARAMS
# Set to True to drop the scikit-learn backend from the sweep.
SKIP_SKLEARN = False

# Backends to benchmark, in run order.
BACKENDS = [
    "cuml",
    "sklearn",
]
if SKIP_SKLEARN:
    BACKENDS.remove("sklearn")

TECHNIQUES = [
    "EditedNearestNeighbours",
]
NROWS = [
    100000,
    500000,
    1000000,
]
NFEATURES = [
    5,
    20,
    100,
]
# Number of classes -> per-class weights handed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Plain string literal: the name has no placeholders, so no f-prefix.
outpath = "imblearn-editednearestneighbors-benchmark-results.jsonl"

# Results are appended line by line, so start from a clean file. Removing
# unconditionally and tolerating "not found" avoids an exists/remove race.
try:
    os.remove(outpath)
except FileNotFoundError:
    pass
# Benchmark sweep: one timed fit_resample per (technique, rows, features,
# classes, backend) combination; each result is appended to `outpath` as one
# JSON line.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Build the dataset once per configuration so that every
                # backend is timed on identical data. make_classification is
                # called without a seed, so regenerating it per backend would
                # compare the backends on different random datasets.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )
                for library in BACKENDS:
                    with Timer() as resample_timer:
                        # Only the neighbours object differs between backends;
                        # the sampler is built and run the same way for both.
                        if library == "cuml":
                            nn = cuml.neighbors.NearestNeighbors(n_neighbors=4)
                        else:
                            nn = NearestNeighbors(n_neighbors=4, n_jobs=-1)
                        enn = EditedNearestNeighbours(n_neighbors=nn)
                        X_res, y_res = enn.fit_resample(X, y)

                    # elapsed is only available after the Timer block exits.
                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload))
                        fh.write("\n")
                    time.sleep(0.5)
import time
import json
import os
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
import cuml
import numpy as np
# Set the cuML logger level to 1 before any cuML estimator is used.
# NOTE(review): presumably this quiets cuML log output during the benchmark;
# confirm what level 1 maps to in the installed cuML version.
cuml.common.logger.set_level(1)
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Attributes set during use:
        tick: clock reading taken when the block is entered.
        tock: clock reading taken when the block is exited.
        elapsed: ``tock - tick``, in seconds.

    The readings come from ``time.perf_counter()``, a monotonic
    high-resolution clock, so ``elapsed`` cannot be skewed by system clock
    adjustments. ``tick``/``tock`` are therefore only meaningful relative to
    each other, not as wall-clock timestamps.
    """

    def __enter__(self):
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        # Returns None, so an exception raised in the body still propagates.
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick
# Warmup: fit and query a small cuML neighbours model on a tiny random dataset.
# NOTE(review): presumably this keeps one-time GPU/library start-up cost out
# of the first timed benchmark run; confirm.
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)
nn = cuml.neighbors.NearestNeighbors()
nn.fit(X)
nn.kneighbors(X)
# PARAMS
# Set to True to drop the scikit-learn backend from the sweep.
SKIP_SKLEARN = False

# Backends to benchmark, in run order.
BACKENDS = [
    "cuml",
    "sklearn",
]
if SKIP_SKLEARN:
    BACKENDS.remove("sklearn")

TECHNIQUES = [
    "SMOTE",
    "ADASYN",
]
NROWS = [
    100000,
    1000000,
    10000000,
]
NFEATURES = [
    5,
    20,
    100,
]
# Number of classes -> per-class weights handed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Plain string literal: the name has no placeholders, so no f-prefix.
outpath = "imblearn-smote-adasyn-cuml-benchmark-results.jsonl"

# Results are appended line by line, so start from a clean file. Removing
# unconditionally and tolerating "not found" avoids an exists/remove race.
try:
    os.remove(outpath)
except FileNotFoundError:
    pass
# Benchmark sweep: one timed fit_resample per (technique, rows, features,
# classes, backend) combination; each result is appended to `outpath` as one
# JSON line.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Build the dataset once per configuration so that every
                # backend is timed on identical data. make_classification is
                # called without a seed, so regenerating it per backend would
                # compare the backends on different random datasets.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )
                for library in BACKENDS:
                    # The neighbours object is built outside the timed region.
                    if library == "cuml":
                        nn = cuml.neighbors.NearestNeighbors(n_neighbors=6)
                    else:
                        nn = NearestNeighbors(n_neighbors=6, n_jobs=-1)

                    with Timer() as resample_timer:
                        if technique == "SMOTE":
                            sampler = SMOTE(k_neighbors=nn)
                        elif technique == "ADASYN":
                            sampler = ADASYN(n_neighbors=nn)
                        else:
                            # Fail loudly instead of recording a meaningless
                            # near-zero timing for an unhandled technique.
                            raise ValueError(f"Unsupported technique: {technique!r}")
                        X_resampled, y_resampled = sampler.fit_resample(X, y)

                    # elapsed is only available after the Timer block exits.
                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload))
                        fh.write("\n")
                    time.sleep(0.5)
import time
import json
import os
from imblearn.over_sampling import SVMSMOTE
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
import cuml
import numpy as np
# Set the cuML logger level to 1 before any cuML estimator is used.
# NOTE(review): presumably this quiets cuML log output during the benchmark;
# confirm what level 1 maps to in the installed cuML version.
cuml.common.logger.set_level(1)
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Attributes set during use:
        tick: clock reading taken when the block is entered.
        tock: clock reading taken when the block is exited.
        elapsed: ``tock - tick``, in seconds.

    The readings come from ``time.perf_counter()``, a monotonic
    high-resolution clock, so ``elapsed`` cannot be skewed by system clock
    adjustments. ``tick``/``tock`` are therefore only meaningful relative to
    each other, not as wall-clock timestamps.
    """

    def __enter__(self):
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        # Returns None, so an exception raised in the body still propagates.
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick
# Warmup: fit and query small cuML models on a tiny random dataset.
# NOTE(review): presumably this keeps one-time GPU/library start-up cost out
# of the first timed benchmark run; confirm.
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)
nn = cuml.neighbors.NearestNeighbors()
nn.fit(X)
nn.kneighbors(X)
clf = cuml.svm.SVC()
clf.fit(X, y)
# PARAMS
# Set to True to drop the scikit-learn backend from the sweep.
SKIP_SKLEARN = False

# Backends to benchmark, in run order.
BACKENDS = [
    "cuml",
    "sklearn",
]
if SKIP_SKLEARN:
    BACKENDS.remove("sklearn")

TECHNIQUES = [
    "SVMSMOTE",
]
NROWS = [
    100000,
    500000,
    1000000,
]
NFEATURES = [
    5,
    20,
    100,
]
# Number of classes -> per-class weights handed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Plain string literal: the name has no placeholders, so no f-prefix.
outpath = "imblearn-svmsmote-benchmark-results.jsonl"

# Results are appended line by line, so start from a clean file. Removing
# unconditionally and tolerating "not found" avoids an exists/remove race.
try:
    os.remove(outpath)
except FileNotFoundError:
    pass
# Benchmark sweep: one timed fit_resample per (technique, rows, features,
# classes, backend) combination; each result is appended to `outpath` as one
# JSON line.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Build the dataset once per configuration so that every
                # backend is timed on identical data. make_classification is
                # called without a seed, so regenerating it per backend would
                # compare the backends on different random datasets.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )
                for library in BACKENDS:
                    # The neighbours object is created once, inside the timed
                    # region as before; the earlier untimed duplicate that was
                    # immediately overwritten has been dropped.
                    with Timer() as resample_timer:
                        if library == "cuml":
                            nn = cuml.neighbors.NearestNeighbors(n_neighbors=6)
                            svm = cuml.svm.SVC()
                            sampler = SVMSMOTE(
                                k_neighbors=nn, m_neighbors=nn, svm_estimator=svm
                            )
                        else:
                            nn = NearestNeighbors(n_neighbors=6, n_jobs=-1)
                            sampler = SVMSMOTE(
                                k_neighbors=nn,
                                m_neighbors=nn,
                            )
                        X_resampled, y_resampled = sampler.fit_resample(X, y)

                    # elapsed is only available after the Timer block exits.
                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload))
                        fh.write("\n")
                    time.sleep(0.5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment