beckernick/benchmark.sh Secret

## benchmark.sh
#!/bin/bash
# Run each benchmark script in turn. There is no `set -e`, so a failing
# script does not prevent the remaining ones from running.

for script in \
    svmsmote-benchmark.py \
    smote-adasyn-benchmarks.py \
    condensednearestneighbour-benchmark.py \
    editednearestneighbors-benchmark.py
do
    python "$script"
done

## condensednearestneighbour-benchmark.py
import time
import json
import os

from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
import cuml
import numpy as np

cuml.common.logger.set_level(1)

class Timer:
    """Context manager that times the body of its ``with`` block.

    After the block exits, ``elapsed`` holds the duration in seconds.
    ``tick`` and ``tock`` are the raw clock readings taken on entry and exit;
    they are only meaningful relative to each other.
    """

    def __enter__(self):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        # Returns None, so an exception raised in the body still propagates;
        # the elapsed time is recorded either way.
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick


# Warmup
# Exercise a cuML nearest-neighbours query and an SVC fit on a tiny random
# dataset before any timing starts (presumably so one-time start-up cost is
# not charged to the first benchmark run — TODO confirm).
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)
nn = cuml.neighbors.NearestNeighbors()
nn = nn.fit(X)
nn.kneighbors(X)
clf = cuml.svm.SVC()
clf = clf.fit(X, y)


# PARAMS
# Flip to True to leave the scikit-learn backend out of the sweep.
SKIP_SKLEARN = False

BACKENDS = ["cuml"] if SKIP_SKLEARN else ["cuml", "sklearn"]

TECHNIQUES = ["CondensedNearestNeighbour"]

# Row counts and feature counts of the generated datasets.
NROWS = [1000, 5000, 10000, 50000]
NFEATURES = [5, 20, 100]

# Number of classes -> class weights passed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Results file (JSON lines); a leftover file from a previous run is deleted.
outpath = "imblearn-condensednearestneighbour-benchmark-results.jsonl"
if os.path.exists(outpath):
    os.remove(outpath)


# Benchmark sweep: for each technique and dataset shape, time
# CondensedNearestNeighbour.fit_resample once per backend and append one JSON
# record per run to `outpath`.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Generate the dataset once per configuration so every backend
                # is timed on exactly the same samples. No random_state is
                # set, so regenerating it per backend would hand each library
                # different data.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )

                for library in BACKENDS:
                    # Estimator construction is part of the timed region.
                    with Timer() as resample_timer:
                        if library == "cuml":
                            knn = cuml.neighbors.KNeighborsClassifier(n_neighbors=1)
                            cnn = CondensedNearestNeighbour(n_neighbors=knn)
                            # Extra step for this estimator: pre-assign the
                            # cuML classifier as `estimator_`.
                            # NOTE(review): presumably needed so imblearn uses
                            # the cuML object as-is — confirm.
                            cnn.estimator_ = knn
                        else:
                            cnn = CondensedNearestNeighbour()
                        X_res, y_res = cnn.fit_resample(X, y)

                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)

                    # Each record is appended as soon as it is produced.
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload))
                        fh.write("\n")

                    time.sleep(0.5)

## editednearestneighbors-benchmark.py
import time
import json
import os

from imblearn.under_sampling import EditedNearestNeighbours
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
import cuml
import numpy as np

cuml.common.logger.set_level(1)

class Timer:
    """Context manager that times the body of its ``with`` block.

    After the block exits, ``elapsed`` holds the duration in seconds.
    ``tick`` and ``tock`` are the raw clock readings taken on entry and exit;
    they are only meaningful relative to each other.
    """

    def __enter__(self):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        # Returns None, so an exception raised in the body still propagates;
        # the elapsed time is recorded either way.
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick


# Warmup
# Exercise a cuML nearest-neighbours query and an SVC fit on a tiny random
# dataset before any timing starts (presumably so one-time start-up cost is
# not charged to the first benchmark run — TODO confirm).
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)
nn = cuml.neighbors.NearestNeighbors()
nn = nn.fit(X)
nn.kneighbors(X)
clf = cuml.svm.SVC()
clf = clf.fit(X, y)


# PARAMS
# Flip to True to leave the scikit-learn backend out of the sweep.
SKIP_SKLEARN = False

BACKENDS = ["cuml"] if SKIP_SKLEARN else ["cuml", "sklearn"]

TECHNIQUES = ["EditedNearestNeighbours"]

# Row counts and feature counts of the generated datasets.
NROWS = [100000, 500000, 1000000]
NFEATURES = [5, 20, 100]

# Number of classes -> class weights passed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Results file (JSON lines); a leftover file from a previous run is deleted.
outpath = "imblearn-editednearestneighbors-benchmark-results.jsonl"
if os.path.exists(outpath):
    os.remove(outpath)


# Benchmark sweep: for each technique and dataset shape, time
# EditedNearestNeighbours.fit_resample once per backend and append one JSON
# record per run to `outpath`.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Generate the dataset once per configuration so every backend
                # is timed on exactly the same samples. No random_state is
                # set, so regenerating it per backend would hand each library
                # different data.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )

                for library in BACKENDS:
                    # Neighbour-object construction is part of the timed
                    # region; only the implementation differs per backend.
                    with Timer() as resample_timer:
                        if library == "cuml":
                            nn = cuml.neighbors.NearestNeighbors(n_neighbors=4)
                        else:
                            nn = NearestNeighbors(n_neighbors=4, n_jobs=-1)
                        sampler = EditedNearestNeighbours(n_neighbors=nn)
                        X_res, y_res = sampler.fit_resample(X, y)

                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)

                    # Each record is appended as soon as it is produced.
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload))
                        fh.write("\n")

                    time.sleep(0.5)

## smote-adasyn-benchmarks.py
import time
import json
import os

from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
import cuml
import numpy as np

cuml.common.logger.set_level(1)

class Timer:
    """Context manager that times the body of its ``with`` block.

    After the block exits, ``elapsed`` holds the duration in seconds.
    ``tick`` and ``tock`` are the raw clock readings taken on entry and exit;
    they are only meaningful relative to each other.
    """

    def __enter__(self):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        # Returns None, so an exception raised in the body still propagates;
        # the elapsed time is recorded either way.
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick


# Warmup
# Exercise a cuML nearest-neighbours fit and query on a tiny random dataset
# before any timing starts (presumably so one-time start-up cost is not
# charged to the first benchmark run — TODO confirm).
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)
nn = cuml.neighbors.NearestNeighbors()
nn = nn.fit(X)
nn.kneighbors(X)


# PARAMS
# Flip to True to leave the scikit-learn backend out of the sweep.
SKIP_SKLEARN = False

BACKENDS = ["cuml"] if SKIP_SKLEARN else ["cuml", "sklearn"]

TECHNIQUES = ["SMOTE", "ADASYN"]

# Row counts and feature counts of the generated datasets.
NROWS = [100000, 1000000, 10000000]
NFEATURES = [5, 20, 100]

# Number of classes -> class weights passed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Results file (JSON lines); a leftover file from a previous run is deleted.
outpath = "imblearn-smote-adasyn-cuml-benchmark-results.jsonl"
if os.path.exists(outpath):
    os.remove(outpath)


# Benchmark sweep: for each technique and dataset shape, time the
# over-sampler's fit_resample once per backend and append one JSON record per
# run to `outpath`.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Generate the dataset once per configuration so every backend
                # is timed on exactly the same samples. No random_state is
                # set, so regenerating it per backend would hand each library
                # different data.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )

                for library in BACKENDS:
                    # The neighbour-search object is built outside the timed
                    # region; only the implementation differs per backend.
                    if library == "cuml":
                        nn = cuml.neighbors.NearestNeighbors(n_neighbors=6)
                    else:
                        nn = NearestNeighbors(n_neighbors=6, n_jobs=-1)

                    with Timer() as resample_timer:
                        if technique == "SMOTE":
                            sampler = SMOTE(k_neighbors=nn)
                        elif technique == "ADASYN":
                            sampler = ADASYN(n_neighbors=nn)
                        else:
                            # Fail loudly instead of recording a near-zero
                            # time for a technique that never ran.
                            raise ValueError(f"Unsupported technique: {technique!r}")
                        X_resampled, y_resampled = sampler.fit_resample(X, y)

                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)

                    # Each record is appended as soon as it is produced.
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload))
                        fh.write("\n")

                    time.sleep(0.5)

## svmsmote-benchmark.py
import time
import json
import os

from imblearn.over_sampling import SVMSMOTE
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
import cuml
import numpy as np

cuml.common.logger.set_level(1)

class Timer:
    """Context manager that times the body of its ``with`` block.

    After the block exits, ``elapsed`` holds the duration in seconds.
    ``tick`` and ``tock`` are the raw clock readings taken on entry and exit;
    they are only meaningful relative to each other.
    """

    def __enter__(self):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        # Returns None, so an exception raised in the body still propagates;
        # the elapsed time is recorded either way.
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick


# Warmup
# Exercise a cuML nearest-neighbours query and an SVC fit on a tiny random
# dataset before any timing starts (presumably so one-time start-up cost is
# not charged to the first benchmark run — TODO confirm).
X = np.random.normal(loc=0, scale=10, size=(100, 20))
y = np.random.randint(low=0, high=2, size=100)
nn = cuml.neighbors.NearestNeighbors()
nn = nn.fit(X)
nn.kneighbors(X)
clf = cuml.svm.SVC()
clf = clf.fit(X, y)


# PARAMS
# Flip to True to leave the scikit-learn backend out of the sweep.
SKIP_SKLEARN = False

BACKENDS = ["cuml"] if SKIP_SKLEARN else ["cuml", "sklearn"]

TECHNIQUES = ["SVMSMOTE"]

# Row counts and feature counts of the generated datasets.
NROWS = [100000, 500000, 1000000]
NFEATURES = [5, 20, 100]

# Number of classes -> class weights passed to make_classification.
N_CLASSES_AND_WEIGHTS = {
    2: [0.9, 0.1],
    5: [0.8, 0.05, 0.05, 0.05, 0.05],
}

# Results file (JSON lines); a leftover file from a previous run is deleted.
outpath = "imblearn-svmsmote-benchmark-results.jsonl"
if os.path.exists(outpath):
    os.remove(outpath)


# Benchmark sweep: for each technique and dataset shape, time
# SVMSMOTE.fit_resample once per backend and append one JSON record per run
# to `outpath`.
for technique in TECHNIQUES:
    for n in NROWS:
        for k in NFEATURES:
            for c, w in N_CLASSES_AND_WEIGHTS.items():
                # Generate the dataset once per configuration so every backend
                # is timed on exactly the same samples. No random_state is
                # set, so regenerating it per backend would hand each library
                # different data.
                X, y = make_classification(
                    n_samples=n,
                    n_features=k,
                    n_redundant=0,
                    n_informative=k,
                    n_classes=c,
                    n_clusters_per_class=1,
                    weights=w,
                )

                for library in BACKENDS:
                    # Estimators are built inside the timed region. The
                    # earlier untimed construction of `nn` was dead code: it
                    # was always overwritten here before use.
                    with Timer() as resample_timer:
                        if library == "cuml":
                            nn = cuml.neighbors.NearestNeighbors(n_neighbors=6)
                            svm = cuml.svm.SVC()
                            sampler = SVMSMOTE(
                                k_neighbors=nn, m_neighbors=nn, svm_estimator=svm
                            )
                        else:
                            nn = NearestNeighbors(n_neighbors=6, n_jobs=-1)
                            sampler = SVMSMOTE(k_neighbors=nn, m_neighbors=nn)
                        X_resampled, y_resampled = sampler.fit_resample(X, y)

                    benchmark_payload = {
                        "technique": technique,
                        "backend": library,
                        "nrows": n,
                        "nfeatures": k,
                        "nclasses": c,
                        "weights": w,
                        "resample_time": resample_timer.elapsed,
                    }
                    print(benchmark_payload)

                    # Each record is appended as soon as it is produced.
                    with open(outpath, "a") as fh:
                        fh.write(json.dumps(benchmark_payload))
                        fh.write("\n")

                    time.sleep(0.5)
	#!/bin/bash
	# Run each benchmark script in turn. There is no `set -e`, so a failing
	# script does not prevent the remaining ones from running.

	for script in \
	    svmsmote-benchmark.py \
	    smote-adasyn-benchmarks.py \
	    condensednearestneighbour-benchmark.py \
	    editednearestneighbors-benchmark.py
	do
	    python "$script"
	done
	import time
	import json
	import os

	from imblearn.under_sampling import CondensedNearestNeighbour
	from sklearn.datasets import make_classification
	from sklearn.neighbors import NearestNeighbors
	import cuml
	import numpy as np

	cuml.common.logger.set_level(1)


	class Timer:
	    """Context manager that times the body of its ``with`` block.

	    After the block exits, ``elapsed`` holds the duration in seconds.
	    """

	    def __enter__(self):
	        self.tick = time.time()
	        return self

	    def __exit__(self, *args, **kwargs):
	        # Returns None, so exceptions raised in the body still propagate.
	        self.tock = time.time()
	        self.elapsed = self.tock - self.tick


	# Warmup
	X = np.random.normal(0, 10, (100, 20))
	y = np.random.randint(0, 2, 100)
	nn = cuml.neighbors.NearestNeighbors().fit(X)
	nn.kneighbors(X)
	clf = cuml.svm.SVC().fit(X,y)


	# PARAMS
	SKIP_SKLEARN = False

	BACKENDS = [
	    "cuml",
	    "sklearn",
	]

	if SKIP_SKLEARN:
	    BACKENDS.remove("sklearn")

	TECHNIQUES = [
	    "CondensedNearestNeighbour",
	]

	NROWS = [
	    1000,
	    5000,
	    10000,
	    50000,
	]

	NFEATURES = [
	    5,
	    20,
	    100,
	]

	# Number of classes -> class weights passed to make_classification.
	N_CLASSES_AND_WEIGHTS = {
	    2: [0.9, 0.1],
	    5: [0.8, 0.05, 0.05, 0.05, 0.05]
	}

	# Results file (JSON lines); a leftover file from a previous run is deleted.
	outpath = f"imblearn-condensednearestneighbour-benchmark-results.jsonl"
	if os.path.exists(outpath):
	    os.remove(outpath)


	# Benchmark sweep: one timed fit_resample per configuration and backend,
	# with one JSON record appended to `outpath` per run.
	for technique in TECHNIQUES:
	    for n in NROWS:
	        for k in NFEATURES:
	            for c in N_CLASSES_AND_WEIGHTS.keys():
	                for library in BACKENDS:
	                    w = N_CLASSES_AND_WEIGHTS.get(c)
	                    X, y = make_classification(
	                        n_samples=n,
	                        n_features=k,
	                        n_redundant=0,
	                        n_informative=k,
	                        n_classes=c,
	                        n_clusters_per_class=1,
	                        weights=w
	                    )

	                    benchmark_payload = {}
	                    with Timer() as resample_timer:
	                        if library == "cuml":
	                            knn = cuml.neighbors.KNeighborsClassifier(n_neighbors=1)
	                            enn = CondensedNearestNeighbour(n_neighbors=knn)
	                            enn.estimator_ = knn # extra step for this estimator
	                            X_res, y_res = enn.fit_resample(X, y)

	                        else:
	                            enn = CondensedNearestNeighbour()
	                            X_res, y_res = enn.fit_resample(X, y)

	                    benchmark_payload["technique"] = technique
	                    benchmark_payload["backend"] = library
	                    benchmark_payload["nrows"] = n
	                    benchmark_payload["nfeatures"] = k
	                    benchmark_payload["nclasses"] = c
	                    benchmark_payload["weights"] = w
	                    benchmark_payload["resample_time"] = resample_timer.elapsed
	                    print(benchmark_payload)

	                    with open(outpath, "a") as fh:
	                        fh.write(json.dumps(benchmark_payload))
	                        fh.write("\n")

	                    time.sleep(0.5)