# beckernick/tpot-timed-experiments.py (secret gist)

## tpot-timed-experiments.py
import os
import time
import sys
import yaml
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_covtype

import cudf
from dask.utils import parse_bytes

# Configure cuDF's allocator to use a memory pool with an initial size of
# 20GB. This call is placed ahead of the `tpot` / `cuml` imports that follow.
# NOTE(review): presumably the pool has to be set up before cuML touches GPU
# memory — confirm before reordering this call relative to those imports.
cudf.set_allocator(
    pool=True,
    initial_pool_size=parse_bytes("20GB")
)

from tpot import TPOTClassifier
import cuml


class Timer:
    """Context manager that measures how long its ``with`` body takes.

    After the block exits, ``elapsed`` holds the duration in seconds, while
    ``tick`` and ``tock`` hold the raw start and end clock readings.
    Exceptions raised inside the block are not suppressed.
    """

    def __enter__(self):
        # perf_counter() is monotonic and high resolution, so the measured
        # interval cannot be skewed by system clock adjustments the way a
        # difference of time.time() readings can.
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick


def get_highest_internal_cv(fitted_tpot):
    """Return the best internal cross-validation score that was recorded.

    Parameters
    ----------
    fitted_tpot
        Object exposing ``evaluated_individuals_``, a mapping whose values
        are dicts that may contain an ``'internal_cv_score'`` entry.

    Returns
    -------
    The largest finite ``'internal_cv_score'`` found, or ``0`` when no
    individual carries a finite score.
    """
    import math

    # Take the maximum over the scores themselves instead of seeding it with
    # 0: with error-style scorers (e.g. "neg_log_loss") every score is
    # negative, and a 0 seed would be reported as the best score although no
    # pipeline reached it. Non-finite entries (-inf / NaN) are skipped so a
    # placeholder can never be reported as the best score.
    finite_scores = [
        score
        for score in (
            info.get("internal_cv_score")
            for info in fitted_tpot.evaluated_individuals_.values()
        )
        if score is not None and math.isfinite(score)
    ]
    return max(finite_scores, default=0)

# The benchmark is driven by a YAML file whose path is the first CLI argument.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of a bare IndexError on sys.argv[1].
    raise SystemExit(f"usage: {sys.argv[0]} <benchmark-config.yaml>")

benchmark_config_path = sys.argv[1]
# The name used to be misspelled; keep the old spelling as an alias in case
# anything else still refers to it.
benchamrk_config_path = benchmark_config_path

with open(benchmark_config_path) as fp:
    # safe_load accepts the open stream directly, so no intermediate read().
    BENCHMARK_CONFIG = yaml.safe_load(fp)


# Root directory that holds the benchmark datasets. Defaults to the original
# hard-coded location; set TPOT_BENCHMARK_DATA_DIR to run on another machine.
DATA_DIRECTORY = os.environ.get("TPOT_BENCHMARK_DATA_DIR", "/raid/nicholasb")


def prepare_airlines(nrows=None):
    """Load the airline on-time dataset as a feature frame and a label series.

    Parameters
    ----------
    nrows : int or None
        Keep only the first ``nrows`` rows. ``None`` (the default) keeps
        every row.

    Returns
    -------
    (X, y)
        ``X`` is the DataFrame without the ``"ArrDelayBinary"`` column and
        ``y`` is that column cast to ``int32``.
    """
    # see https://transtats.bts.gov/Tables.asp?DB_ID=120&DB_Name=Airline%20On-Time%20Performance%20Data&DB_Short_Name=On-Time#
    # mirror: https://rapidsai-cloud-ml-sample-data.s3-us-west-2.amazonaws.com/airline_20000000.parquet
    DATASET_PATH = os.path.join(DATA_DIRECTORY, "airline_20000000.parquet")
    df = pd.read_parquet(DATASET_PATH)
    # DataFrame.head() is documented for an integer row count only, so skip
    # the truncation explicitly rather than relying on head(None).
    if nrows is not None:
        df = df.head(nrows)

    X = df.drop(columns=["ArrDelayBinary"])
    y = df["ArrDelayBinary"].astype("int32")
    return X, y

def prepare_higgs(nrows=None):
    """Load the HIGGS dataset as NumPy feature and label arrays.

    Reads ``HIGGS.csv.gz`` from ``DATA_DIRECTORY``, limited to ``nrows`` rows
    when given. The first column becomes ``y`` (``int64``) and all remaining
    columns become ``X`` (``float32``).
    """
    # see https://github.com/NVIDIA/gbm-bench/blob/04f052febb95436762c67c59eaa33d6cd3ebcdbc/datasets.py#L176
    csv_path = os.path.join(DATA_DIRECTORY, "HIGGS.csv.gz")
    frame = pd.read_csv(csv_path, nrows=nrows)
    # NOTE(review): read_csv runs with its default header handling, so the
    # first line of the file is consumed as column names — confirm the local
    # file really has a header row.
    feature_columns = frame.iloc[:, 1:]
    label_column = frame.iloc[:, 0]
    X = feature_columns.to_numpy(dtype=np.float32)
    y = label_column.to_numpy(dtype=np.int64)
    return X, y

# Loader to run for each value of the config's "dataset" key.
dataset_mapping = dict(
    airline=prepare_airlines,
    higgs=prepare_higgs,
)


# Both settings come straight from the YAML config; .get() yields None for a
# key the file does not define.
DATASET = BENCHMARK_CONFIG.get("dataset")
NROWS = BENCHMARK_CONFIG.get("nrows")

X, y = dataset_mapping[DATASET](nrows=NROWS)

print(f"X.shape: {X.shape}", f"y.shape: {y.shape}", sep="\n")

# TPOT configuration, read from the YAML config (missing keys become None).
seed = BENCHMARK_CONFIG.get("seed")
generations = BENCHMARK_CONFIG.get("generations")
pop_size = BENCHMARK_CONFIG.get("population_size")
cv = BENCHMARK_CONFIG.get("cv")
max_time_mins = BENCHMARK_CONFIG.get("max_time_mins")

# Maps the short names used in the YAML file to the values passed to
# TPOTClassifier as `config_dict`.
config_mapping = {
    "Default": None,
    "cuML": "TPOT cuML",
    "Light": "TPOT Light",
}

config_dict = config_mapping[BENCHMARK_CONFIG.get("config_dict")]
# The cuML configuration runs with a single job; every other one uses -1.
njobs = 1 if config_dict == "TPOT cuML" else -1
verbosity = BENCHMARK_CONFIG.get("verbosity")
if verbosity is None:
    # Fall back to 2, the level that used to be hard-coded, when the config
    # does not set "verbosity".
    verbosity = 2
scoring_metric = BENCHMARK_CONFIG.get("scoring_metric")


tpot = TPOTClassifier(
    generations=generations,
    population_size=pop_size,
    random_state=seed,
    config_dict=config_dict,
    n_jobs=njobs,
    max_time_mins=max_time_mins,
    cv=cv,
    scoring=scoring_metric,
    # Honour the configured verbosity: it used to be read from the config and
    # then ignored in favour of a literal 2.
    verbosity=verbosity,
)

# Time the pipeline search.
with Timer() as fit_time:
    tpot.fit(X, y)


# Time prediction over the same rows that were used for fitting.
with Timer() as predict_time:
    preds = tpot.predict(X)

# One result record: the input config plus everything measured in this run.
benchmark_payload = BENCHMARK_CONFIG.copy()
benchmark_payload.update(
    {
        "n_jobs": njobs,
        "evaluated_pipelines": len(tpot.evaluated_individuals_),
        "fit_time": fit_time.elapsed,
        "predict_time": predict_time.elapsed,
        "best_cv_score": get_highest_internal_cv(tpot),
        "fitted_pipeline": str(tpot.fitted_pipeline_),
        "exported_pipeline": tpot.export(),
        "cuml_version": cuml.__version__,
    }
)

# Append the record as a single JSON line so repeated runs accumulate in one
# file.
outpath = "tpot-benchmark-all-results.txt"
with open(outpath, "a") as fh:
    fh.write(json.dumps(benchmark_payload) + "\n")
	import os
	import time
	import sys
	import yaml
	import json
	import pandas as pd
	import numpy as np
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score
	from sklearn.datasets import fetch_covtype

	import cudf
	from dask.utils import parse_bytes

	# Configure cuDF's allocator to use a memory pool with an initial size of
	# 20GB. This call is placed ahead of the `tpot` / `cuml` imports that follow.
	# NOTE(review): presumably the pool has to be set up before cuML touches GPU
	# memory — confirm before reordering this call relative to those imports.
	cudf.set_allocator(
	pool=True,
	initial_pool_size=parse_bytes("20GB")
	)

	from tpot import TPOTClassifier
	import cuml


	class Timer:
	    """Context manager that measures how long its ``with`` body takes.

	    After the block exits, ``elapsed`` holds the duration in seconds, while
	    ``tick`` and ``tock`` hold the raw start and end clock readings.
	    Exceptions raised inside the block are not suppressed.
	    """

	    def __enter__(self):
	        # perf_counter() is monotonic and high resolution, so the measured
	        # interval cannot be skewed by system clock adjustments the way a
	        # difference of time.time() readings can.
	        self.tick = time.perf_counter()
	        return self

	    def __exit__(self, *args, **kwargs):
	        self.tock = time.perf_counter()
	        self.elapsed = self.tock - self.tick


	def get_highest_internal_cv(fitted_tpot):
	    """Return the best internal cross-validation score that was recorded.

	    Parameters
	    ----------
	    fitted_tpot
	        Object exposing ``evaluated_individuals_``, a mapping whose values
	        are dicts that may contain an ``'internal_cv_score'`` entry.

	    Returns
	    -------
	    The largest finite ``'internal_cv_score'`` found, or ``0`` when no
	    individual carries a finite score.
	    """
	    import math

	    # Take the maximum over the scores themselves instead of seeding it with
	    # 0: with error-style scorers (e.g. "neg_log_loss") every score is
	    # negative, and a 0 seed would be reported as the best score although no
	    # pipeline reached it. Non-finite entries (-inf / NaN) are skipped so a
	    # placeholder can never be reported as the best score.
	    finite_scores = [
	        score
	        for score in (
	            info.get("internal_cv_score")
	            for info in fitted_tpot.evaluated_individuals_.values()
	        )
	        if score is not None and math.isfinite(score)
	    ]
	    return max(finite_scores, default=0)

	# The benchmark is driven by a YAML file whose path is the first CLI argument.
	if len(sys.argv) < 2:
	    # Exit with a usage hint instead of a bare IndexError on sys.argv[1].
	    raise SystemExit(f"usage: {sys.argv[0]} <benchmark-config.yaml>")

	benchmark_config_path = sys.argv[1]
	# The name used to be misspelled; keep the old spelling as an alias in case
	# anything else still refers to it.
	benchamrk_config_path = benchmark_config_path

	with open(benchmark_config_path) as fp:
	    # safe_load accepts the open stream directly, so no intermediate read().
	    BENCHMARK_CONFIG = yaml.safe_load(fp)


	# Root directory that holds the benchmark datasets. Defaults to the original
	# hard-coded location; set TPOT_BENCHMARK_DATA_DIR to run on another machine.
	DATA_DIRECTORY = os.environ.get("TPOT_BENCHMARK_DATA_DIR", "/raid/nicholasb")


	def prepare_airlines(nrows=None):
	    """Load the airline on-time dataset as a feature frame and a label series.

	    Parameters
	    ----------
	    nrows : int or None
	        Keep only the first ``nrows`` rows. ``None`` (the default) keeps
	        every row.

	    Returns
	    -------
	    (X, y)
	        ``X`` is the DataFrame without the ``"ArrDelayBinary"`` column and
	        ``y`` is that column cast to ``int32``.
	    """
	    # see https://transtats.bts.gov/Tables.asp?DB_ID=120&DB_Name=Airline%20On-Time%20Performance%20Data&DB_Short_Name=On-Time#
	    # mirror: https://rapidsai-cloud-ml-sample-data.s3-us-west-2.amazonaws.com/airline_20000000.parquet
	    DATASET_PATH = os.path.join(DATA_DIRECTORY, "airline_20000000.parquet")
	    df = pd.read_parquet(DATASET_PATH)
	    # DataFrame.head() is documented for an integer row count only, so skip
	    # the truncation explicitly rather than relying on head(None).
	    if nrows is not None:
	        df = df.head(nrows)

	    X = df.drop(columns=["ArrDelayBinary"])
	    y = df["ArrDelayBinary"].astype("int32")
	    return X, y

	def prepare_higgs(nrows=None):
	    """Load the HIGGS dataset as NumPy feature and label arrays.

	    Reads ``HIGGS.csv.gz`` from ``DATA_DIRECTORY``, limited to ``nrows`` rows
	    when given. The first column becomes ``y`` (``int64``) and all remaining
	    columns become ``X`` (``float32``).
	    """
	    # see https://github.com/NVIDIA/gbm-bench/blob/04f052febb95436762c67c59eaa33d6cd3ebcdbc/datasets.py#L176
	    DATASET_PATH = os.path.join(DATA_DIRECTORY, "HIGGS.csv.gz")
	    higgs = pd.read_csv(DATASET_PATH, nrows=nrows)
	    # NOTE(review): read_csv runs with its default header handling, so the
	    # first line of the file is consumed as column names — confirm the local
	    # file really has a header row.
	    X = higgs.iloc[:, 1:].to_numpy(dtype=np.float32)
	    y = higgs.iloc[:, 0].to_numpy(dtype=np.int64)
	    return X, y

	# Loader to run for each value of the config's "dataset" key.
	dataset_mapping = dict(
	    airline=prepare_airlines,
	    higgs=prepare_higgs,
	)


	# Both settings come straight from the YAML config; .get() yields None for a
	# key the file does not define.
	DATASET = BENCHMARK_CONFIG.get("dataset")
	NROWS = BENCHMARK_CONFIG.get("nrows")

	X, y = dataset_mapping[DATASET](nrows=NROWS)

	print(f"X.shape: {X.shape}", f"y.shape: {y.shape}", sep="\n")

	# TPOT configuration, read from the YAML config (missing keys become None).
	seed = BENCHMARK_CONFIG.get("seed")
	generations = BENCHMARK_CONFIG.get("generations")
	pop_size = BENCHMARK_CONFIG.get("population_size")
	cv = BENCHMARK_CONFIG.get("cv")
	max_time_mins = BENCHMARK_CONFIG.get("max_time_mins")

	# Maps the short names used in the YAML file to the values passed to
	# TPOTClassifier as `config_dict`.
	config_mapping = {
	    "Default": None,
	    "cuML": "TPOT cuML",
	    "Light": "TPOT Light",
	}

	config_dict = config_mapping[BENCHMARK_CONFIG.get("config_dict")]
	# The cuML configuration runs with a single job; every other one uses -1.
	njobs = 1 if config_dict == "TPOT cuML" else -1
	verbosity = BENCHMARK_CONFIG.get("verbosity")
	if verbosity is None:
	    # Fall back to 2, the level that used to be hard-coded, when the config
	    # does not set "verbosity".
	    verbosity = 2
	scoring_metric = BENCHMARK_CONFIG.get("scoring_metric")


	tpot = TPOTClassifier(
	    generations=generations,
	    population_size=pop_size,
	    random_state=seed,
	    config_dict=config_dict,
	    n_jobs=njobs,
	    max_time_mins=max_time_mins,
	    cv=cv,
	    scoring=scoring_metric,
	    # Honour the configured verbosity: it used to be read from the config and
	    # then ignored in favour of a literal 2.
	    verbosity=verbosity,
	)

	# Time the pipeline search.
	with Timer() as fit_time:
	    tpot.fit(X, y)


	# Time prediction over the same rows that were used for fitting.
	with Timer() as predict_time:
	    preds = tpot.predict(X)

	# One result record: the input config plus everything measured in this run.
	benchmark_payload = BENCHMARK_CONFIG.copy()
	benchmark_payload["n_jobs"] = njobs
	benchmark_payload["evaluated_pipelines"] = len(tpot.evaluated_individuals_)
	benchmark_payload["fit_time"] = fit_time.elapsed
	benchmark_payload["predict_time"] = predict_time.elapsed
	benchmark_payload["best_cv_score"] = get_highest_internal_cv(tpot)
	benchmark_payload["fitted_pipeline"] = str(tpot.fitted_pipeline_)
	benchmark_payload["exported_pipeline"] = tpot.export()
	benchmark_payload["cuml_version"] = cuml.__version__

	# Append the record as a single JSON line so repeated runs accumulate in one
	# file.
	outpath = "tpot-benchmark-all-results.txt"
	with open(outpath, "a") as fh:
	    fh.write(json.dumps(benchmark_payload))
	    fh.write("\n")