Skip to content

Instantly share code, notes, and snippets.

@beckernick
Last active September 14, 2020 16:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save beckernick/22e0c117a96832ea2afab05484e30580 to your computer and use it in GitHub Desktop.
Save beckernick/22e0c117a96832ea2afab05484e30580 to your computer and use it in GitHub Desktop.
TPOT Timed Higgs Boson Experiments
import os
import time
import sys
import yaml
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_covtype
import cudf
from dask.utils import parse_bytes
# Switch cuDF to a pooled allocator with a 20GB initial pool. This runs before
# tpot and cuml are imported, as in the original statement order.
# NOTE(review): presumably so later GPU allocations are served from the pool
# rather than individual device allocations -- confirm against the cuDF/RMM docs.
cudf.set_allocator(pool=True, initial_pool_size=parse_bytes("20GB"))
from tpot import TPOTClassifier
import cuml
class Timer:
    """Context manager that measures how long its ``with`` body takes.

    Usage::

        with Timer() as t:
            do_work()
        print(t.elapsed)

    Attributes (all ``None`` until the corresponding event has happened):
        tick: Clock reading taken when the ``with`` block is entered.
        tock: Clock reading taken when the ``with`` block is exited.
        elapsed: ``tock - tick``, in seconds.

    The readings come from ``time.perf_counter()``, so ``tick``/``tock`` are
    only meaningful relative to each other (they are not epoch timestamps).
    """

    def __init__(self):
        # Define the attributes up front so reading them before or during the
        # ``with`` block yields None instead of raising AttributeError.
        self.tick = None
        self.tock = None
        self.elapsed = None

    def __enter__(self):
        # perf_counter is monotonic and high resolution; time.time() can jump
        # (NTP adjustments, manual clock changes) and distort a benchmark.
        self.tick = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        self.tock = time.perf_counter()
        self.elapsed = self.tock - self.tick
        # Implicitly returns None, so exceptions raised in the body propagate.
def get_highest_internal_cv(fitted_tpot):
    """Return the best ``internal_cv_score`` among the evaluated pipelines.

    Args:
        fitted_tpot: Object exposing ``evaluated_individuals_``, a mapping
            whose values are dicts that may contain ``'internal_cv_score'``.

    Returns:
        The largest recorded ``internal_cv_score``. Entries without that key
        are ignored. Returns 0 when no entry carries a score, which matches
        the previous behaviour for an empty mapping.
    """
    scores = [
        info["internal_cv_score"]
        for info in fitted_tpot.evaluated_individuals_.values()
        if "internal_cv_score" in info
    ]
    # Do not seed the maximum with 0: with a negated-loss metric every score
    # is negative, and a 0 seed would be reported as the (wrong) best score.
    return max(scores, default=0)
# The script takes exactly one argument: the path to a YAML benchmark config.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} <benchmark-config.yaml>")

benchmark_config_path = sys.argv[1]
with open(benchmark_config_path) as fp:
    # safe_load accepts an open stream, so there is no need to read() first.
    BENCHMARK_CONFIG = yaml.safe_load(fp)

# An empty file loads as None and a list/scalar document is not a mapping;
# every later lookup uses BENCHMARK_CONFIG.get(...), so fail early and clearly.
if not isinstance(BENCHMARK_CONFIG, dict):
    sys.exit(f"{benchmark_config_path}: expected a YAML mapping of benchmark settings")

# Directory the prepare_* loaders read their dataset files from. It can be
# overridden with an optional "data_directory" key in the config; the default
# is the original hard-coded location.
DATA_DIRECTORY = BENCHMARK_CONFIG.get("data_directory", "/raid/nicholasb")
def prepare_airlines(nrows=None):
    """Load the airline dataset and split it into features and label.

    Reads ``airline_20000000.parquet`` from ``DATA_DIRECTORY``.

    Args:
        nrows: Number of leading rows to keep. ``None`` keeps every row.

    Returns:
        Tuple ``(X, y)``: ``X`` is the DataFrame without the
        ``ArrDelayBinary`` column, ``y`` is that column cast to ``int32``.
    """
    # see https://transtats.bts.gov/Tables.asp?DB_ID=120&DB_Name=Airline%20On-Time%20Performance%20Data&DB_Short_Name=On-Time#
    # mirror: https://rapidsai-cloud-ml-sample-data.s3-us-west-2.amazonaws.com/airline_20000000.parquet
    dataset_path = os.path.join(DATA_DIRECTORY, "airline_20000000.parquet")
    # The whole file is read first; truncation to nrows happens afterwards.
    df = pd.read_parquet(dataset_path)
    # Only truncate when a row limit was requested, instead of relying on
    # DataFrame.head(None) happening to return the full frame.
    if nrows is not None:
        df = df.head(nrows)
    X = df.drop(columns=["ArrDelayBinary"])
    y = df["ArrDelayBinary"].astype("int32")
    return X, y
def prepare_higgs(nrows=None):
    """Load the HIGGS dataset as NumPy feature and label arrays.

    Reads ``HIGGS.csv.gz`` from ``DATA_DIRECTORY``.

    Args:
        nrows: Forwarded to ``pandas.read_csv``; ``None`` reads the whole file.

    Returns:
        Tuple ``(X, y)``: ``X`` holds every column but the first as
        ``float32``, ``y`` holds the first column as ``int64``.
    """
    # see https://github.com/NVIDIA/gbm-bench/blob/04f052febb95436762c67c59eaa33d6cd3ebcdbc/datasets.py#L176
    # NOTE(review): read_csv treats the first line as a header by default. If
    # the local HIGGS.csv.gz has no header row, one sample is consumed as
    # column names -- confirm against the file before changing.
    csv_path = os.path.join(DATA_DIRECTORY, "HIGGS.csv.gz")
    frame = pd.read_csv(csv_path, nrows=nrows)
    labels = frame.iloc[:, 0].to_numpy(dtype=np.int64)
    features = frame.iloc[:, 1:].to_numpy(dtype=np.float32)
    return features, labels
# Dataset name (the "dataset" key of the benchmark config) -> loader function.
dataset_mapping = dict(airline=prepare_airlines, higgs=prepare_higgs)

DATASET = BENCHMARK_CONFIG.get("dataset")
NROWS = BENCHMARK_CONFIG.get("nrows")

# A dataset name that is not in the mapping raises KeyError on this lookup.
_prepare_dataset = dataset_mapping[DATASET]
X, y = _prepare_dataset(nrows=NROWS)

print(f"X.shape: {X.shape}")
print(f"y.shape: {y.shape}")
# TPOT configuration
# Every setting comes from the benchmark config; a key that is absent yields
# None and is passed to TPOTClassifier unchanged.
seed = BENCHMARK_CONFIG.get("seed")
generations = BENCHMARK_CONFIG.get("generations")
pop_size = BENCHMARK_CONFIG.get("population_size")
cv = BENCHMARK_CONFIG.get("cv")
max_time_mins = BENCHMARK_CONFIG.get("max_time_mins")

# Short names accepted in the benchmark config -> TPOT ``config_dict`` values.
config_mapping = {
    "Default": None,
    "cuML": "TPOT cuML",
    "Light": "TPOT Light",
}
# A "config_dict" value outside the mapping (or a missing key) raises KeyError.
config_dict = config_mapping[BENCHMARK_CONFIG.get("config_dict")]

# The cuML configuration runs with a single job; the others use n_jobs=-1.
njobs = 1 if config_dict == "TPOT cuML" else -1

# Honour the configured verbosity. It was previously read but then ignored in
# favour of a hard-coded 2, which remains the fallback when the key is unset.
verbosity = BENCHMARK_CONFIG.get("verbosity")
if verbosity is None:
    verbosity = 2
scoring_metric = BENCHMARK_CONFIG.get("scoring_metric")

tpot = TPOTClassifier(
    generations=generations,
    population_size=pop_size,
    random_state=seed,
    config_dict=config_dict,
    n_jobs=njobs,
    max_time_mins=max_time_mins,
    cv=cv,
    scoring=scoring_metric,
    verbosity=verbosity,
)
# Time the pipeline search / fit on the full loaded dataset.
with Timer() as fit_time:
    tpot.fit(X, y)

# Time prediction on the same data that was used for fitting; only the
# elapsed time is recorded afterwards.
with Timer() as predict_time:
    preds = tpot.predict(X)
# Result record: the input config plus what was measured in this run. Keys
# listed after the unpacked config override same-named config entries.
benchmark_payload = {
    **BENCHMARK_CONFIG,
    "n_jobs": njobs,
    "evaluated_pipelines": len(tpot.evaluated_individuals_),
    "fit_time": fit_time.elapsed,
    "predict_time": predict_time.elapsed,
    "best_cv_score": get_highest_internal_cv(tpot),
    "fitted_pipeline": str(tpot.fitted_pipeline_),
    "exported_pipeline": tpot.export(),
    "cuml_version": cuml.__version__,
}

# Append one JSON document per line so repeated runs accumulate in one file.
outpath = "tpot-benchmark-all-results.txt"
with open(outpath, "a") as fh:
    fh.write(json.dumps(benchmark_payload) + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment