Skip to content

Instantly share code, notes, and snippets.

@oliverholworthy
Last active August 30, 2022 09:54
Show Gist options
  • Save oliverholworthy/db8dfca6e08d29fa6481f5f9d66c8049 to your computer and use it in GitHub Desktop.
Save oliverholworthy/db8dfca6e08d29fa6481f5f9d66c8049 to your computer and use it in GitHub Desktop.
XGBoost Example - Eval Metrics Discrepancy using DaskDeviceQuantileDMatrix
import dask_cudf
import cudf
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
import xgboost
import sklearn.datasets
# Create a local CUDA cluster and connect a distributed client to it; the
# client is handed to every xgboost.dask call further down in this script.
cluster = LocalCUDACluster()
client = Client(cluster)
# setup: dimensions of the synthetic binary-classification problem
n_classes = 2
n_features = 16
npartitions = 1  # partition count used for every dask_cudf collection
target_cols = ["label"]
# Feature columns are named after their stringified index: "0", "1", ...
feature_cols = list(map(str, range(n_features)))
# train data: 1000 synthetic samples wrapped as dask_cudf collections
train_spec = {
    "n_samples": 1000,
    "n_features": n_features,
    "n_classes": n_classes,
    "random_state": 0,
}
X_train, y_train = sklearn.datasets.make_classification(**train_spec)

# Build the single-GPU frames first, then partition them for Dask.
train_features_gdf = cudf.DataFrame(X_train, columns=feature_cols)
train_labels_gdf = cudf.DataFrame(y_train, columns=target_cols)
X_train_df = dask_cudf.from_cudf(train_features_gdf, npartitions=npartitions)
y_train_df = dask_cudf.from_cudf(train_labels_gdf, npartitions=npartitions)

# Training matrix used by both train() calls in this script.
dtrain_dask_quantile = xgboost.dask.DaskDeviceQuantileDMatrix(
    client,
    X_train_df,
    label=y_train_df,
)
# valid data: 100 synthetic samples drawn with a different random seed
X_valid, y_valid = sklearn.datasets.make_classification(
    random_state=1,
    n_classes=n_classes,
    n_features=n_features,
    n_samples=100,
)
valid_features_gdf = cudf.DataFrame(X_valid, columns=feature_cols)
valid_labels_gdf = cudf.DataFrame(y_valid, columns=target_cols)
X_valid_df = dask_cudf.from_cudf(valid_features_gdf, npartitions=npartitions)
y_valid_df = dask_cudf.from_cudf(valid_labels_gdf, npartitions=npartitions)

# The same validation set in three containers, so the metrics reported for
# each container type can be compared against one another.
dvalid_dask_quantile = xgboost.dask.DaskDeviceQuantileDMatrix(
    client,
    X_valid_df,
    label=y_valid_df,
)
dvalid_dask = xgboost.dask.DaskDMatrix(client, X_valid_df, label=y_valid_df)

# Materialise the Dask collections locally for the plain (non-Dask) DMatrix.
valid_features_local = X_valid_df.compute()
valid_labels_local = y_valid_df.compute()
dvalid = xgboost.DMatrix(valid_features_local, label=valid_labels_local)
# model train/eval
# Booster parameters shared by both training runs.
params = {
    "tree_method": "gpu_hist",
    "objective": "binary:logistic",
    "eval_metric": "auc",
}
# with 'quantile' dmatrix in evals
# Train for 10 rounds so the run matches the log recorded below (rounds
# [0]..[9]) and the second run further down; the final history entry is then
# the metric of the very model that booster.eval() scores afterwards.
result = xgboost.dask.train(
    client,
    params,
    dtrain_dask_quantile,
    evals=[(dvalid_dask_quantile, "valid")],
    num_boost_round=10,
)
# [0] valid-auc:0.61345
# [1] valid-auc:0.52681
# [2] valid-auc:0.52141
# [3] valid-auc:0.52521
# [4] valid-auc:0.51481
# [5] valid-auc:0.52061
# [6] valid-auc:0.52641
# [7] valid-auc:0.52561
# [8] valid-auc:0.52341
# [9] valid-auc:0.51721

# Re-score the trained booster on the same validation rows, held in a plain
# DMatrix instead of the quantile DMatrix used during training.
result["booster"].eval(dvalid)
# => '[0]\teval-auc:0.51760704281712688'
# last metric in train history does not match result of booster eval on same dataset
# 0.51721 != 0.51760
# NOTE(review): the cause of the gap is not visible in this script; presumably
# the quantile DMatrix bins the validation features differently from a plain
# DMatrix -- confirm against the XGBoost documentation.
# with standard dmatrix in evals
# Same training matrix and round count; only the container holding the
# validation set differs (DaskDMatrix instead of the quantile variant).
result = xgboost.dask.train(
    client,
    params,
    dtrain_dask_quantile,
    num_boost_round=10,
    evals=[(dvalid_dask, "valid")],
)
# [0] valid-auc:0.61985
# [1] valid-auc:0.52681
# [2] valid-auc:0.52141
# [3] valid-auc:0.52521
# [4] valid-auc:0.51521
# [5] valid-auc:0.52221
# [6] valid-auc:0.52921
# [7] valid-auc:0.52721
# [8] valid-auc:0.52461
# [9] valid-auc:0.51761

# Score the trained booster on the plain DMatrix copy of the validation set.
trained_booster = result["booster"]
trained_booster.eval(dvalid)
# => '[0]\teval-auc:0.51760704281712688'
# matches
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment