Skip to content

Instantly share code, notes, and snippets.

@honzasterba
Created October 8, 2020 08:01
Show Gist options
  • Save honzasterba/79bfcb95e6c183479b45218a40b8af3b to your computer and use it in GitHub Desktop.
Reproducing NaN in metrics when using reg:gamma
import sys
sys.path.insert(1,"../h2o-3/h2o-py") # fix this to match your env
from tests import pyunit_utils
import importlib
import pandas as pd
import numpy as np
import xgboost as xgb
import h2o
from h2o.estimators import H2OXGBoostEstimator
# Initialise H2O and load the airlines sample data set.
h2o.init()
data = "https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/airlines_all.05p.csv"
df_full = h2o.import_file(data)
df = df_full  # alias, not a copy: the assignment below also changes df_full

# Build the response: rows with Year < 2000 become 0, the rest keep their year.
# NOTE(review): presumably these zero responses are what produces the NaN
# metrics under a gamma objective -- confirm.
df["Year"] = (df["Year"] < 2000).ifelse(0, df["Year"])
train, valid = df.split_frame([0.95], seed=1234)

y = "Year"
# Predictor columns whose H2O type is "enum"; the response column is excluded.
# Column order follows df.names.
col_types = df.types
enum_cols = [col for col in df.names if col != y and col_types[col] == "enum"]
# reproduce with xgboost - uses a lot of memory (90G)
# Convert the H2O training frame into an xgboost DMatrix, passing the
# categorical column names so the helper can treat them accordingly.
dtrain = pyunit_utils.convertH2OFrameToDMatrix(train, y, enumCols=enum_cols)

# Booster configuration using the gamma regression objective.
param = {
    "booster": "gbtree",
    "tree_method": "approx",
    "max_depth": 6,
    "objective": "reg:gamma",
    "lambda": 1.0,
    "gamma": 0.0,
    "nthread": 16,
}

# Train for a fixed number of rounds, evaluating on the training data itself.
num_rounds = 10
watchlist = [(dtrain, "train")]
bst = xgb.train(param, dtrain, num_boost_round=num_rounds, evals=watchlist)
# reproduce with H2O
# The estimator gets its own name: binding it to `xgb` would shadow the
# `xgboost as xgb` module import and break any re-run of the native
# xgboost section above.
h2o_model = H2OXGBoostEstimator(
    distribution="gamma",
    ntrees=100,
    score_tree_interval=5,
)
h2o_model.train(training_frame=train, validation_frame=valid, y=y)

# Print the validation metrics explicitly: a bare expression statement only
# displays its value in a REPL/notebook, not when the file runs as a script.
valid_perf = h2o_model.model_performance(valid)
print(valid_perf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment