# honzasterba/script.py

## script.py
import sys
sys.path.insert(1,"../h2o-3/h2o-py") # fix this to match your env
from tests import pyunit_utils

import importlib
import pandas as pd
import numpy as np
import xgboost as xgb

import h2o
from h2o.estimators import H2OXGBoostEstimator

# Connect to an H2O cluster (h2o.init() starts a local one if none is reachable).
h2o.init()

data = "https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/airlines_all.05p.csv"
df_full = h2o.import_file(data)
# `df` is the same frame object as `df_full` (no copy is made), so the column
# rewrite below is visible through both names.
df = df_full
# Replace every Year earlier than 2000 with 0; later years are kept unchanged.
# NOTE(review): this puts zeros into the target column - confirm that the gamma
# objective/distribution used below is meant to be fed zero-valued targets.
df["Year"] = (df["Year"] < 2000).ifelse(0, df["Year"])
# Roughly 95% / 5% train/validation split; the fixed seed keeps it repeatable.
train, valid = df.split_frame([0.95], seed=1234)

y = "Year"
# Categorical ("enum") predictor columns, in frame order, excluding the target.
col_types = df.types  # read once instead of on every iteration
enum_cols = [
    col for col in df.names
    if col != y and col_types[col] == "enum"
]

# reproduce with xgboost - uses a lot of memory (90G)
dtrain = pyunit_utils.convertH2OFrameToDMatrix(train, y, enumCols=enum_cols)
param = {
    "booster": "gbtree",
    "tree_method": "approx",
    "max_depth": 6,
    "objective": "reg:gamma",
    "lambda": 1.0,
    "gamma": 0.0,
    "nthread": 16,
}
# Keyword arguments: newer XGBoost releases accept `evals` by keyword only.
bst = xgb.train(param, dtrain, num_boost_round=10, evals=[(dtrain, "train")])

# reproduce with H2O
# Bound to its own name so the `xgb` alias of the xgboost module is not
# overwritten by the estimator (which would break any later `xgb.train(...)`).
h2o_xgb = H2OXGBoostEstimator(
    distribution="gamma",
    ntrees=100,
    score_tree_interval=5,
)
h2o_xgb.train(training_frame=train, validation_frame=valid, y=y)
h2o_xgb.model_performance(valid)
	import sys
	sys.path.insert(1,"../h2o-3/h2o-py") # fix this to match your env
	from tests import pyunit_utils

	import importlib
	import pandas as pd
	import numpy as np
	import xgboost as xgb

	import h2o
	from h2o.estimators import H2OXGBoostEstimator

	# Connect to an H2O cluster (h2o.init() starts a local one if none is reachable).
	h2o.init()

	data = "https://s3.amazonaws.com/h2o-public-test-data/bigdata/laptop/airlines_all.05p.csv"
	df_full = h2o.import_file(data)
	# `df` is the same frame object as `df_full` (no copy is made), so the column
	# rewrite below is visible through both names.
	df = df_full
	# Replace every Year earlier than 2000 with 0; later years are kept unchanged.
	# NOTE(review): this puts zeros into the target column - confirm that the gamma
	# objective/distribution used below is meant to be fed zero-valued targets.
	df["Year"] = (df["Year"] < 2000).ifelse(0, df["Year"])
	# Roughly 95% / 5% train/validation split; the fixed seed keeps it repeatable.
	train, valid = df.split_frame([0.95], seed=1234)

	y = "Year"
	# Categorical ("enum") predictor columns, in frame order, excluding the target.
	col_types = df.types  # read once instead of on every iteration
	enum_cols = [
	    col for col in df.names
	    if col != y and col_types[col] == "enum"
	]

	# reproduce with xgboost - uses a lot of memory (90G)
	dtrain = pyunit_utils.convertH2OFrameToDMatrix(train, y, enumCols=enum_cols)
	param = {
	    "booster": "gbtree",
	    "tree_method": "approx",
	    "max_depth": 6,
	    "objective": "reg:gamma",
	    "lambda": 1.0,
	    "gamma": 0.0,
	    "nthread": 16,
	}
	# Keyword arguments: newer XGBoost releases accept `evals` by keyword only.
	bst = xgb.train(param, dtrain, num_boost_round=10, evals=[(dtrain, "train")])

	# reproduce with H2O
	# Bound to its own name so the `xgb` alias of the xgboost module is not
	# overwritten by the estimator (which would break any later `xgb.train(...)`).
	h2o_xgb = H2OXGBoostEstimator(
	    distribution="gamma",
	    ntrees=100,
	    score_tree_interval=5,
	)
	h2o_xgb.train(training_frame=train, validation_frame=valid, y=y)
	h2o_xgb.model_performance(valid)