# Code for training an XGBoost estimator in SageMaker (gist by @patrickbrus, January 10, 2021)
import os

import pandas as pd

import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import ContinuousParameter, HyperparameterTuner, IntegerParameter
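# --- Assumed setup, not part of the original gist ---
# The variables below (sagemaker_session, region, role, bucket, the S3 input
# URIs and the local data_dir) are used later but were defined elsewhere in the
# notebook. A minimal sketch with placeholder S3 paths:
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()  # assumes this runs inside a SageMaker notebook
bucket = sagemaker_session.default_bucket()

# hypothetical S3 URIs of the pre-uploaded CSV splits (no header; label in the
# first column for train/validation, features only for test)
train_input = "s3://{}/retail_data_analytics/train.csv".format(bucket)
val_input = "s3://{}/retail_data_analytics/validation.csv".format(bucket)
test_input = "s3://{}/retail_data_analytics/test.csv".format(bucket)
data_dir = "data"  # local directory for downloaded predictions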
# Define exploration boundaries (default suggested values from the Amazon SageMaker documentation)
hyperparameter_ranges = {
    "alpha": ContinuousParameter(0, 1000, scaling_type="Auto"),
    "eta": ContinuousParameter(0.1, 0.5, scaling_type="Logarithmic"),
    "gamma": ContinuousParameter(0, 5, scaling_type="Auto"),
    "lambda": ContinuousParameter(0, 100, scaling_type="Auto"),
    "max_delta_step": IntegerParameter(0, 10, scaling_type="Auto"),
    "max_depth": IntegerParameter(0, 10, scaling_type="Auto"),
    "min_child_weight": ContinuousParameter(0, 10, scaling_type="Auto"),
    "subsample": ContinuousParameter(0.5, 1, scaling_type="Logarithmic"),
    "num_round": IntegerParameter(50, 1000, scaling_type="Auto"),
}
container = sagemaker.image_uris.retrieve("xgboost", region=region, version="1.0-1")
prefix = "retail_data_analytics"

xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sagemaker_session,
)
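# Not in the original gist: static (non-tuned) hyperparameters are typically set
# on the estimator before tuning. A sketch, assuming a squared-error regression
# objective (supported by the XGBoost 1.0-1 container):
xgb.set_hyperparameters(objective="reg:squarederror")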
tuner_log = HyperparameterTuner(
    xgb,
    objective_metric_name="validation:rmse",
    objective_type="Minimize",
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=4,
    strategy="Bayesian",
)
# prepare input data for training
train_channel = TrainingInput(train_input, content_type="text/csv")
valid_channel = TrainingInput(val_input, content_type="text/csv")
data_channels = {"train": train_channel, "validation": valid_channel}

# start the hyperparameter tuning job and wait for it to finish,
# since best_training_job() below requires a completed tuning job
tuner_log.fit(inputs=data_channels, logs=True)
tuner_log.wait()
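# Not in the original gist: after tuning completes, the per-job results can be
# inspected as a pandas DataFrame through the tuner's analytics object:
tuning_results = tuner_log.analytics().dataframe()
print(tuning_results.sort_values("FinalObjectiveValue").head())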
# Create a new estimator object attached to the best training job found during hyperparameter tuning
xgb_attached = sagemaker.estimator.Estimator.attach(tuner_log.best_training_job())
xgb_transformer = xgb_attached.transformer(instance_count=1, instance_type="ml.m4.xlarge")

# Start the transform job. Make sure to specify the content type and the split type of the test data.
xgb_transformer.transform(test_input, content_type="text/csv", split_type="Line")
xgb_transformer.wait()  # block until the transform job has finished before downloading its output
# copy the predictions file from s3 to this notebook
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir
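# Assumed, not in the original gist: test_y_true and normalization_factor are
# defined elsewhere in the notebook. A hypothetical sketch, assuming the ground
# truth was saved locally as test_y.csv (a made-up file name) and that the RMSE
# is normalized by the range of the labels:
test_y_true = pd.read_csv(os.path.join(data_dir, "test_y.csv"), header=None)[0]
normalization_factor = test_y_true.max() - test_y_true.min()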
# compute the RMSE and the R^2 score on the test set
from sklearn.metrics import mean_squared_error, r2_score

test_y_preds = pd.read_csv(os.path.join(data_dir, "test.csv.out"), header=None)

rmse_xgboost = mean_squared_error(test_y_true, test_y_preds, squared=False)  # squared=False -> RMSE
r2_xgboost = r2_score(test_y_true, test_y_preds)

print(f"Normalized RMSE: {rmse_xgboost / normalization_factor}")
print(f"R-Squared Score: {r2_xgboost}")