# Code for training an XGBoost estimator in SageMaker (gist by @patrickbrus, January 10, 2021)
import os

import pandas as pd

import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import ContinuousParameter, HyperparameterTuner, IntegerParameter
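# --- Assumed setup, not part of the original gist ---
# The variables below (sagemaker_session, region, role, bucket, the S3 input
# URIs and the local data_dir) are used later but were defined elsewhere in the
# notebook. A minimal sketch with placeholder S3 paths:
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()  # assumes this runs inside a SageMaker notebook
bucket = sagemaker_session.default_bucket()

# hypothetical S3 URIs of the pre-uploaded CSV splits (no header; label in the
# first column for train/validation, features only for test)
train_input = "s3://{}/retail_data_analytics/train.csv".format(bucket)
val_input = "s3://{}/retail_data_analytics/validation.csv".format(bucket)
test_input = "s3://{}/retail_data_analytics/test.csv".format(bucket)
data_dir = "data"  # local directory for downloaded predictions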
# Define exploration boundaries (default suggested values from the Amazon SageMaker documentation)
hyperparameter_ranges = {
    "alpha": ContinuousParameter(0, 1000, scaling_type="Auto"),
    "eta": ContinuousParameter(0.1, 0.5, scaling_type="Logarithmic"),
    "gamma": ContinuousParameter(0, 5, scaling_type="Auto"),
    "lambda": ContinuousParameter(0, 100, scaling_type="Auto"),
    "max_delta_step": IntegerParameter(0, 10, scaling_type="Auto"),
    "max_depth": IntegerParameter(0, 10, scaling_type="Auto"),
    "min_child_weight": ContinuousParameter(0, 10, scaling_type="Auto"),
    "subsample": ContinuousParameter(0.5, 1, scaling_type="Logarithmic"),
    "num_round": IntegerParameter(50, 1000, scaling_type="Auto"),
}
container = sagemaker.image_uris.retrieve("xgboost", region=region, version="1.0-1")
prefix = "retail_data_analytics"

xgb = sagemaker.estimator.Estimator(
    container,
    role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    output_path="s3://{}/{}/output".format(bucket, prefix),
    sagemaker_session=sagemaker_session,
)
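# Not in the original gist: static (non-tuned) hyperparameters are typically set
# on the estimator before tuning. A sketch, assuming a squared-error regression
# objective (supported by the XGBoost 1.0-1 container):
xgb.set_hyperparameters(objective="reg:squarederror")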
tuner_log = HyperparameterTuner(
    xgb,
    objective_metric_name="validation:rmse",
    objective_type="Minimize",
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=4,
    strategy="Bayesian",
)
# prepare input data for training
train_channel = TrainingInput(train_input, content_type="text/csv")
valid_channel = TrainingInput(val_input, content_type="text/csv")
data_channels = {"train": train_channel, "validation": valid_channel}

# start the hyperparameter tuning job and wait for it to finish,
# since best_training_job() below requires a completed tuning job
tuner_log.fit(inputs=data_channels, logs=True)
tuner_log.wait()
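# Not in the original gist: after tuning completes, the per-job results can be
# inspected as a pandas DataFrame through the tuner's analytics object:
tuning_results = tuner_log.analytics().dataframe()
print(tuning_results.sort_values("FinalObjectiveValue").head())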
# Create a new estimator object attached to the best training job found during hyperparameter tuning
xgb_attached = sagemaker.estimator.Estimator.attach(tuner_log.best_training_job())
xgb_transformer = xgb_attached.transformer(instance_count=1, instance_type="ml.m4.xlarge")

# Start the transform job. Make sure to specify the content type and the split type of the test data.
xgb_transformer.transform(test_input, content_type="text/csv", split_type="Line")
xgb_transformer.wait()  # block until the transform job has finished before downloading its output
# copy the predictions file from s3 to this notebook
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir
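# Assumed, not in the original gist: test_y_true and normalization_factor are
# defined elsewhere in the notebook. A hypothetical sketch, assuming the ground
# truth was saved locally as test_y.csv (a made-up file name) and that the RMSE
# is normalized by the range of the labels:
test_y_true = pd.read_csv(os.path.join(data_dir, "test_y.csv"), header=None)[0]
normalization_factor = test_y_true.max() - test_y_true.min()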
# compute the RMSE and the R^2 score on the test set
from sklearn.metrics import mean_squared_error, r2_score

test_y_preds = pd.read_csv(os.path.join(data_dir, "test.csv.out"), header=None)

rmse_xgboost = mean_squared_error(test_y_true, test_y_preds, squared=False)  # squared=False -> RMSE
r2_xgboost = r2_score(test_y_true, test_y_preds)

print(f"Normalized RMSE: {rmse_xgboost / normalization_factor}")
print(f"R-Squared Score: {r2_xgboost}")