# qtangs/sagemaker_fraud_detection.py

## sagemaker_fraud_detection.py
# i. Download sample data and extract features and label (fraud/nonfraud).
...

# ii. Convert the n-dimensional arrays into RecordIO format (a highly efficient data format).
import sagemaker.amazon.common as smac
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, features, labels)
...

# iii. Store the RecordIO data in an S3 bucket.
bucket = "fraud-detection-end-to-end-demo"
prefix = 'linear-learner'
key = 'recordio-pb-data'
# S3 object keys always use '/' as the separator, so build the key explicitly
# instead of with os.path.join (which emits '\' on Windows and would then no
# longer match the s3:// URI handed to the training job below).
train_key = '{}/train/{}'.format(prefix, key)
# upload_fileobj reads from the current stream position; after writing, that
# position is at the end of the buffer, so rewind to upload the whole payload
# (harmless if an earlier step has already rewound it).
buf.seek(0)
boto3.resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(buf)
...

# iv. Retrieve the Docker image for the linear learner algorithm.
container = get_image_uri(boto3.Session().region_name, 'linear-learner')
...

# v. Create a training job with the desired instance type and instance count,
#    set the algorithm's hyperparameters, and start training on the data
#    uploaded to S3 in step iii. The training cluster is provisioned only for
#    the duration of the job, so you pay only for the time training takes.
import sagemaker
# Same bucket/key the upload above wrote to.
s3_train_data = 's3://{}/{}'.format(bucket, train_key)
output_location = 's3://{}/{}/output'.format(bucket, prefix)
linear = sagemaker.estimator.Estimator(container,
                                       get_execution_role(),
                                       train_instance_count=1,
                                       train_instance_type='ml.c4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=session)
# feature_dim is taken from the number of columns in the feature matrix.
linear.set_hyperparameters(feature_dim=features.shape[1],
                           predictor_type='binary_classifier',
                           mini_batch_size=200)
# The 'train' channel points the job at the RecordIO object in S3.
linear.fit({'train': s3_train_data})
	# i. Download sample data and extract features and label (fraud/nonfraud).
	...

	# ii. Convert the n-dimensional arrays into RecordIO format (a highly efficient data format).
	import sagemaker.amazon.common as smac
	buf = io.BytesIO()
	smac.write_numpy_to_dense_tensor(buf, features, labels)
	...

	# iii. Store the RecordIO data in an S3 bucket.
	bucket = "fraud-detection-end-to-end-demo"
	prefix = 'linear-learner'
	key = 'recordio-pb-data'
	# S3 object keys always use '/' as the separator, so build the key explicitly
	# instead of with os.path.join (which emits '\' on Windows and would then no
	# longer match the s3:// URI handed to the training job below).
	train_key = '{}/train/{}'.format(prefix, key)
	# upload_fileobj reads from the current stream position; after writing, that
	# position is at the end of the buffer, so rewind to upload the whole payload
	# (harmless if an earlier step has already rewound it).
	buf.seek(0)
	boto3.resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(buf)
	...

	# iv. Retrieve the Docker image for the linear learner algorithm.
	container = get_image_uri(boto3.Session().region_name, 'linear-learner')
	...

	# v. Create a training job with the desired instance type and instance count,
	#    set the algorithm's hyperparameters, and start training on the data
	#    uploaded to S3 in step iii. The training cluster is provisioned only for
	#    the duration of the job, so you pay only for the time training takes.
	import sagemaker
	# Same bucket/key the upload above wrote to.
	s3_train_data = 's3://{}/{}'.format(bucket, train_key)
	output_location = 's3://{}/{}/output'.format(bucket, prefix)
	linear = sagemaker.estimator.Estimator(
		container,
		get_execution_role(),
		train_instance_count=1,
		train_instance_type='ml.c4.xlarge',
		output_path=output_location,
		sagemaker_session=session)
	# feature_dim is taken from the number of columns in the feature matrix.
	linear.set_hyperparameters(
		feature_dim=features.shape[1],
		predictor_type='binary_classifier',
		mini_batch_size=200)
	# The 'train' channel points the job at the RecordIO object in S3.
	linear.fit({'train': s3_train_data})