Skip to content

Instantly share code, notes, and snippets.

@jpbarto
Created June 20, 2019 08:05
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jpbarto/80807ca9c84057af561af19ce433a56b to your computer and use it in GitHub Desktop.
Save jpbarto/80807ca9c84057af561af19ce433a56b to your computer and use it in GitHub Desktop.
Simple SKLearn script to transform some inputs using the SKLearn Estimator on Amazon SageMaker
import argparse
import json
import os
import pickle
from io import StringIO

import numpy as np
import pandas as pd
from sagemaker_containers.beta.framework import (
    content_types, encoders, env, modules, transformer, worker)
from sklearn import preprocessing
# Input feature columns of the abalone dataset, in file order.
feature_columns_names = [
    'sex',             # M, F, and I (infant)
    'length',          # Longest shell measurement
    'diameter',        # perpendicular to length
    'height',          # with meat in shell
    'whole_weight',    # whole abalone
    'shucked_weight',  # weight of meat
    'viscera_weight',  # gut weight (after bleeding)
    'shell_weight',    # after being dried
]

# Target column.
label_column = 'rings'

# 'sex' is read as a string; every other feature is a 64-bit float.
feature_columns_dtype = {
    name: (str if name == 'sex' else np.float64)
    for name in feature_columns_names
}

label_column_dtype = {label_column: np.float64}  # +1.5 gives the age in years
def merge_two_dicts(x, y):
    """Return a new dict with the entries of ``x`` overlaid by those of ``y``.

    Neither argument is modified. When a key is present in both, the value
    from ``y`` wins.

    Args:
        x: Base dict; it is shallow-copied.
        y: Dict (or anything ``dict.update`` accepts) whose entries take
            precedence.

    Returns:
        The merged dict.
    """
    z = x.copy()  # shallow copy, so the caller's ``x`` is left untouched
    z.update(y)   # update() mutates ``z`` in place; its own return value is None
    return z
# 'sex' is the only categorical feature; every other feature column is numeric.
cat_features = ['sex']
num_features = [name for name in feature_columns_names if name != 'sex']
numeric_features = list(num_features)  # separate list object with the same contents
####
# Batch transformation / inference code
####
def model_fn(model_dir):
    """Load the fitted preprocessing objects from ``model.pkl``.

    The pickle is expected to hold a two-element sequence
    ``[scaler, encoder]``, which is how the training section of this
    script writes it.

    Args:
        model_dir: Directory that contains ``model.pkl``.

    Returns:
        A dict with the keys ``'encoder'`` and ``'scaler'``.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only point this at model artifacts from a trusted source.
    with open(os.path.join(model_dir, "model.pkl"), 'rb') as f:
        scaler, encoder = pickle.load(f)
    return {'encoder': encoder, 'scaler': scaler}
def input_fn(input_data, content_type):
    """Parse a raw CSV payload into a feature DataFrame.

    The payload is assumed to contain no header row, no label column and
    exactly one value per entry of ``feature_columns_names``.

    Args:
        input_data: CSV text. ``bytes``/``bytearray`` payloads are decoded
            as UTF-8 first.
        content_type: MIME type of the payload; only ``'text/csv'`` is
            supported.

    Returns:
        A DataFrame whose columns are ``feature_columns_names``.

    Raises:
        ValueError: If ``content_type`` is not ``'text/csv'``.
    """
    if content_type != 'text/csv':
        raise ValueError("{} not supported by script!".format(content_type))

    if isinstance(input_data, (bytes, bytearray)):
        # StringIO only accepts text
        input_data = input_data.decode('utf-8')

    # Read the raw input data as CSV.
    df = pd.read_csv(StringIO(input_data), header=None)
    df.columns = feature_columns_names
    print("Processing CSV input")
    # info() writes its summary to stdout itself and returns None, so it is
    # called directly rather than wrapped in print().
    df.info()
    return df
def predict_fn(input_data, model):
    """Apply the fitted scaler and encoder to a feature DataFrame.

    Args:
        input_data: DataFrame holding the ``numeric_features`` columns and a
            ``'sex'`` column.
        model: Dict with the fitted transformers under ``'scaler'`` and
            ``'encoder'``.

    Returns:
        A new DataFrame with the scaled numeric columns followed by the
        encoded ``'sex'`` column.
    """
    scaler, encoder = model['scaler'], model['encoder']

    # Numeric columns go through the scaler and form the base of the result.
    result = pd.DataFrame(
        scaler.transform(input_data[numeric_features]),
        columns=numeric_features,
    )
    # The encoded categorical column is appended last.
    result['sex'] = encoder.transform(input_data['sex'])

    print("Processed records with shape {}".format(result.shape))
    return result
def output_fn(prediction, accept):
    """Serialize the transformed DataFrame into a response.

    Args:
        prediction: DataFrame of transformed features.
        accept: Requested response MIME type, either ``"application/json"``
            or ``'text/csv'``.

    Returns:
        A ``worker.Response`` whose mimetype is ``accept``. For JSON the body
        has the shape ``{"instances": [{"features": [...]}, ...]}`` with one
        entry per row.

    Raises:
        RuntimeError: If ``accept`` is neither of the supported types.
    """
    if accept == "application/json":
        # Rows of ``prediction.values`` are numpy arrays, which json.dumps
        # cannot serialize; tolist() converts them to plain Python lists.
        instances = [{"features": row.tolist()} for row in prediction.values]
        json_output = {"instances": instances}
        return worker.Response(json.dumps(json_output), mimetype=accept)
    if accept == 'text/csv':
        return worker.Response(encoders.encode(prediction, accept), mimetype=accept)
    raise RuntimeError("{} accept type is not supported by this script.".format(accept))
####
# Training job code
####
# executed as __main__ if performing a training job
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # SageMaker-specific arguments. Defaults are taken from the environment
    # variables when they are set; .get() keeps the script usable with explicit
    # flags when they are not.
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    args = parser.parse_args()
    if args.model_dir is None or args.train is None:
        parser.error("--model-dir and --train are required when "
                     "SM_MODEL_DIR / SM_CHANNEL_TRAIN are not set")
    print("Training with args: {}".format(args))

    # The training file has no header row: feature columns first, label last.
    raw_data = pd.read_csv(
        os.path.join(args.train, 'abalone.csv'),
        header=None,
        names=feature_columns_names + [label_column],
        dtype=merge_two_dicts(feature_columns_dtype, label_column_dtype))
    print("Read abalone.csv")
    # info() prints its own summary and returns None, so it is not wrapped in print().
    raw_data.info()

    # Fit the numeric scaler and apply it to the training data.
    num_scaler = preprocessing.MinMaxScaler()
    num_scaled = num_scaler.fit_transform(raw_data[numeric_features])
    df_scaled = pd.DataFrame(num_scaled, columns=numeric_features)

    # Fit the label encoder on the 1-D 'sex' column and apply it to that same
    # column, rather than to a one-column DataFrame.
    cat_encoder = preprocessing.LabelEncoder()
    df_encoded = cat_encoder.fit_transform(raw_data['sex'])
    df_scaled['sex'] = df_encoded
    # Restore the original feature column order.
    df_scaled = df_scaled[feature_columns_names]

    # Persist both fitted transformers as [scaler, encoder]; model_fn unpacks
    # them in this order.
    with open(os.path.join(args.model_dir, 'model.pkl'), 'wb') as f:
        pickle.dump([num_scaler, cat_encoder], f)
    print("Trained encoder and saved as model.pkl")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment