git clone git@github.com:YOUR-USERNAME/YOUR-FORKED-REPO.git
cd YOUR-FORKED-REPO
git remote add upstream https://github.com/ORIGINAL-DEV-USERNAME/REPO-YOU-FORKED-FROM.git
git fetch upstream
# Copyright (C) 2016 Martina Pugliese
from boto3 import resource
from boto3.dynamodb.conditions import Key

# The boto3 DynamoDB resource
dynamodb_resource = resource('dynamodb')

# The original snippet is truncated here; a minimal body returning table metadata
def get_table_metadata(table_name):
    table = dynamodb_resource.Table(table_name)
    return {'num_items': table.item_count, 'status': table.table_status}
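For a quick sanity check, the helper can be called with the name of any existing table ('my-table' below is a placeholder):

table_info = get_table_metadata('my-table')  # table name is a placeholder
print(table_info)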
# `spark`, `columns` and `s3_input_data_path` are assumed to be defined earlier
# in the Glue script (e.g. read from the job arguments via getResolvedOptions)
input_df = spark.read.option("header", "true").csv(s3_input_data_path)
rearranged_col_names_df = input_df.select(*columns)

# Drop rows containing null values
cleaned_df = rearranged_col_names_df.dropna()
print("Dropped null values")

# Split into train (70%) and validation (30%) sets with a fixed seed
splits = cleaned_df.randomSplit([0.7, 0.3], 0)
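A script like this typically ends by persisting each split back to S3. A minimal sketch, assuming `s3_processed_data_path` was read from the job arguments (the `--s3_processed_data_path` argument passed in later):

train_df, validation_df = splits

# Write each split back to S3 as headered CSV
train_df.write.mode('overwrite').option('header', 'true').csv(s3_processed_data_path + '/train')
validation_df.write.mode('overwrite').option('header', 'true').csv(s3_processed_data_path + '/validation')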
glue = boto3.client('glue')
glue_job_name = 'MyDataProcessingETL'
s3_script_path = 's3://my-code-bucket/glue/glue-etl-processing.py'
my_glue_role = 'MyGlueJobRole'  # created earlier

# The call is truncated in the original; Role and Command complete it
response = glue.create_job(
    Name=glue_job_name,
    Role=my_glue_role,
    Command={'Name': 'glueetl', 'ScriptLocation': s3_script_path, 'PythonVersion': '3'}
)
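To smoke-test the job outside of Step Functions, it can be started and polled directly with boto3; the S3 argument values below are placeholders:

run = glue.start_job_run(
    JobName=glue_job_name,
    Arguments={
        '--s3_input_data_path': 's3://my-data-bucket/input/',
        '--s3_processed_data_path': 's3://my-data-bucket/processed/'
    }
)
print(glue.get_job_run(JobName=glue_job_name, RunId=run['JobRunId'])['JobRun']['JobRunState'])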
data_processing_step = GlueStartJobRunStep(
    state_id='GlueDataProcessingStep',
    parameters={
        'JobName': glue_job_name,
        'Arguments': {
            '--s3_input_data_path': execution_input['S3InputDataPath'],
            '--s3_processed_data_path': execution_input['S3OutputDataPath']
        }
    }
)
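The `execution_input` placeholders used above come from the Step Functions Data Science SDK. A minimal sketch of how the schema might be declared, using the keys that appear throughout these snippets:

from stepfunctions.inputs import ExecutionInput

execution_input = ExecutionInput(schema={
    'JobName': str,
    'IAMRole': str,
    'S3CodePath': str,
    'S3InputDataPath': str,
    'S3OutputDataPath': str,
    'EcrContainerUri': str
})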
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri  # SageMaker SDK v1

xgb = sagemaker.estimator.Estimator(
    get_image_uri(region, 'xgboost'),
    sagemaker_execution_role,
    train_instance_count=1,
    train_instance_type='ml.m4.4xlarge',
    train_volume_size=5,  # EBS volume size in GB
    output_path=f's3://{model_bucket}/{prefix}',
    sagemaker_session=session
)
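The `training_step` chained below is not shown in these snippets; one plausible definition with the SDK's TrainingStep (the training data URI is a placeholder, and `s3_input` is the SDK v1 input wrapper):

from stepfunctions.steps import TrainingStep
from sagemaker.inputs import s3_input

training_step = TrainingStep(
    'ModelTrainingStep',
    estimator=xgb,
    data={'train': s3_input('s3://my-data-bucket/processed/train', content_type='csv')},
    job_name=execution_input['JobName'],
    wait_for_completion=True
)

The `model_step` and `transform_step` would be defined analogously with the SDK's ModelStep and TransformStep.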
workflow_definition = Chain([
    data_processing_step,
    training_step,
    model_step,
    transform_step
])

# Completing the truncated call: a workflow needs its definition, a Step
# Functions execution role (assumed defined earlier) and the execution input
workflow = Workflow(
    name='MyTrainTransformDeployWithGlue_v2',
    definition=workflow_definition,
    role=workflow_execution_role,
    execution_input=execution_input
)
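With the workflow defined, it is registered with AWS Step Functions once; `workflow.create()` returns the state machine's ARN:

state_machine_arn = workflow.create()
print(state_machine_arn)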
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

BASE_PROCESSING_IMAGE = ''  # fill in: ECR URI of the processing container
INPUT_DATA_DESTINATION = '/opt/ml/processing/input_data'
PROCESSED_DATA_PATH = '/opt/ml/processing/processed_data'
DEFAULT_VOLUME_SIZE = 100  # EBS volume size, in GB
DEFAULT_INSTANCE_TYPE = 'ml.m5.xlarge'
DEFAULT_INSTANCE_COUNT = 1
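A minimal sketch of how these constants might feed a SageMaker Processing job; the script name and S3 URIs are placeholders, and `sagemaker_execution_role` is the role used earlier for the estimator:

processor = ScriptProcessor(
    image_uri=BASE_PROCESSING_IMAGE,
    command=['python3'],
    role=sagemaker_execution_role,
    instance_type=DEFAULT_INSTANCE_TYPE,
    instance_count=DEFAULT_INSTANCE_COUNT,
    volume_size_in_gb=DEFAULT_VOLUME_SIZE
)

processor.run(
    code='processing.py',
    inputs=[ProcessingInput(source='s3://my-data-bucket/input/',
                            destination=INPUT_DATA_DESTINATION)],
    outputs=[ProcessingOutput(source=PROCESSED_DATA_PATH,
                              destination='s3://my-data-bucket/processed/')]
)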
import boto3
import json

sm_client = boto3.client('sagemaker')

def lambda_handler(event, context):
    """
    :param event: the Lambda invocation payload
    :param context: the Lambda runtime context (unused)
    """
    # The handler body is truncated in the original; minimal placeholder
    return {'statusCode': 200, 'body': json.dumps(event)}
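If this Lambda is meant to participate in the state machine, the Data Science SDK can wire it in with a LambdaStep; the function name here is hypothetical:

from stepfunctions.steps import LambdaStep

lambda_step = LambdaStep(
    state_id='MyLambdaStep',
    parameters={
        'FunctionName': 'my-pipeline-lambda',  # hypothetical function name
        'Payload': {'JobName.$': '$.JobName'}
    }
)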
data_processing_configuration = dict(
    JobName=execution_input['JobName'],
    IAMRole=execution_input['IAMRole'],
    LocalStorageSizeGB=50,
    S3CodePath=execution_input['S3CodePath'],
    S3InputDataPath=execution_input['S3InputDataPath'],
    S3OutputDataPath=execution_input['S3OutputDataPath'],
    EcrContainerUri=execution_input['EcrContainerUri']
)
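At execution time, each placeholder needs a concrete value. A sketch of launching the workflow with illustrative values (every value below is a placeholder):

execution = workflow.execute(inputs={
    'JobName': 'MyDataProcessingETL',
    'IAMRole': 'MyGlueJobRole',
    'S3CodePath': 's3://my-code-bucket/glue/glue-etl-processing.py',
    'S3InputDataPath': 's3://my-data-bucket/input/',
    'S3OutputDataPath': 's3://my-data-bucket/processed/',
    'EcrContainerUri': '123456789012.dkr.ecr.us-east-1.amazonaws.com/my-image:latest'
})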