git clone git@github.com:YOUR-USERNAME/YOUR-FORKED-REPO.git
cd YOUR-FORKED-REPO
git remote add upstream https://github.com/ORIGINAL-DEV-USERNAME/REPO-YOU-FORKED-FROM.git
git fetch upstream
# Copyright (C) 2016 Martina Pugliese
from boto3 import resource
from boto3.dynamodb.conditions import Key

# The boto3 DynamoDB resource
dynamodb_resource = resource('dynamodb')

# The original snippet is truncated here; a minimal body returning table metadata
def get_table_metadata(table_name):
    table = dynamodb_resource.Table(table_name)
    return {'num_items': table.item_count, 'status': table.table_status}
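For a quick sanity check, the helper can be called with the name of any existing table ('my-table' below is a placeholder):

table_info = get_table_metadata('my-table')  # table name is a placeholder
print(table_info)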
# `spark`, `columns` and `s3_input_data_path` are assumed to be defined earlier
# in the Glue script (e.g. read from the job arguments via getResolvedOptions)
input_df = spark.read.option("header", "true").csv(s3_input_data_path)
rearranged_col_names_df = input_df.select(*columns)

# Drop rows containing null values
cleaned_df = rearranged_col_names_df.dropna()
print("Dropped null values")

# Split into train (70%) and validation (30%) sets with a fixed seed
splits = cleaned_df.randomSplit([0.7, 0.3], 0)
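A script like this typically ends by persisting each split back to S3. A minimal sketch, assuming `s3_processed_data_path` was read from the job arguments (the `--s3_processed_data_path` argument passed in later):

train_df, validation_df = splits

# Write each split back to S3 as headered CSV
train_df.write.mode('overwrite').option('header', 'true').csv(s3_processed_data_path + '/train')
validation_df.write.mode('overwrite').option('header', 'true').csv(s3_processed_data_path + '/validation')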
glue = boto3.client('glue')
glue_job_name = 'MyDataProcessingETL'
s3_script_path = 's3://my-code-bucket/glue/glue-etl-processing.py'
my_glue_role = 'MyGlueJobRole'  # created earlier

# The call is truncated in the original; Role and Command complete it
response = glue.create_job(
    Name=glue_job_name,
    Role=my_glue_role,
    Command={'Name': 'glueetl', 'ScriptLocation': s3_script_path, 'PythonVersion': '3'}
)
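To smoke-test the job outside of Step Functions, it can be started and polled directly with boto3; the S3 argument values below are placeholders:

run = glue.start_job_run(
    JobName=glue_job_name,
    Arguments={
        '--s3_input_data_path': 's3://my-data-bucket/input/',
        '--s3_processed_data_path': 's3://my-data-bucket/processed/'
    }
)
print(glue.get_job_run(JobName=glue_job_name, RunId=run['JobRunId'])['JobRun']['JobRunState'])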
data_processing_step = GlueStartJobRunStep(
    state_id='GlueDataProcessingStep',
    parameters={
        'JobName': glue_job_name,
        'Arguments': {
            '--s3_input_data_path': execution_input['S3InputDataPath'],
            '--s3_processed_data_path': execution_input['S3OutputDataPath']
        }
    }
)
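The `execution_input` placeholders used above come from the Step Functions Data Science SDK. A minimal sketch of how the schema might be declared, using the keys that appear throughout these snippets:

from stepfunctions.inputs import ExecutionInput

execution_input = ExecutionInput(schema={
    'JobName': str,
    'IAMRole': str,
    'S3CodePath': str,
    'S3InputDataPath': str,
    'S3OutputDataPath': str,
    'EcrContainerUri': str
})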
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri  # SageMaker SDK v1

xgb = sagemaker.estimator.Estimator(
    get_image_uri(region, 'xgboost'),
    sagemaker_execution_role,
    train_instance_count=1,
    train_instance_type='ml.m4.4xlarge',
    train_volume_size=5,  # EBS volume size in GB
    output_path=f's3://{model_bucket}/{prefix}',
    sagemaker_session=session
)
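The `training_step` chained below is not shown in these snippets; one plausible definition with the SDK's TrainingStep (the training data URI is a placeholder, and `s3_input` is the SDK v1 input wrapper):

from stepfunctions.steps import TrainingStep
from sagemaker.inputs import s3_input

training_step = TrainingStep(
    'ModelTrainingStep',
    estimator=xgb,
    data={'train': s3_input('s3://my-data-bucket/processed/train', content_type='csv')},
    job_name=execution_input['JobName'],
    wait_for_completion=True
)

The `model_step` and `transform_step` would be defined analogously with the SDK's ModelStep and TransformStep.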
workflow_definition = Chain([
    data_processing_step,
    training_step,
    model_step,
    transform_step
])

# Completing the truncated call: a workflow needs its definition, a Step
# Functions execution role (assumed defined earlier) and the execution input
workflow = Workflow(
    name='MyTrainTransformDeployWithGlue_v2',
    definition=workflow_definition,
    role=workflow_execution_role,
    execution_input=execution_input
)
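With the workflow defined, it is registered with AWS Step Functions once; `workflow.create()` returns the state machine's ARN:

state_machine_arn = workflow.create()
print(state_machine_arn)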
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput

BASE_PROCESSING_IMAGE = ''  # fill in: ECR URI of the processing container
INPUT_DATA_DESTINATION = '/opt/ml/processing/input_data'
PROCESSED_DATA_PATH = '/opt/ml/processing/processed_data'
DEFAULT_VOLUME_SIZE = 100  # EBS volume size, in GB
DEFAULT_INSTANCE_TYPE = 'ml.m5.xlarge'
DEFAULT_INSTANCE_COUNT = 1
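A minimal sketch of how these constants might feed a SageMaker Processing job; the script name and S3 URIs are placeholders, and `sagemaker_execution_role` is the role used earlier for the estimator:

processor = ScriptProcessor(
    image_uri=BASE_PROCESSING_IMAGE,
    command=['python3'],
    role=sagemaker_execution_role,
    instance_type=DEFAULT_INSTANCE_TYPE,
    instance_count=DEFAULT_INSTANCE_COUNT,
    volume_size_in_gb=DEFAULT_VOLUME_SIZE
)

processor.run(
    code='processing.py',
    inputs=[ProcessingInput(source='s3://my-data-bucket/input/',
                            destination=INPUT_DATA_DESTINATION)],
    outputs=[ProcessingOutput(source=PROCESSED_DATA_PATH,
                              destination='s3://my-data-bucket/processed/')]
)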
import boto3
import json

sm_client = boto3.client('sagemaker')

def lambda_handler(event, context):
    """
    :param event: the Lambda invocation payload
    :param context: the Lambda runtime context (unused)
    """
    # The handler body is truncated in the original; minimal placeholder
    return {'statusCode': 200, 'body': json.dumps(event)}
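If this Lambda is meant to participate in the state machine, the Data Science SDK can wire it in with a LambdaStep; the function name here is hypothetical:

from stepfunctions.steps import LambdaStep

lambda_step = LambdaStep(
    state_id='MyLambdaStep',
    parameters={
        'FunctionName': 'my-pipeline-lambda',  # hypothetical function name
        'Payload': {'JobName.$': '$.JobName'}
    }
)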
data_processing_configuration = dict(
    JobName=execution_input['JobName'],
    IAMRole=execution_input['IAMRole'],
    LocalStorageSizeGB=50,
    S3CodePath=execution_input['S3CodePath'],
    S3InputDataPath=execution_input['S3InputDataPath'],
    S3OutputDataPath=execution_input['S3OutputDataPath'],
    EcrContainerUri=execution_input['EcrContainerUri']
)
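At execution time, each placeholder needs a concrete value. A sketch of launching the workflow with illustrative values (every value below is a placeholder):

execution = workflow.execute(inputs={
    'JobName': 'MyDataProcessingETL',
    'IAMRole': 'MyGlueJobRole',
    'S3CodePath': 's3://my-code-bucket/glue/glue-etl-processing.py',
    'S3InputDataPath': 's3://my-data-bucket/input/',
    'S3OutputDataPath': 's3://my-data-bucket/processed/',
    'EcrContainerUri': '123456789012.dkr.ecr.us-east-1.amazonaws.com/my-image:latest'
})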