# One-time setup for working on a GitHub fork:

# 1. Clone your fork over SSH (replace the placeholders with your username/repo).
git clone git@github.com:YOUR-USERNAME/YOUR-FORKED-REPO.git

# 2. Move into the working copy.
cd into/cloned/fork-repo

# 3. Register the original project as the "upstream" remote so you can pull its changes.
#    NOTE: the unencrypted git:// protocol was permanently disabled by GitHub in 2022,
#    so the remote must use https:// (or ssh) instead.
git remote add upstream https://github.com/ORIGINAL-DEV-USERNAME/REPO-YOU-FORKED-FROM.git

# 4. Download upstream's branches and objects without merging anything yet.
git fetch upstream
# Register an AWS Glue ETL job whose script has been uploaded to S3.
# NOTE(review): this fragment assumes `import boto3` appears earlier in the file — confirm.
glue = boto3.client('glue')

glue_job_name = 'MyDataProcessingETL'
s3_script_path = 's3://my-code-bucket/glue/glue-etl-processing.py'
my_glue_role = 'MyGlueJobRole'  # created earlier

# The original snippet was truncated after `Name=`; the remaining arguments below are a
# reconstruction from the variables this fragment defines — TODO confirm against the
# source document / Boto3 Glue.Client.create_job reference.
response = glue.create_job(
    Name=glue_job_name,
    Role=my_glue_role,
    Command={
        'Name': 'glueetl',            # standard Spark ETL job type
        'ScriptLocation': s3_script_path,
        'PythonVersion': '3',
    },
)
| input_df = spark.read.option("header", "true").csv(s3_input_data_path) | |
| rearranged_col_names_df = input_df.select(*columns) | |
| # drop null values | |
| cleaned_df = rearranged_col_names_df.dropna() | |
| print("Dropped null values") | |
| # split dataframe into train and validation | |
| splits = cleaned_df.randomSplit([0.7, 0.3], 0) |
# Copyright (C) 2016 Martina Pugliese

# NOTE: this fragment was wrapped in markdown-table pipe residue (`| ... | |`) that made it
# syntactically invalid; below is the same code with the garbling removed.
from boto3 import resource
from boto3.dynamodb.conditions import Key

# The boto3 dynamoDB resource (uses default credentials/region from the environment).
dynamodb_resource = resource('dynamodb')
| def get_table_metadata(table_name): |