Skip to content

Instantly share code, notes, and snippets.

@dardanxhymshiti
Last active May 21, 2020 15:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dardanxhymshiti/98061dc5e82af26df48b9702f7385607 to your computer and use it in GitHub Desktop.
Save dardanxhymshiti/98061dc5e82af26df48b9702f7385607 to your computer and use it in GitHub Desktop.
import boto3
from os.path import join
def lambda_handler(event, context):
    """Submit the Spark job as a new step on an existing EMR cluster.

    Builds a ``spark-submit`` command line that points at the job artifacts
    stored in S3 and registers it as a single step on the cluster through
    ``add_job_flow_steps``.

    Args:
        event: Lambda invocation payload (not read by this handler).
        context: Lambda runtime context (not read by this handler).

    Returns:
        The response returned by ``emr.add_job_flow_steps``.
    """
    emr = boto3.client('emr')

    version = 'latest'
    # S3 URIs always use forward slashes, so join the parts explicitly
    # instead of relying on os.path.join, whose separator depends on the
    # operating system the code happens to run on.
    artifacts_root = 's3://<artifacts-bucket-name>'
    main_path = '/'.join((artifacts_root, version, 'main.py'))
    modules_path = '/'.join((artifacts_root, version, 'module_seed.zip'))

    job_parameters = {
        'job_name': '<your-job-name>',
        'input_path': 's3://<raw-data-path>',
        'output_path': 's3://<processed-data-path>',
        # NOTE(review): these entries are only forwarded to main.py inside
        # the stringified job_parameters below; they are not expanded into
        # spark-submit flags here. Confirm that main.py applies them.
        'spark_config': {
            '--executor-memory': '1G',
            '--driver-memory': '2G'
        }
    }

    # The whole parameter dict is handed to main.py as one positional
    # argument, formatted with str() (Python repr, not JSON).
    # TODO confirm main.py parses it accordingly.
    step_args = [
        "/usr/bin/spark-submit",
        '--py-files', modules_path,
        main_path, str(job_parameters)
    ]

    step = {
        "Name": job_parameters['job_name'],
        # Per the setting name, a failure of this step is not meant to stop
        # the job flow.
        'ActionOnFailure': 'CONTINUE',
        'HadoopJarStep': {
            'Jar': 's3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar',
            'Args': step_args
        }
    }

    action = emr.add_job_flow_steps(JobFlowId='<emr-cluster-id>', Steps=[step])
    return action
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment