Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
No Dataflow Runner
import boto3
# Start an EMR cluster with two steps: an S3DistCp copy of the enriched data
# followed by the Snowplow RDB Shredder job, then print the new job flow id.
#
# The following names are placeholders that this script does not define; they
# must be assigned before the script is run:
#   ACCESS_KEY, SECRET_KEY          - AWS credentials for the EMR client
#   YOUR_PATH_WITH_PROPER_RUN_ID    - S3DistCp destination path
#   YOUR_B64_ENCODED_IGLU           - value passed to the shredder's --iglu-config
#   YOUR_B64_ENCODED_CONFIG         - value passed to the shredder's --config
connection = boto3.client(
    'emr',
    region_name='eu-central-1',
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
)

# run_job_flow returns a response dict; despite the variable name, the actual
# id is read from its 'JobFlowId' key below.
cluster_id = connection.run_job_flow(
    Name='RDB Shredder',
    LogUri='s3://example-snowplow/logs',
    ReleaseLabel='emr-6.4.0',
    Applications=[
        {
            'Name': 'Spark',
        },
    ],
    Instances={
        'InstanceGroups': [
            {
                'Name': "Master nodes",
                'Market': 'ON_DEMAND',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm1.xlarge',
                'InstanceCount': 1,
            },
            {
                'Name': "Slave nodes",
                'Market': 'ON_DEMAND',
                'InstanceRole': 'CORE',
                'InstanceType': 'm5.xlarge',
                'InstanceCount': 1,
            },
        ],
        'Ec2KeyName': 'example-ec2-key',
        'TerminationProtected': False,
        'Ec2SubnetId': 'subnet-abcdef124567890ab',
    },
    Steps=[
        {
            # Step 1: S3DistCp. The --src/--dest/... options belong to this
            # jar (they were previously attached to the shredder step).
            'Name': 'S3DistCp enriched data archiving',
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': '/usr/share/aws/emr/s3-dist-cp/lib/s3-dist-cp.jar',
                'Args': [
                    "--src", "s3://com-acme/enriched/sink/",
                    "--dest", YOUR_PATH_WITH_PROPER_RUN_ID,
                    "--s3Endpoint", "s3-eu-central-1.amazonaws.com",
                    "--srcPattern", ".*",
                    "--outputCodec", "gz",
                    "--deleteOnSuccess",
                ],
            },
        },
        {
            # Step 2: RDB Shredder, configured through the Iglu resolver and
            # the shredder config (previously attached to the S3DistCp step).
            # NOTE(review): the shredder is a Spark application; Snowplow's
            # published playbooks appear to launch it via command-runner.jar
            # with spark-submit rather than as a direct jar step - confirm
            # that invoking the jar directly works on this EMR release.
            'Name': 'RDB Shredder',
            'ActionOnFailure': 'CANCEL_AND_WAIT',
            'HadoopJarStep': {
                'Jar': 's3://snowplow-hosted-assets-eu-central-1/4-storage/rdb-shredder/snowplow-rdb-shredder-2.1.0-rc2.jar',
                'Args': [
                    '--iglu-config', YOUR_B64_ENCODED_IGLU,
                    '--config', YOUR_B64_ENCODED_CONFIG,
                ],
            },
        },
    ],
    VisibleToAllUsers=True,
    JobFlowRole='EMR_EC2_DefaultRole',
    ServiceRole='EMR_DefaultRole',
    Tags=[],
)
print('cluster created with the step...', cluster_id['JobFlowId'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment