# ruanbekker/boto3_emr_create_cluster_with_wordcount_step.py

## boto3_emr_create_cluster_with_wordcount_step.py
import boto3

# Launch a small EMR cluster and submit a single Hadoop wordcount step to it,
# then print the identifiers returned by the API.
#
# NOTE: the values written in angle brackets below (<bucket>, <keyname>,
# subnet-<id>) are placeholders and have to be replaced with real resource
# names before the script is run.

# EMR API client bound to the eu-west-1 region.
client = boto3.client(
    'emr',
    region_name='eu-west-1'
)

# Command executed by the step: the stock Hadoop wordcount example, reading the
# node-local file /etc/services and writing its result to /output.
cmd = "hadoop jar /usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar wordcount file:///etc/services /output"

# run_job_flow creates the cluster and registers the step in one request.
emrcluster = client.run_job_flow(
    Name='EMR Cluster with Boto',
    LogUri='s3://<bucket>/logs/',
    ReleaseLabel='emr-5.3.0',
    Instances={
        # One master node and two core nodes, all on-demand m1.medium.
        'InstanceGroups': [
            {
                'Name': "Master nodes",
                'Market': 'ON_DEMAND',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm1.medium',
                'InstanceCount': 1,
            },
            {
                'Name': "Slave nodes",
                'Market': 'ON_DEMAND',
                'InstanceRole': 'CORE',
                'InstanceType': 'm1.medium',
                'InstanceCount': 2,
            },
        ],
        'Ec2KeyName': '<keyname>',
        # Per the EMR API, True keeps the cluster running once no steps are
        # left, so it has to be terminated explicitly afterwards.
        'KeepJobFlowAliveWhenNoSteps': True,
        'TerminationProtected': False,
        'Ec2SubnetId': 'subnet-<id>',
    },
    Steps=[
        {
            'Name': 'Wordcount Job',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                # Plain whitespace split is sufficient here because no
                # argument of `cmd` contains spaces or shell quoting.
                'Args': cmd.split(),
            },
        },
    ],
    VisibleToAllUsers=True,
    JobFlowRole='EMR_EC2_DefaultRole',
    ServiceRole='EMR_DefaultRole',
    Tags=[
        {
            'Key': 'Name',
            'Value': 'EMR with Boto',
        },
        {
            'Key': 'TerminationVal',
            'Value': 'OK',
        },
    ],
)

# Report the new cluster id, the request id, and the 'date' HTTP header of the
# API response (printed under the "DateCreated" label).
print(
    'ClusterID: {} , DateCreated: {} , RequestId: {}'
    .format(
        emrcluster['JobFlowId'],
        emrcluster['ResponseMetadata']['HTTPHeaders']['date'],
        emrcluster['ResponseMetadata']['RequestId'],
    )
)