Skip to content

Instantly share code, notes, and snippets.

@snehamehrin
Created July 19, 2020 14:15
Show Gist options
  • Save snehamehrin/4608e5bb1f1d554904859c04d24fd3aa to your computer and use it in GitHub Desktop.
Save snehamehrin/4608e5bb1f1d554904859c04d24fd3aa to your computer and use it in GitHub Desktop.
import boto3
import setup
def create_emr():
    """Launch the 'stackoverflow' EMR cluster and print the running clusters.

    Submits a job flow in us-east-1 with Spark, Hue, Hive and Presto
    installed, using one master and one core spot instance group. Two steps
    are queued: copy ``stack-processing.py`` from S3 into ``/home/hadoop/``,
    then run it with ``spark-submit``. Afterwards the clusters that are in
    the RUNNING state are listed and that listing is printed.

    Returns:
        str: The ``JobFlowId`` reported by ``run_job_flow`` for the new
        cluster.
    """
    client = boto3.client('emr', region_name='us-east-1')

    applications = [
        {'Name': name} for name in ('Spark', 'hue', 'hive', 'presto')
    ]

    instances = {
        'InstanceGroups': [
            {
                'Name': "Master",
                'Market': 'SPOT',
                'InstanceRole': 'MASTER',
                'InstanceType': 'm1.xlarge',
                'InstanceCount': 1,
            },
            {
                'Name': "Slave",
                'Market': 'SPOT',
                'InstanceRole': 'CORE',
                'InstanceType': 'm1.xlarge',
                'InstanceCount': 1,
            },
        ],
        'Ec2KeyName': setup.key_name,
        'KeepJobFlowAliveWhenNoSteps': True,
        'TerminationProtected': False,
    }

    # Both steps go through command-runner.jar; a failure in either one
    # terminates the cluster.
    steps = [
        {
            'Name': 'Copy Script',
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ['aws', 's3', 'cp',
                         's3://stack-overflow-bucket/stack-processing.py',
                         '/home/hadoop/'],
            },
        },
        {
            'Name': 'Run Spark',
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': 'command-runner.jar',
                'Args': ['spark-submit', '/home/hadoop/stack-processing.py'],
            },
        },
    ]

    run_response = client.run_job_flow(
        Name='stackoverflow',
        ReleaseLabel='emr-5.18.0',
        Applications=applications,
        Instances=instances,
        VisibleToAllUsers=True,
        JobFlowRole='EMR_EC2_DefaultRole',
        ServiceRole='EMR_DefaultRole',
        Steps=steps,
    )
    # run_job_flow returns a dict; the cluster id itself is under 'JobFlowId'.
    cluster_id = run_response['JobFlowId']

    # NOTE(review): a cluster that was just requested is presumably still
    # starting up, so it may not show in a RUNNING-only listing -- confirm
    # whether this filter is intended.
    response = client.list_clusters(
        ClusterStates=['RUNNING'],
    )
    print(response)

    return cluster_id
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment