import boto3
import setup


def create_emr():
    """Spin up the EMR cluster for the Stack Overflow pipeline and stage its scripts."""
    client = boto3.client('emr', region_name='us-east-1')
    # run_job_flow returns the new cluster's metadata; cluster_id['JobFlowId'] is the cluster id.
    cluster_id = client.run_job_flow(
        Name='stackoverflow',
        ReleaseLabel='emr-5.18.0',
        Applications=[
            {'Name': 'Spark'},
            {'Name': 'Hue'},
            {'Name': 'Hive'},
            {'Name': 'Presto'}
        ],
        Instances={
            'InstanceGroups': [
                {
                    'Name': 'Master',
                    'Market': 'SPOT',
                    'InstanceRole': 'MASTER',
                    'InstanceType': 'm1.xlarge',
                    'InstanceCount': 1,
                },
                {
                    'Name': 'Slave',
                    'Market': 'SPOT',
                    'InstanceRole': 'CORE',
                    'InstanceType': 'm1.xlarge',
                    'InstanceCount': 1,
                }
            ],
            'Ec2KeyName': setup.key_name,
            # Keep the cluster alive after these setup steps so jobs can be added later.
            'KeepJobFlowAliveWhenNoSteps': True,
            'TerminationProtected': False,
        },
        VisibleToAllUsers=True,
        JobFlowRole='EMR_EC2_DefaultRole',
        ServiceRole='EMR_DefaultRole',
        # Copy the processing script, the driver shell script, and the Redshift JDBC
        # driver from S3 onto the master node.
        Steps=[
            {
                'Name': 'Copy Processing Script',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': ['aws', 's3', 'cp',
                             's3://stack-overflow-bucket/scripts/stack-processing.py',
                             '/home/hadoop/']
                }
            },
            {
                'Name': 'Copy Executing Script',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': ['aws', 's3', 'cp',
                             's3://stack-overflow-bucket/scripts/Execute.sh',
                             '/home/hadoop/']
                }
            },
            {
                'Name': 'Copy Jar',
                'ActionOnFailure': 'TERMINATE_CLUSTER',
                'HadoopJarStep': {
                    'Jar': 'command-runner.jar',
                    'Args': ['aws', 's3', 'cp',
                             's3://stack-overflow-bucket/scripts/RedshiftJDBC42-no-awssdk-1.2.20.1043.jar',
                             '/home/hadoop/']
                }
            }
        ]
    )
    # Confirm the request went through by listing clusters that are already running.
    response = client.list_clusters(ClusterStates=['RUNNING'])
    print(response)


create_emr()
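
# --- Not part of the original gist: a minimal sketch of bridging the two snippets. ---
# It assumes the 'JobFlowId' returned by run_job_flow above and uses boto3's built-in
# 'cluster_running' waiter, which polls DescribeCluster until the cluster reaches the
# RUNNING or WAITING state, so the step submitted below lands on an idle cluster.
import boto3


def wait_for_cluster(cluster_id):
    client = boto3.client('emr', region_name='us-east-1')
    waiter = client.get_waiter('cluster_running')
    # Poll every 30 seconds, for at most 60 attempts (roughly 30 minutes).
    waiter.wait(ClusterId=cluster_id, WaiterConfig={'Delay': 30, 'MaxAttempts': 60})


# Example with a hypothetical id: wait_for_cluster('j-XXXXXXXXXXXXX')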
import boto3
import setup


def run_jobs():
    """Submit the Execute.sh step to every idle (WAITING) cluster."""
    client = boto3.client('emr', region_name='us-east-1')
    # Only clusters in the WAITING state are idle and ready to accept new steps.
    response = client.list_clusters(ClusterStates=['WAITING'])
    for cluster in response['Clusters']:
        cluster_id = cluster['Id']
        response = client.add_job_flow_steps(
            JobFlowId=cluster_id,
            Steps=[
                {
                    'Name': 'Execute Script',
                    'ActionOnFailure': 'CANCEL_AND_WAIT',
                    'HadoopJarStep': {
                        'Jar': 'command-runner.jar',
                        # Execute.sh was staged on the master node by the copy steps above.
                        'Args': ['bash', '/home/hadoop/Execute.sh']
                    }
                }
            ]
        )
        print(response)


run_jobs()
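
# --- Not part of the original gist: a hedged sketch of confirming the step finished. ---
# add_job_flow_steps returns the new step ids ({'StepIds': [...]}), so one way to verify
# Execute.sh completed is boto3's 'step_complete' waiter plus describe_step. The
# cluster_id and step_id arguments here are illustrative placeholders.
import boto3


def wait_for_step(cluster_id, step_id):
    client = boto3.client('emr', region_name='us-east-1')
    # Raises a WaiterError if the step fails, is cancelled, or times out.
    client.get_waiter('step_complete').wait(ClusterId=cluster_id, StepId=step_id)
    state = client.describe_step(ClusterId=cluster_id, StepId=step_id)['Step']['Status']['State']
    print(state)  # expected: COMPLETED


# Example usage with the response returned by add_job_flow_steps above:
# wait_for_step(cluster_id, response['StepIds'][0])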