Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
Run Impala query on EMR cluster and store results as CSV file on S3
import boto.emr
from boto.s3.key import Key
from boto.s3.connection import S3Connection
aws_key = 'your-key'
aws_secret = 'your-secret'
bucket_name = 'your-bucket-name'
result_file_name = 'your-result-file-path'
s3_connection = S3Connection(aws_key, aws_secret, validate_certs=False)
bucket = s3_connection.get_bucket(bucket_name)
query = Key(bucket)
query.key = '{}.sql'.format(result_file_name)
query.set_contents_from_string('''
SELECT * FROM my_table;
''')
query_runner = Key(bucket)
query_runner.key = '{}.runner'.format(result_file_name)
query_runner.set_contents_from_string('''
REMOTE_QUERY_FILE=$1
LOCAL_QUERY_FILE=./impala.query
LOCAL_OUTPUT_FILE=./output.csv
S3_OUTPUT_FILE=s3://{}/{}.csv
hadoop fs -copyToLocal $REMOTE_QUERY_FILE $LOCAL_QUERY_FILE
impala-shell --query_file $LOCAL_QUERY_FILE \
--output_file $LOCAL_OUTPUT_FILE \
--print_header \
--delimited \
--output_delimiter=','
hadoop fs -rm $S3_OUTPUT_FILE
hadoop fs -mkdir -p $(dirname $S3_OUTPUT_FILE)
hadoop fs -copyFromLocal $LOCAL_OUTPUT_FILE $S3_OUTPUT_FILE
'''.format(bucket.name, result_file_name))
emr_steps = [
boto.emr.step.ScriptRunnerStep(
name='Run query {}'.format(result_file_name),
step_args=[
's3://{}/{}'.format(bucket.name, query_runner.key),
's3://{}/{}'.format(bucket.name, query.key)
]
)
]
emr_connection = boto.emr.connection.EmrConnection(
aws_key,
aws_secret,
region=emr_region('us-east-1'),
)
emr_connection.run_jobflow(
name='Cluster name',
log_uri='s3://{}/logs'.format(bucket.name),
ec2_keyname='my_key',
master_instance_type='m3.xlarge',
slave_instance_type='m3.2xlarge',
num_instances=2,
action_on_failure='CANCEL_AND_WAIT',
keep_alive=True,
ami_version='3.3.1',
bootstrap_actions=[
boto.emr.BootstrapAction(
'Install Impala',
's3://us-east-1.elasticmapreduce/libs/impala/setup-impala',
[
'--install-impala',
'--base-path', 's3://us-east-1.elasticmapreduce',
'--impala-version', 'latest',
'IMPALA_BACKEND_PORT=22001',
'IMPALA_MEM_LIMIT=90%',
]
)],
steps=emr_steps,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment