Skip to content

Instantly share code, notes, and snippets.

@larsxschneider
Last active August 29, 2015 14:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save larsxschneider/922b8aaf07bc9b71ff39 to your computer and use it in GitHub Desktop.
Save larsxschneider/922b8aaf07bc9b71ff39 to your computer and use it in GitHub Desktop.
Run Impala query on EMR cluster and store results as CSV file on S3
import boto.emr
from boto.s3.key import Key
from boto.s3.connection import S3Connection
aws_key = 'your-key'
aws_secret = 'your-secret'
bucket_name = 'your-bucket-name'
result_file_name = 'your-result-file-path'
s3_connection = S3Connection(aws_key, aws_secret, validate_certs=False)
bucket = s3_connection.get_bucket(bucket_name)
query = Key(bucket)
query.key = '{}.sql'.format(result_file_name)
query.set_contents_from_string('''
SELECT * FROM my_table;
''')
query_runner = Key(bucket)
query_runner.key = '{}.runner'.format(result_file_name)
query_runner.set_contents_from_string('''
REMOTE_QUERY_FILE=$1
LOCAL_QUERY_FILE=./impala.query
LOCAL_OUTPUT_FILE=./output.csv
S3_OUTPUT_FILE=s3://{}/{}.csv
hadoop fs -copyToLocal $REMOTE_QUERY_FILE $LOCAL_QUERY_FILE
impala-shell --query_file $LOCAL_QUERY_FILE \
--output_file $LOCAL_OUTPUT_FILE \
--print_header \
--delimited \
--output_delimiter=','
hadoop fs -rm $S3_OUTPUT_FILE
hadoop fs -mkdir -p $(dirname $S3_OUTPUT_FILE)
hadoop fs -copyFromLocal $LOCAL_OUTPUT_FILE $S3_OUTPUT_FILE
'''.format(bucket.name, result_file_name))
emr_steps = [
boto.emr.step.ScriptRunnerStep(
name='Run query {}'.format(result_file_name),
step_args=[
's3://{}/{}'.format(bucket.name, query_runner.key),
's3://{}/{}'.format(bucket.name, query.key)
]
)
]
emr_connection = boto.emr.connection.EmrConnection(
aws_key,
aws_secret,
region=emr_region('us-east-1'),
)
emr_connection.run_jobflow(
name='Cluster name',
log_uri='s3://{}/logs'.format(bucket.name),
ec2_keyname='my_key',
master_instance_type='m3.xlarge',
slave_instance_type='m3.2xlarge',
num_instances=2,
action_on_failure='CANCEL_AND_WAIT',
keep_alive=True,
ami_version='3.3.1',
bootstrap_actions=[
boto.emr.BootstrapAction(
'Install Impala',
's3://us-east-1.elasticmapreduce/libs/impala/setup-impala',
[
'--install-impala',
'--base-path', 's3://us-east-1.elasticmapreduce',
'--impala-version', 'latest',
'IMPALA_BACKEND_PORT=22001',
'IMPALA_MEM_LIMIT=90%',
]
)],
steps=emr_steps,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment