Skip to content

Instantly share code, notes, and snippets.

@hakanilter
Created July 5, 2019 23:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hakanilter/8a9f8850915a987d25ecc2a91b889d1c to your computer and use it in GitHub Desktop.
Save hakanilter/8a9f8850915a987d25ecc2a91b889d1c to your computer and use it in GitHub Desktop.
S3 Select Example
import csv
import io

import boto3
import pandas as pd
s3 = boto3.client('s3', 'eu-west-1')
def execute_query(
    query,
    *,
    bucket='my-bucket',
    key='nyse/NYSE-2000-2001.tsv.gz',
    client=None,
):
    """Run an S3 Select SQL query and return the result as a DataFrame.

    The object is read as GZIP-compressed, tab-delimited text with
    ``FileHeaderInfo`` set to ``IGNORE``. Results are requested as CSV with
    the service's default output settings and parsed with the ``csv`` module,
    so quoted fields that contain commas stay intact.

    Args:
        query: SQL expression passed to S3 Select.
        bucket: Name of the bucket holding the object.
        key: Key of the object to query.
        client: Object exposing ``select_object_content``; when ``None`` the
            module-level ``s3`` client is used.

    Returns:
        A ``pandas.DataFrame`` with one row per returned record and string
        cells. It is empty when the query matches nothing.

    Raises:
        RuntimeError: If the event stream finishes without an ``End`` event,
            which means the result may be truncated.
    """
    if client is None:
        client = s3

    response = client.select_object_content(
        Bucket=bucket,
        Key=key,
        ExpressionType='SQL',
        Expression=query,
        InputSerialization={
            'CompressionType': 'GZIP',
            'CSV': {
                'FileHeaderInfo': 'IGNORE',
                'RecordDelimiter': '\n',
                'FieldDelimiter': '\t',
            },
        },
        # An empty CSV spec keeps the service defaults; the parsing below
        # relies on comma-separated fields and newline-terminated records.
        OutputSerialization={'CSV': {}},
    )

    # Results arrive as several ``Records`` events, so every chunk must be
    # kept. Chunks stay as bytes until the end because a multi-byte UTF-8
    # character may be split across two events.
    chunks = []
    completed = False
    for event in response['Payload']:
        if 'Records' in event:
            chunks.append(event['Records']['Payload'])
        elif 'Stats' in event:
            stats_details = event['Stats']['Details']
            print("Stats details bytesScanned: ")
            print(stats_details['BytesScanned'])
            print("Stats details bytesProcessed: ")
            print(stats_details['BytesProcessed'])
        elif 'End' in event:
            completed = True

    if not completed:
        raise RuntimeError(
            "S3 Select stream ended without an End event; result is incomplete"
        )

    text = b''.join(chunks).decode('utf-8')
    # newline='' lets csv.reader handle record terminators (and any newlines
    # inside quoted fields) itself.
    rows = list(csv.reader(io.StringIO(text, newline='')))
    return pd.DataFrame(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment