Skip to content

Instantly share code, notes, and snippets.

@anna-anisienia
Created October 31, 2020 13:55
Show Gist options
  • Save anna-anisienia/241baccc99d782bcc7e0c2a1913d3a93 to your computer and use it in GitHub Desktop.
Save anna-anisienia/241baccc99d782bcc7e0c2a1913d3a93 to your computer and use it in GitHub Desktop.
import boto3
import os
S3_KEY = 'taxi_2020-06.csv'
S3_BUCKET = 'playground-datasets'
TARGET_FILE = 'unknown_payment_type.csv'
s3_client = boto3.client(service_name='s3')
query = """SELECT VendorID, tpep_pickup_datetime, tpep_dropoff_datetime,
passenger_count, trip_distance, tip_amount, total_amount
FROM S3Object
WHERE payment_type = '5'"""
result = s3_client.select_object_content(Bucket=S3_BUCKET,
Key=S3_KEY,
ExpressionType='SQL',
Expression=query,
InputSerialization={'CSV': {'FileHeaderInfo': 'Use'}},
OutputSerialization={'CSV': {}})
# remove the file if exists, since we append filtered rows line by line
if os.path.exists(TARGET_FILE):
os.remove(TARGET_FILE)
with open(TARGET_FILE, 'a+') as filtered_file:
# write header as a first line, then append each row from S3 select
filtered_file.write('ID,pickup,dropoff,passenger_count,distance,tip,total\n')
for record in result['Payload']:
if 'Records' in record:
res = record['Records']['Payload'].decode('utf-8')
filtered_file.write(res)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment