Skip to content

Instantly share code, notes, and snippets.

@gregjurman
Created October 12, 2016 21:34
Show Gist options
  • Save gregjurman/3f67128102564fd88fef0dce6582a5b2 to your computer and use it in GitHub Desktop.
Save gregjurman/3f67128102564fd88fef0dce6582a5b2 to your computer and use it in GitHub Desktop.
from __future__ import print_function
import json
import urllib
import boto3
import csv
import zipfile
# Emitted once, when this module is first imported.
print('Loading function')
# Module-level S3 client, created a single time at import; lambda_handler
# uses it for both the download of the inbound zip and the final upload.
s3 = boto3.client('s3')
# Scratch files used while rebuilding the archive.
_INBOUND_ZIP = "/tmp/inbound.zip"
_OUTBOUND_ZIP = "/tmp/outbound.zip"

# Destination of the filtered archive.
_OUTBOUND_BUCKET = "cod-yelp-outbound"
_OUTBOUND_KEY = "healthinsp.zip"

# A business is kept only if the first five characters of its zipcode
# column are one of these.
_ALLOWED_ZIPCODES = frozenset(
    ['06810', '06811', '06817', '06813', '06814', '06816'])


def _rejection_reason(row):
    """Return ``(reason, offending_value)`` if *row* must be dropped, else None.

    The checks run in order (city, state, zipcode) and the first failure
    wins, so each dropped business yields exactly one reason.
    """
    if str(row[3]).lower() != "danbury":
        return "City not Danbury", row[3]
    if str(row[4]).lower() != "ct":
        return "State not CT", row[4]
    if str(row[5])[0:5] not in _ALLOWED_ZIPCODES:
        return "Zipcode doesn't belong", row[5]
    return None


def _filter_businesses(zfile, log):
    """Write the kept rows of businesses.csv to /tmp and return removed ids.

    Args:
        zfile: open ``zipfile.ZipFile`` holding ``businesses.csv``.
        log: list that receives one message per removed business.

    Returns:
        A set with the integer id (column 0) of every removed business.
    """
    removed_ids = set()
    with zfile.open("businesses.csv") as biz_file:
        reader = csv.reader(biz_file)
        # Binary mode: this is the mode the Python 2 csv module expects.
        with open("/tmp/businesses.csv", "wb") as out_file:
            writer = csv.writer(out_file)
            writer.writerow(next(reader))  # header is copied unchanged
            for row in reader:
                rejection = _rejection_reason(row)
                if rejection is None:
                    writer.writerow(row)  # all tests passed
                    continue
                reason, got = rejection
                biz_id = int(row[0])
                removed_ids.add(biz_id)
                log.append("Removed '%s'(%i) from file: %s. Got: %s"
                           % (row[1], biz_id, reason, got))
    return removed_ids


def _copy_without_removed(zfile, member, out_path, removed_ids):
    """Copy CSV *member* of *zfile* to *out_path*, dropping removed businesses.

    The header row is always copied. A data row is skipped when the integer
    value of its first column is in *removed_ids*.
    """
    with zfile.open(member) as in_file:
        reader = csv.reader(in_file)
        with open(out_path, "wb") as out_file:
            writer = csv.writer(out_file)
            writer.writerow(next(reader))  # copy header
            for row in reader:
                if int(row[0]) in removed_ids:
                    continue
                writer.writerow(row)


def lambda_handler(event, context):
    """Filter an uploaded inspection archive and publish the result to S3.

    The zip named by the first record of the S3 *event* is downloaded, its
    ``businesses.csv`` is reduced to rows whose city is "danbury", state is
    "ct" and zipcode prefix is in the allowed list, and every row of
    ``inspections.csv`` / ``violations.csv`` whose first column is the id of
    a removed business is dropped. ``feed_info.csv`` is carried over
    untouched. The rebuilt archive is uploaded to the outbound bucket.

    NOTE: this code relies on Python 2 APIs (``urllib.unquote_plus``,
    ``str.decode`` and binary-mode csv files).

    Args:
        event: S3 notification dict; only ``event['Records'][0]`` is read.
        context: unused.

    Returns:
        None.

    Raises:
        Exception: whatever the S3 download or upload raised, re-raised
            after the error has been printed.
    """
    log = []

    # Get the object from the event
    s3_info = event['Records'][0]['s3']
    bucket = s3_info['bucket']['name']
    key = urllib.unquote_plus(s3_info['object']['key']).decode('utf8')

    try:
        s3.download_file(Bucket=bucket, Key=key, Filename=_INBOUND_ZIP)
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise  # bare raise keeps the original traceback

    with zipfile.ZipFile(_INBOUND_ZIP) as zfile:
        # A set makes the per-row membership tests below constant time.
        removed_ids = _filter_businesses(zfile, log)
        # Purge inspections and violations that belong to removed businesses.
        _copy_without_removed(
            zfile, "inspections.csv", "/tmp/inspections.csv", removed_ids)
        _copy_without_removed(
            zfile, "violations.csv", "/tmp/violations.csv", removed_ids)
        # The feed info is passed through unmodified.
        zfile.extract("feed_info.csv", "/tmp/")

    # Done purging data, make a new zip file.
    with zipfile.ZipFile(_OUTBOUND_ZIP, "w") as out_zip:
        out_zip.write("/tmp/feed_info.csv", "feed_info.csv")
        out_zip.write("/tmp/businesses.csv", "businesses.csv")
        out_zip.write("/tmp/violations.csv", "violations.csv")
        out_zip.write("/tmp/inspections.csv", "inspections.csv")

    # Spit out the removal log (one line per removed business).
    for message in log:
        print(message)

    try:
        s3.upload_file(
            Filename=_OUTBOUND_ZIP, Bucket=_OUTBOUND_BUCKET, Key=_OUTBOUND_KEY)
    except Exception as e:
        print(e)
        print('Error putting object {} into bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(_OUTBOUND_KEY, _OUTBOUND_BUCKET))
        raise  # bare raise keeps the original traceback
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment