gregjurman/yelp.py

## yelp.py
from __future__ import print_function

import json
import urllib
import boto3
import csv
import zipfile

print('Loading function')

s3 = boto3.client('s3')

# ZIP code prefixes (first five characters) accepted for a business row.
_ALLOWED_ZIPCODES = frozenset(['06810', '06811', '06817', '06813', '06814', '06816'])


def _rejection_reason(row):
    """Return why a business row must be dropped, or None to keep it.

    The checks run in city, state, zipcode order and stop at the first one
    that fails, so a row only ever reports a single reason. Columns read:
    row[3] city, row[4] state, row[5] zipcode.
    """
    if str(row[3]).lower() != "danbury":
        return "City not Danbury. Got: %s" % (row[3],)
    if str(row[4]).lower() != "ct":
        return "State not CT. Got: %s" % (row[4],)
    if str(row[5])[0:5] not in _ALLOWED_ZIPCODES:
        return "Zipcode doesn't belong. Got: %s" % (row[5],)
    return None


def _filter_businesses(zfile, out_path, log):
    """Copy businesses.csv from the archive to out_path, dropping rejected rows.

    Appends one message to ``log`` per dropped row and returns the set of
    dropped business ids (row[0] parsed as int).
    """
    removed_ids = set()
    with zfile.open("businesses.csv") as src:
        reader = csv.reader(src)
        # "wb" is what the csv module expects for output files on Python 2.
        with open(out_path, "wb") as dst:
            writer = csv.writer(dst)
            writer.writerow(next(reader))  # header row is copied unchanged
            for row in reader:
                if not row:
                    # csv.reader yields [] for blank lines; nothing to test.
                    continue
                reason = _rejection_reason(row)
                if reason is None:
                    writer.writerow(row)
                    continue
                biz_id = int(row[0])
                removed_ids.add(biz_id)
                # row[1] is presumably the business name; it is only echoed.
                log.append("Removed '%s'(%i) from file: %s" % (row[1], biz_id, reason))
    return removed_ids


def _copy_without_removed(zfile, member, out_path, removed_ids):
    """Copy a CSV archive member to out_path, skipping removed businesses.

    A row is skipped when int(row[0]) is in ``removed_ids``; the header row
    is always copied.
    """
    with zfile.open(member) as src:
        reader = csv.reader(src)
        with open(out_path, "wb") as dst:
            writer = csv.writer(dst)
            writer.writerow(next(reader))  # header row is copied unchanged
            for row in reader:
                if not row or int(row[0]) in removed_ids:
                    continue
                writer.writerow(row)


def lambda_handler(event, context):
    """Filter an uploaded health-inspection zip and publish the result.

    Downloads the zip named by the first record of the S3 event to /tmp,
    keeps only business rows whose city is "danbury", state is "ct" and
    zipcode prefix is in the allowed list, removes the inspections and
    violations rows of every dropped business, repacks the three CSVs with
    the untouched feed_info.csv, and uploads the archive as
    ``healthinsp.zip`` to the ``cod-yelp-outbound`` bucket.

    Args:
        event: S3 event notification dict; only ``event['Records'][0]`` is read.
        context: Lambda context object (unused).

    Returns:
        None.

    Raises:
        Exception: whatever the S3 download or upload raises, re-raised with
            its original traceback after an explanatory message is printed.

    NOTE: this module targets Python 2 (``urllib.unquote_plus``, binary-mode
    csv files).
    """
    log = []

    # Get the object from the event
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    raw_key = s3_record['object']['key']
    # Unquote the UTF-8 *bytes* of the key. The previous code unquoted the
    # unicode key and then called .decode('utf8') on the result, which on
    # Python 2 fails for keys holding percent-encoded non-ASCII characters.
    if not isinstance(raw_key, bytes):
        raw_key = raw_key.encode('utf8')
    key = urllib.unquote_plus(raw_key)

    try:
        s3.download_file(Bucket=bucket, Key=key, Filename="/tmp/inbound.zip")
    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        raise  # bare raise keeps the original traceback

    # open our inbound zip and write the filtered copies next to it
    with zipfile.ZipFile("/tmp/inbound.zip") as zfile:
        removed_ids = _filter_businesses(zfile, "/tmp/businesses.csv", log)
        _copy_without_removed(zfile, "inspections.csv", "/tmp/inspections.csv", removed_ids)
        _copy_without_removed(zfile, "violations.csv", "/tmp/violations.csv", removed_ids)

        # the feed info data is passed through unchanged
        zfile.extract("feed_info.csv", "/tmp/")

    # done purging data, make a new zip file
    with zipfile.ZipFile("/tmp/outbound.zip", "w") as out_zip:
        out_zip.write("/tmp/feed_info.csv", "feed_info.csv")
        out_zip.write("/tmp/businesses.csv", "businesses.csv")
        out_zip.write("/tmp/violations.csv", "violations.csv")
        out_zip.write("/tmp/inspections.csv", "inspections.csv")

    # spit out log for CloudWatch
    for message in log:
        print(message)

    try:
        s3.upload_file(Filename="/tmp/outbound.zip", Bucket="cod-yelp-outbound", Key="healthinsp.zip")
    except Exception as e:
        print(e)
        print('Error putting object {} into bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format("healthinsp.zip", "cod-yelp-outbound"))
        raise
	from __future__ import print_function

	import json
	import urllib
	import boto3
	import csv
	import zipfile

	print('Loading function')

	s3 = boto3.client('s3')

	def lambda_handler(event, context):
	    """Filter an uploaded health-inspection zip and publish the result.

	    Downloads the zip named by the first record of the S3 event to /tmp,
	    drops business rows whose city is not "danbury", whose state is not
	    "ct", or whose zipcode prefix is not in the accepted list, removes the
	    inspections and violations rows of every dropped business, repacks the
	    CSVs together with feed_info.csv, and uploads the archive as
	    ``healthinsp.zip`` to the ``cod-yelp-outbound`` bucket.

	    Args:
	        event: S3 event notification dict; only ``event['Records'][0]`` is read.
	        context: Lambda context object (unused).

	    Returns:
	        None.

	    Raises:
	        Exception: whatever the S3 download or upload raises, re-raised
	            after an explanatory message is printed.

	    NOTE(review): this definition arrived with its indentation flattened
	    and duplicates the ``lambda_handler`` defined earlier in the file;
	    one of the two copies should be removed.
	    """
	    log = []
	    rem_ids = set()  # set: membership is tested once per inspection/violation row

	    # Get the object from the event
	    bucket = event['Records'][0]['s3']['bucket']['name']
	    raw_key = event['Records'][0]['s3']['object']['key']
	    # Unquote the UTF-8 bytes of the key; decoding *after* unquoting a
	    # unicode key fails on Python 2 for percent-encoded non-ASCII names.
	    if not isinstance(raw_key, bytes):
	        raw_key = raw_key.encode('utf8')
	    key = urllib.unquote_plus(raw_key)
	    try:
	        s3.download_file(Bucket=bucket, Key=key, Filename="/tmp/inbound.zip")
	    except Exception as e:
	        print(e)
	        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
	        raise  # bare raise keeps the original traceback

	    # open our inbound zip
	    with zipfile.ZipFile("/tmp/inbound.zip") as zfile:

	        # get the businesses file and open it as a csv
	        with zfile.open("businesses.csv") as biz_file:
	            biz_csv = csv.reader(biz_file)

	            # make a new temp csv for output
	            with open("/tmp/businesses.csv", "wb") as out_biz_file:
	                out_biz_csv = csv.writer(out_biz_file)
	                out_biz_csv.writerow(next(biz_csv))  # write header out

	                # iterate the businesses and filter out ones that aren't for Danbury
	                for r in biz_csv:
	                    if not r:
	                        continue  # blank line in the csv, nothing to check
	                    if str(r[3]).lower() not in ["danbury"]:  # City is not 'danbury'
	                        rem_ids.add(int(r[0]))
	                        log.append("Removed '%s'(%i) from file: City not Danbury. Got: %s" % (r[1], int(r[0]), r[3]))
	                        continue
	                    if str(r[4]).lower() not in ["ct"]:  # State is not 'ct'
	                        rem_ids.add(int(r[0]))
	                        log.append("Removed '%s'(%i) from file: State not CT. Got: %s" % (r[1], int(r[0]), r[4]))
	                        continue
	                    if str(r[5])[0:5] not in ['06810', '06811', '06817', '06813', '06814', '06816']:
	                        rem_ids.add(int(r[0]))
	                        log.append("Removed '%s'(%i) from file: Zipcode doesn't belong. Got: %s" % (r[1], int(r[0]), r[5]))
	                        continue

	                    out_biz_csv.writerow(r)  # all tests passed

	        # get inspections data and purge bad business listings
	        with zfile.open("inspections.csv") as insp_file:
	            insp_csv = csv.reader(insp_file)

	            # create new inspections file and NOT copy over rem_id lines
	            with open("/tmp/inspections.csv", "wb") as out_insp_file:
	                out_insp_csv = csv.writer(out_insp_file)
	                out_insp_csv.writerow(next(insp_csv))  # copy header
	                for r in insp_csv:
	                    if not r or int(r[0]) in rem_ids:
	                        continue
	                    out_insp_csv.writerow(r)

	        # get violations data and purge bad business listings
	        with zfile.open("violations.csv") as vio_file:
	            vio_csv = csv.reader(vio_file)

	            # create new violations file and NOT copy over rem_id lines
	            with open("/tmp/violations.csv", "wb") as out_vio_file:
	                out_vio_csv = csv.writer(out_vio_file)
	                out_vio_csv.writerow(next(vio_csv))  # copy header
	                for r in vio_csv:
	                    if not r or int(r[0]) in rem_ids:
	                        continue
	                    out_vio_csv.writerow(r)

	        # get the feed info data
	        zfile.extract("feed_info.csv", "/tmp/")

	    # done purging data, make a new zip file
	    with zipfile.ZipFile("/tmp/outbound.zip", "w") as out_zip:
	        out_zip.write("/tmp/feed_info.csv", "feed_info.csv")
	        out_zip.write("/tmp/businesses.csv", "businesses.csv")
	        out_zip.write("/tmp/violations.csv", "violations.csv")
	        out_zip.write("/tmp/inspections.csv", "inspections.csv")

	    # spit out log for CloudWatch
	    for l in log:
	        print(l)

	    try:
	        s3.upload_file(Filename="/tmp/outbound.zip", Bucket="cod-yelp-outbound", Key="healthinsp.zip")
	    except Exception as e:
	        print(e)
	        print('Error putting object {} into bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format("healthinsp.zip", "cod-yelp-outbound"))
	        raise