Skip to content

Instantly share code, notes, and snippets.

@amywieliczka
Created October 29, 2019 19:18
Show Gist options
  • Save amywieliczka/a57bd602cadf8faae42dcd3a06ef46ab to your computer and use it in GitHub Desktop.
Save amywieliczka/a57bd602cadf8faae42dcd3a06ef46ab to your computer and use it in GitHub Desktop.
Strips non-Google log entries from access logs in an S3 bucket
import boto3
import uuid
from urllib.parse import unquote_plus
import csv
import socket
import re
import gzip
# Shared S3 client, created once at import time and used by handler() for
# both the download and the upload.
s3_client = boto3.client('s3')
# IP addresses that verify_google() has already confirmed. It checks this list
# first and appends to it, so a confirmed address skips further DNS lookups.
googlebot_whitelist_ip = []
def verify_google(ip_address, log_path):
    """Return True if ip_address passes a reverse/forward DNS check for Google.

    The address is reverse-resolved to a hostname, the hostname must be a
    single label followed by ``.google.com`` or ``.googlebot.com``, and
    resolving that hostname forward must give back the same address.
    Addresses that pass are appended to the module-level
    ``googlebot_whitelist_ip`` list so later calls for the same address
    return without doing any DNS lookups.

    Args:
        ip_address: Client IP address string taken from a log row.
        log_path: Path of the log being processed; only used in the
            diagnostic message printed when a DNS lookup fails.

    Returns:
        True when the address is verified, False otherwise -- including
        when either DNS lookup fails.
    """
    if ip_address in googlebot_whitelist_ip:
        return True

    try:
        hostname = socket.gethostbyaddr(ip_address)[0]
    except socket.error as err:
        print(f'Error on reverse DNS: {ip_address}, {log_path}, {err}')
        # Without a hostname there is nothing to check, so treat the address
        # as unverified instead of falling through to an unbound variable.
        return False

    # Anchored at both ends so a name like "x.googlebot.com.attacker.example"
    # is rejected; a trailing root dot is tolerated.
    exp = r'^[a-zA-Z0-9-]*\.google(bot)?\.com\.?$'
    if not re.search(exp, hostname):
        # spoofing google user agent
        return False

    try:
        # NOTE(review): gethostbyname() only resolves IPv4, so an IPv6 client
        # address will never match here -- confirm whether that matters.
        ip_lookup = socket.gethostbyname(hostname)
    except socket.error as err:
        print(f'Error on forward DNS: {ip_address}, {log_path}, {err}')
        return False

    if ip_address != ip_lookup:
        # ip in roundtrip DNS doesn't match
        return False

    googlebot_whitelist_ip.append(ip_address)
    return True
def filter_log_file(log_path, google_log_path):
    """Copy the verified-Google rows of a gzipped access log to a new gzip file.

    The input is read as tab-separated text. Its first line is echoed to
    stdout together with ``log_path``; its second line is split on spaces to
    locate the ``cs(User-Agent)`` and ``c-ip`` columns. Every following row
    whose user agent mentions "google" (in any letter case) and whose client
    IP passes ``verify_google`` is written, tab-joined, to the output file.
    The two header lines are not copied to the output.

    Args:
        log_path: Path of the gzipped input log.
        google_log_path: Path of the gzipped output file to create.

    Raises:
        StopIteration: If the input has fewer than two lines.
        ValueError: If the second line does not name both required columns.
    """
    # Both files are managed by the same `with` so the output gzip stream is
    # flushed and closed (trailer written) before the caller uploads it.
    with gzip.open(log_path, 'rt') as cloudfront_log, \
            gzip.open(google_log_path, 'wt') as google_log:
        logs = csv.reader(cloudfront_log, delimiter='\t')
        print(f'{log_path}: {next(logs)[0]}')
        fields = next(logs)[0].split(' ')
        # The first token of the fields line is a label, not a column name,
        # hence the -1 when turning a token position into a column index.
        user_agent_i = fields.index('cs(User-Agent)') - 1
        ip_i = fields.index('c-ip') - 1
        min_columns = max(user_agent_i, ip_i) + 1

        for log in logs:
            # Blank or truncated rows cannot be classified; skip them.
            if len(log) < min_columns:
                continue

            # quick filter out all non-google user agents
            if 'google' not in log[user_agent_i].lower():
                continue

            # check by IP address
            if verify_google(log[ip_i], log_path):
                google_log.write('\t'.join(log))
                google_log.write('\n')
def handler(event, context):
    """Lambda entry point: filter each newly created S3 log object.

    For every record in ``event['Records']`` the object is downloaded to
    ``/tmp``, filtered with ``filter_log_file``, and the result is uploaded
    to the same bucket under ``google/<filename>``, where ``<filename>`` is
    the last path component of the object key.

    Args:
        event: S3 notification event; each record must carry
            ``['s3']['bucket']['name']`` and ``['s3']['object']['key']``.
        context: Lambda context object (unused).
    """
    import os  # local import so this block does not depend on file-level imports

    for record in event['Records']:
        bucket = record['s3']['bucket']['name']
        key = unquote_plus(record['s3']['object']['key'])

        # Results are written to "google/" in the same bucket. Skipping that
        # prefix keeps the function from reprocessing its own output if the
        # bucket notification is not restricted to the source prefix.
        if key.startswith('google/'):
            continue

        # Last path component works for keys at any depth, including keys
        # with no "/" at all.
        filename = key.rsplit('/', 1)[-1]
        if not filename:
            # Key ends with "/" (folder placeholder): nothing to process.
            continue

        download_path = '/tmp/{}'.format(filename)
        upload_path = '/tmp/google-{}'.format(filename)
        try:
            s3_client.download_file(bucket, key, download_path)
            filter_log_file(download_path, upload_path)
            s3_client.upload_file(upload_path, bucket, f'google/{filename}')
        finally:
            # /tmp persists across warm invocations; remove scratch files so
            # they do not accumulate.
            for path in (download_path, upload_path):
                try:
                    os.remove(path)
                except FileNotFoundError:
                    pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment