Skip to content

Instantly share code, notes, and snippets.

@AwsGeek
Last active January 10, 2021 18:45
Show Gist options
  • Save AwsGeek/b5c3d258d53a54b5ed4ca6032a74ede3 to your computer and use it in GitHub Desktop.
Save AwsGeek/b5c3d258d53a54b5ed4ca6032a74ede3 to your computer and use it in GitHub Desktop.
Parse S3 logs and send to Google Analytics using the Google Analytics Measurement Protocol
from __future__ import print_function
import re
import urllib
import boto3
from botocore.vendored import requests
# Module-level S3 client, created once at import time and shared by every
# call to lambda_handler.
s3 = boto3.client('s3')
# Regex for one line of an S3 server access log.  Fields are space separated;
# the timestamp is wrapped in [...] and the request line, referrer and user
# agent are wrapped in double quotes.  The pattern is meant to be used with
# ``pattern.match`` so any additional trailing fields are simply ignored.
#
# - ``time`` uses ``[^\]]*`` so it can never run past the closing bracket of
#   the timestamp, even if a later field (e.g. the user agent) contains ']'.
# - ``version`` uses ``\S+`` so the whole version-id token is captured, not
#   just its first character.
pattern = re.compile(
    r'(?P<owner>\S+) (?P<bucket>\S+) \[(?P<time>[^\]]*)\] (?P<ip>\S+) '
    r'(?P<requester>\S+) (?P<reqid>\S+) (?P<operation>\S+) (?P<key>\S+) '
    r'(?P<request>"[^"]*") (?P<status>\S+) (?P<error>\S+) (?P<bytes>\S+) '
    r'(?P<size>\S+) (?P<totaltime>\S+) (?P<turnaround>\S+) '
    r'(?P<referrer>"[^"]*") (?P<useragent>"[^"]*") (?P<version>\S+)'
)
# Parse S3 logs and send to Google Analytics using the Measurement Protocol
# https://developers.google.com/analytics/devguides/collection/protocol/v1/
def lambda_handler(event, context):
    """Forward website hits found in S3 access-log objects to Google Analytics.

    Every record of the S3 notification ``event`` is handled: the referenced
    object is fetched and, if it is ``text/plain``, each ``WEBSITE.GET.OBJECT``
    log line in it is reported as a ``pageview`` hit.

    Args:
        event: S3 event notification dict.  Each entry of ``event['Records']``
            must provide ``['s3']['bucket']['name']`` and
            ``['s3']['object']['key']``.
        context: Lambda context object (unused).

    Returns:
        None.  This is a best-effort forwarder: errors are printed and
        swallowed rather than raised, one record at a time, so a single bad
        object does not prevent the remaining records from being processed.
    """
    # A bare ``import urllib`` does not guarantee that the ``urllib.parse``
    # submodule is loaded, so import what is needed explicitly.
    from urllib.parse import unquote_plus

    try:
        records = event['Records']
    except (KeyError, TypeError) as e:
        print("Exception: %s" % (e))
        return

    for record in records:
        try:
            s3_info = record['s3']
            bucket = s3_info['bucket']['name']
            # Object keys arrive URL-encoded in S3 event notifications
            # (e.g. spaces as '+'); decode before calling get_object.
            key = unquote_plus(s3_info['object']['key'])
            _forward_log_object(bucket, key)
        except Exception as e:
            print("Exception: %s" % (e))


def _forward_log_object(bucket, key):
    """Fetch one log object from S3 and send a pageview per website GET line."""
    from urllib.parse import quote, urlencode

    s3_response = s3.get_object(Bucket=bucket, Key=key)
    if s3_response['ContentType'] != 'text/plain':
        return

    log = s3_response['Body'].read().decode('utf-8')
    for line in log.splitlines():
        line = line.strip()
        if line.startswith('#'):
            continue
        match = pattern.match(line)
        if not match:
            continue
        if match.group('operation') != 'WEBSITE.GET.OBJECT':
            continue

        ip = match.group('ip')
        page = match.group('key')
        # NOTE: 'UA-XXXXX-Y' looks like a placeholder tracking ID; replace it
        # with the real property ID before deploying.
        # The client IP doubles as the client id ('cid').
        params = {'v': 1, 'tid': 'UA-XXXXX-Y', 'uip': ip, 'cid': ip,
                  't': 'pageview', 'dp': page}
        # Pre-encode with quote() so spaces become %20 instead of '+'.
        query = urlencode(params, quote_via=quote)
        # A timeout keeps a stalled connection from hanging the invocation.
        requests.get("https://www.google-analytics.com/collect",
                     params=query, timeout=10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment