bungoume/parser.py

## parser.py
from datetime import datetime
import os
import re
from urllib.parse import unquote_plus

import boto3


# Pattern fragments for S3 server access log lines. Everything here is compiled
# with re.VERBOSE, so layout whitespace inside the patterns is ignored and a
# literal space has to be spelled \u0020.
#
# RE_TEXT covers the leading fields (bucket owner through version id) plus one
# optional trailing token. The request id accepts any 16 upper-case
# alphanumerics: request ids are not limited to hex digits (the sample id in
# the AWS documentation is 3E57427F3EXAMPLE), and a hex-only class made such
# lines match none of the formats below.
RE_TEXT = r"""
    ^(?P<bucket_owner>\S+)\u0020
    (?P<bucket>\S+)\u0020
    \[(?P<datetime>[^\]]+)\]\u0020
    (?P<client_ip>\S+)\u0020
    (?P<requester>\S+)\u0020
    (?P<request_id>[0-9A-Z]{16})\u0020
    (?P<operation>[A-Z0-9\.\_]+)\u0020
    (?P<key>\S+)\u0020
    \"(?P<request_uri>.*)\"\u0020
    (?P<http_status>([0-9]{3}|-))\u0020
    (?P<error_code>([A-Za-z]+|-))\u0020
    (?P<sent_bytes>([0-9]+|-))\u0020
    (?P<object_size>([0-9]+|-))\u0020
    (?P<total_time_ms>([0-9]+|-))\u0020
    (?P<turn_around_time>([0-9]+|-))\u0020
    \"(?P<referrer>.*)\"\u0020
    \"(?P<user_agent>.*)\"\u0020
    (?P<version_id>[A-Za-z0-9\_\.]+|-)
    (\u0020(?P<undocumented_field>\S+))?
    """
# RE_TEXT2 covers the fields that follow RE_TEXT in the longer formats; any
# text after the host header is captured as ``new_fields``.
RE_TEXT2 = r"""
    \u0020
    (?P<host_id>([A-Za-z0-9\/\+\=]+|-))\u0020
    (?P<signature_version>(SigV2|SigV4|-))\u0020
    (?P<cipher_suite>[A-Z0-9\-]+)\u0020
    (?P<authentication_type>(AuthHeader|QueryString|-))\u0020
    (?P<host_header>\S+)
    (?P<new_fields>\u0020.*)?
    """
# Variant of RE_TEXT without the double quotes around request URI, referrer
# and user agent: the escaped quotes are removed and each ``.*`` becomes
# ``\S*``, so those three fields cannot contain spaces.
RE_TEXT_NOQUOTE = RE_TEXT.replace("\\\"", "").replace(".*", r"\S*")

# Compiled line formats. RE_FORMAT_V3 has no end anchor, so it accepts any
# line that merely starts with the RE_TEXT fields.
RE_FORMAT_V4 = re.compile(RE_TEXT + RE_TEXT2, flags=re.VERBOSE)
RE_FORMAT_V3 = re.compile(RE_TEXT, flags=re.VERBOSE)
RE_FORMAT_V2 = re.compile(RE_TEXT_NOQUOTE + RE_TEXT2, flags=re.VERBOSE)


# Captured fields that hold integers. parse_line() converts each of them to
# int, or removes the key when the log recorded "-".
NUMBER_FIELDS = [
    "sent_bytes",
    "total_time_ms",
    "http_status",
    "turn_around_time",
    "object_size",
]
# Splits the quoted request line on single spaces:
#   group 1 - method (word characters and "-")
#   group 2 - path (everything up to whitespace or "?")
#   group 4 - query string without the leading "?" (None when absent)
#   group 5 - the remainder, stored by parse_line() as ``protocol_version``
RE_REQUEST = re.compile(r"^([\w\-]+) ([^\s\?]*)(\?([^\s]*))? (.*)$")


def parse_line(line):
    """Parse one S3 server access log line into a flat dict.

    The line is tried against RE_FORMAT_V4, RE_FORMAT_V3 and RE_FORMAT_V2, in
    that order; the named groups of the first match form the result, which is
    then post-processed:

    * ``new_fields`` is discarded.
    * ``datetime`` is replaced by ``@timestamp``, rendered in UTC as
      ``%Y-%m-%dT%H:%M:%SZ``.
    * Every key in NUMBER_FIELDS becomes an ``int``, or is removed when the
      log recorded ``-``.
    * ``client_ip`` is removed when it is ``-``.
    * ``request_uri`` (unless ``-``) is additionally split into ``method``,
      ``path``, ``query`` and ``protocol_version``; when it cannot be split a
      message is printed and those keys are simply absent.

    Args:
        line: A single log line; a trailing newline is ignored.

    Returns:
        dict mapping field names to ``str``/``int`` values (optional groups
        that did not participate in the match are ``None``).

    Raises:
        ValueError: If the line matches none of the known formats, or the
            timestamp is not in ``%d/%b/%Y:%H:%M:%S %z`` form.
    """
    # Function-scope import: the module header only imports ``datetime``.
    from datetime import timezone

    line = line.rstrip("\n")
    # Match objects are always truthy, so ``or`` picks the first format that
    # matched.
    m = (
        RE_FORMAT_V4.match(line)
        or RE_FORMAT_V3.match(line)
        or RE_FORMAT_V2.match(line)
    )
    if m is None:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError(f"Unrecognized S3 access log line: {line!r}")

    doc = m.groupdict()
    doc.pop("new_fields", None)

    dt = datetime.strptime(doc.pop("datetime"), "%d/%b/%Y:%H:%M:%S %z")
    # The "Z" suffix means UTC, so normalise first instead of stamping the
    # log's local wall-clock time with it.
    doc["@timestamp"] = dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    for field in NUMBER_FIELDS:
        raw = doc[field]
        if raw == "-":
            del doc[field]
        else:
            doc[field] = int(raw)

    if doc["client_ip"] == "-":
        del doc["client_ip"]

    if doc["request_uri"] != "-":
        request_line = RE_REQUEST.match(doc["request_uri"])
        if request_line is None:
            # Best effort: keep the raw request_uri and report the failure.
            print("Parse Error request: ", doc["request_uri"])
        else:
            doc["method"] = request_line.group(1)
            doc["path"] = request_line.group(2)
            doc["query"] = request_line.group(4) or ""
            doc["protocol_version"] = request_line.group(5)

    return doc


def handle(event, context):
    """Download the S3 object named in ``event`` and parse it line by line.

    Only the first entry of ``event["Records"]`` is read. The object is
    downloaded to ``/tmp/<basename of key>``, decoded as UTF-8 (undecodable
    bytes are replaced) and every line is passed to parse_line(). The
    temporary file is removed afterwards, also when the download or parsing
    fails.

    NOTE(review): the ``(event, context)`` signature and the record layout
    look like an AWS Lambda handler for S3 notifications - confirm.

    Args:
        event: Mapping with ``Records[0].s3.bucket.name`` and
            ``Records[0].s3.object.key`` (URL-encoded).
        context: Unused.
    """
    s3_info = event["Records"][0]["s3"]
    bucket = s3_info["bucket"]["name"]
    key = unquote_plus(s3_info["object"]["key"])
    fn = os.path.basename(key)
    temp_filename = f"/tmp/{fn}"
    s3 = boto3.client("s3")

    try:
        s3.download_file(bucket, key, temp_filename)
        with open(temp_filename, encoding="utf-8", errors="replace") as f:
            for i, line in enumerate(f):
                doc = parse_line(line)
                # Placeholder kept from the original: per-line handling of
                # ``doc`` (and ``i``) is not present in this file.
                ...
    finally:
        # Always clean up so a failed run does not leave the file behind; the
        # file may not exist if the download itself failed.
        if os.path.isfile(temp_filename):
            os.remove(temp_filename)
	from datetime import datetime
	import os
	import re
	from urllib.parse import unquote_plus

	import boto3


	# Pattern fragments for S3 server access log lines. Everything here is
	# compiled with re.VERBOSE, so layout whitespace inside the patterns is
	# ignored and a literal space has to be spelled \u0020.
	#
	# Alternations use a plain "|": an escaped "\|" would match a literal pipe
	# character and no real log line would ever match.
	#
	# The request id accepts any 16 upper-case alphanumerics; request ids are
	# not limited to hex digits (the AWS documentation's sample id is
	# 3E57427F3EXAMPLE).
	RE_TEXT = r"""
	    ^(?P<bucket_owner>\S+)\u0020
	    (?P<bucket>\S+)\u0020
	    \[(?P<datetime>[^\]]+)\]\u0020
	    (?P<client_ip>\S+)\u0020
	    (?P<requester>\S+)\u0020
	    (?P<request_id>[0-9A-Z]{16})\u0020
	    (?P<operation>[A-Z0-9\.\_]+)\u0020
	    (?P<key>\S+)\u0020
	    \"(?P<request_uri>.*)\"\u0020
	    (?P<http_status>([0-9]{3}|-))\u0020
	    (?P<error_code>([A-Za-z]+|-))\u0020
	    (?P<sent_bytes>([0-9]+|-))\u0020
	    (?P<object_size>([0-9]+|-))\u0020
	    (?P<total_time_ms>([0-9]+|-))\u0020
	    (?P<turn_around_time>([0-9]+|-))\u0020
	    \"(?P<referrer>.*)\"\u0020
	    \"(?P<user_agent>.*)\"\u0020
	    (?P<version_id>[A-Za-z0-9\_\.]+|-)
	    (\u0020(?P<undocumented_field>\S+))?
	    """
	# RE_TEXT2 covers the fields that follow RE_TEXT in the longer formats; any
	# text after the host header is captured as ``new_fields``.
	RE_TEXT2 = r"""
	    \u0020
	    (?P<host_id>([A-Za-z0-9\/\+\=]+|-))\u0020
	    (?P<signature_version>(SigV2|SigV4|-))\u0020
	    (?P<cipher_suite>[A-Z0-9\-]+)\u0020
	    (?P<authentication_type>(AuthHeader|QueryString|-))\u0020
	    (?P<host_header>\S+)
	    (?P<new_fields>\u0020.*)?
	    """
	# Variant of RE_TEXT without the double quotes around request URI, referrer
	# and user agent. Only the ``.*`` wildcards are rewritten to ``\S*``;
	# replacing every "." would also corrupt the escaped dots in the character
	# classes above.
	RE_TEXT_NOQUOTE = RE_TEXT.replace("\\\"", "").replace(".*", r"\S*")

	# Compiled line formats. RE_FORMAT_V3 has no end anchor, so it accepts any
	# line that merely starts with the RE_TEXT fields.
	RE_FORMAT_V4 = re.compile(RE_TEXT + RE_TEXT2, flags=re.VERBOSE)
	RE_FORMAT_V3 = re.compile(RE_TEXT, flags=re.VERBOSE)
	RE_FORMAT_V2 = re.compile(RE_TEXT_NOQUOTE + RE_TEXT2, flags=re.VERBOSE)


	# Captured fields that hold integers. parse_line() converts each of them to
	# int, or removes the key when the log recorded "-".
	NUMBER_FIELDS = [
	    "sent_bytes",
	    "total_time_ms",
	    "http_status",
	    "turn_around_time",
	    "object_size",
	]
	# Splits the quoted request line on single spaces:
	#   group 1 - method (word characters and "-")
	#   group 2 - path (everything up to whitespace or "?")
	#   group 4 - query string without the leading "?" (None when absent)
	#   group 5 - the remainder, stored by parse_line() as ``protocol_version``
	# Path and query need the ``*`` quantifier; without it each would match
	# exactly one character and ordinary requests would fail to parse.
	RE_REQUEST = re.compile(r"^([\w\-]+) ([^\s\?]*)(\?([^\s]*))? (.*)$")


	def parse_line(line):
	    """Parse one S3 server access log line into a flat dict.

	    The line is tried against RE_FORMAT_V4, RE_FORMAT_V3 and RE_FORMAT_V2,
	    in that order; the named groups of the first match form the result,
	    which is then post-processed:

	    * ``new_fields`` is discarded.
	    * ``datetime`` is replaced by ``@timestamp``, rendered in UTC as
	      ``%Y-%m-%dT%H:%M:%SZ``.
	    * Every key in NUMBER_FIELDS becomes an ``int``, or is removed when the
	      log recorded ``-``.
	    * ``client_ip`` is removed when it is ``-``.
	    * ``request_uri`` (unless ``-``) is additionally split into ``method``,
	      ``path``, ``query`` and ``protocol_version``; when it cannot be split
	      a message is printed and those keys are simply absent.

	    Args:
	        line: A single log line; a trailing newline is ignored.

	    Returns:
	        dict mapping field names to ``str``/``int`` values (optional groups
	        that did not participate in the match are ``None``).

	    Raises:
	        ValueError: If the line matches none of the known formats, or the
	            timestamp is not in ``%d/%b/%Y:%H:%M:%S %z`` form.
	    """
	    # Function-scope import: the module header only imports ``datetime``.
	    from datetime import timezone

	    line = line.rstrip("\n")
	    # Match objects are always truthy, so ``or`` picks the first format
	    # that matched.
	    m = (
	        RE_FORMAT_V4.match(line)
	        or RE_FORMAT_V3.match(line)
	        or RE_FORMAT_V2.match(line)
	    )
	    if m is None:
	        # Previously this surfaced as an AttributeError on None.
	        raise ValueError(f"Unrecognized S3 access log line: {line!r}")

	    doc = m.groupdict()
	    doc.pop("new_fields", None)

	    dt = datetime.strptime(doc.pop("datetime"), "%d/%b/%Y:%H:%M:%S %z")
	    # The "Z" suffix means UTC, so normalise first instead of stamping the
	    # log's local wall-clock time with it.
	    doc["@timestamp"] = dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

	    for field in NUMBER_FIELDS:
	        raw = doc[field]
	        if raw == "-":
	            del doc[field]
	        else:
	            doc[field] = int(raw)

	    if doc["client_ip"] == "-":
	        del doc["client_ip"]

	    if doc["request_uri"] != "-":
	        request_line = RE_REQUEST.match(doc["request_uri"])
	        if request_line is None:
	            # Best effort: keep the raw request_uri and report the failure.
	            print("Parse Error request: ", doc["request_uri"])
	        else:
	            doc["method"] = request_line.group(1)
	            doc["path"] = request_line.group(2)
	            doc["query"] = request_line.group(4) or ""
	            doc["protocol_version"] = request_line.group(5)

	    return doc


	def handle(event, context):
	    """Download the S3 object named in ``event`` and parse it line by line.

	    Only the first entry of ``event["Records"]`` is read. The object is
	    downloaded to ``/tmp/<basename of key>``, decoded as UTF-8 (undecodable
	    bytes are replaced) and every line is passed to parse_line(). The
	    temporary file is removed afterwards, also when the download or
	    parsing fails.

	    NOTE(review): the ``(event, context)`` signature and the record layout
	    look like an AWS Lambda handler for S3 notifications - confirm.

	    Args:
	        event: Mapping with ``Records[0].s3.bucket.name`` and
	            ``Records[0].s3.object.key`` (URL-encoded).
	        context: Unused.
	    """
	    s3_info = event["Records"][0]["s3"]
	    bucket = s3_info["bucket"]["name"]
	    key = unquote_plus(s3_info["object"]["key"])
	    fn = os.path.basename(key)
	    temp_filename = f"/tmp/{fn}"
	    s3 = boto3.client("s3")

	    try:
	        s3.download_file(bucket, key, temp_filename)
	        with open(temp_filename, encoding="utf-8", errors="replace") as f:
	            for i, line in enumerate(f):
	                doc = parse_line(line)
	                # Placeholder kept from the original: per-line handling of
	                # ``doc`` (and ``i``) is not present in this file.
	                ...
	    finally:
	        # Always clean up so a failed run does not leave the file behind;
	        # the file may not exist if the download itself failed.
	        if os.path.isfile(temp_filename):
	            os.remove(temp_filename)