Skip to content

Instantly share code, notes, and snippets.

@bungoume bungoume/parser.py
Last active Mar 25, 2019

Embed
What would you like to do?
S3 log parser
from datetime import datetime, timezone
import os
import re
from urllib.parse import unquote_plus

import boto3
# Access-log line patterns.
#
# Every pattern below is compiled with re.VERBOSE, so the newlines that join
# the fragments are ignored by the regex engine; each real field separator is
# spelled out as \u0020 (a single space).

# Leading part of a record: one fragment per field.  The empty first/last
# pieces keep a newline at both ends of the pattern text.
RE_TEXT = "\n".join((
    "",
    r"^(?P<bucket_owner>\S+)\u0020",
    r"(?P<bucket>\S+)\u0020",
    r"\[(?P<datetime>[^\]]+)\]\u0020",           # bracketed timestamp
    r"(?P<client_ip>\S+)\u0020",
    r"(?P<requester>\S+)\u0020",
    r"(?P<request_id>[0-9A-F]{16})\u0020",
    r"(?P<operation>[A-Z0-9\.\_]+)\u0020",
    r"(?P<key>\S+)\u0020",
    r'\"(?P<request_uri>.*)\"\u0020',            # double-quoted
    r"(?P<http_status>([0-9]{3}|-))\u0020",
    r"(?P<error_code>([A-Za-z]+|-))\u0020",
    r"(?P<sent_bytes>([0-9]+|-))\u0020",
    r"(?P<object_size>([0-9]+|-))\u0020",
    r"(?P<total_time_ms>([0-9]+|-))\u0020",
    r"(?P<turn_around_time>([0-9]+|-))\u0020",
    r'\"(?P<referrer>.*)\"\u0020',               # double-quoted
    r'\"(?P<user_agent>.*)\"\u0020',             # double-quoted
    r"(?P<version_id>[A-Za-z0-9\_\.]+|-)",
    r"(\u0020(?P<undocumented_field>\S+))?",     # optional extra token
    "",
))

# Trailing part of a record; anything after host_header is captured as a
# single optional "new_fields" group.
RE_TEXT2 = "\n".join((
    "",
    r"\u0020",
    r"(?P<host_id>([A-Za-z0-9\/\+\=]+|-))\u0020",
    r"(?P<signature_version>(SigV2|SigV4|-))\u0020",
    r"(?P<cipher_suite>[A-Z0-9\-]+)\u0020",
    r"(?P<authentication_type>(AuthHeader|QueryString|-))\u0020",
    r"(?P<host_header>\S+)",
    r"(?P<new_fields>\u0020.*)?",
    "",
))

# Variant of RE_TEXT for records written without double quotes: the escaped
# quote characters are dropped and each ".*" becomes a single \S* token.
RE_TEXT_NOQUOTE = RE_TEXT.replace(r'\"', "").replace(".*", r"\S*")

# Leading part only.
RE_FORMAT_V3 = re.compile(RE_TEXT, re.VERBOSE)
# Unquoted leading part followed by the trailing part.
RE_FORMAT_V2 = re.compile(RE_TEXT_NOQUOTE + RE_TEXT2, re.VERBOSE)
# Quoted leading part followed by the trailing part.
RE_FORMAT_V4 = re.compile(RE_TEXT + RE_TEXT2, re.VERBOSE)

# Splits a request line into: method, path, "?query", query, remainder.
RE_REQUEST = re.compile(r"^([\w\-]+) ([^\s\?]*)(\?([^\s]*))? (.*)$")

# Named groups whose value is either a decimal number or "-".
NUMBER_FIELDS = [
    "sent_bytes",
    "total_time_ms",
    "http_status",
    "turn_around_time",
    "object_size",
]
def parse_line(line):
    """Parse one access-log line into a flat dict of fields.

    The line is tried against RE_FORMAT_V4, RE_FORMAT_V3 and RE_FORMAT_V2, in
    that order, and the first pattern that matches supplies the fields.

    Args:
        line: A single log record; trailing newline characters are ignored.

    Returns:
        A dict built from the matching pattern's named groups, adjusted so
        that:

        * "datetime" is replaced by "@timestamp", the same instant converted
          to UTC and formatted as ``YYYY-MM-DDTHH:MM:SSZ``;
        * every field in NUMBER_FIELDS is an ``int``, or is absent when the
          log contained "-";
        * "client_ip" is absent when the log contained "-";
        * "new_fields" is never present;
        * "method", "path", "query" and "protocol_version" are added when
          "request_uri" is not "-" and matches RE_REQUEST.

        Optional groups that did not take part in the match keep the value
        ``None``.

    Raises:
        ValueError: If the line matches none of the known formats, or if its
            timestamp does not have the ``%d/%b/%Y:%H:%M:%S %z`` layout.
    """
    line = line.rstrip("\n")
    m = (
        RE_FORMAT_V4.match(line)
        or RE_FORMAT_V3.match(line)
        or RE_FORMAT_V2.match(line)
    )
    if m is None:
        # Fail with a message that names the offending line instead of an
        # AttributeError on None further down.
        raise ValueError(f"Unrecognized S3 access log line: {line!r}")

    doc = m.groupdict()
    doc.pop("new_fields", None)

    dt = datetime.strptime(doc.pop("datetime"), "%d/%b/%Y:%H:%M:%S %z")
    # The output carries a literal "Z" suffix, so normalize to UTC first;
    # otherwise a non-zero offset in the log would yield a wrong instant.
    doc["@timestamp"] = dt.astimezone(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    for field in NUMBER_FIELDS:
        if doc[field] == "-":
            # "-" means "no value": drop the key rather than store a string.
            del doc[field]
        else:
            doc[field] = int(doc[field])

    if doc["client_ip"] == "-":
        del doc["client_ip"]

    if doc["request_uri"] != "-":
        request_line = RE_REQUEST.match(doc["request_uri"])
        if request_line is None:
            # Best effort: keep the raw request_uri and report the problem.
            print("Parse Error request: ", doc["request_uri"])
        else:
            doc["method"] = request_line.group(1)
            doc["path"] = request_line.group(2)
            doc["query"] = request_line.group(4) or ""
            doc["protocol_version"] = request_line.group(5)
    return doc
def handle(event, context):
    """Download the S3 object named in ``event`` and parse it line by line.

    Only the first entry of ``event["Records"]`` is read: its bucket name and
    its URL-decoded object key.  The object is downloaded to ``/tmp`` under
    the key's base name, every line is passed to ``parse_line``, and the
    temporary file is removed afterwards, even when parsing fails.

    Args:
        event: Mapping with ``Records[0].s3.bucket.name`` and
            ``Records[0].s3.object.key`` (presumably an S3 event
            notification delivered to a Lambda function; confirm against
            the deployment).
        context: Unused.
    """
    record = event["Records"][0]["s3"]
    bucket = record["bucket"]["name"]
    # The key is unquoted with unquote_plus, i.e. treated as URL-encoded.
    key = unquote_plus(record["object"]["key"])
    fn = os.path.basename(key)
    temp_filename = f"/tmp/{fn}"

    s3 = boto3.client("s3")
    s3.download_file(bucket, key, temp_filename)
    try:
        # errors="replace" keeps going on bytes that are not valid UTF-8.
        with open(temp_filename, encoding="utf-8", errors="replace") as f:
            for i, line in enumerate(f):
                doc = parse_line(line)
                # Placeholder kept from the original: handling of each parsed
                # record (and any use of the line index) goes here.
                ...
    finally:
        # Always free the temp file so a failed parse does not leak it.
        os.remove(temp_filename)
@bungoume

This comment has been minimized.

Copy link
Owner Author

bungoume commented Mar 25, 2019

2019/3/26 現在

  • 以下operationsはダブルクオートなし(RE_FORMAT_V2)
    • REST.COPY.OBJECT_GET
    • BATCH.DELETE.OBJECT
  • 以下operationsはRE_TEXT2の項目なし(RE_FORMAT_V3)
    • S3.TRANSITION.OBJECT
    • S3.EXPIRE.OBJECT
    • S3.TRANSITION_SIA.OBJECT
    • S3.CREATE.DELETEMARKER
  • その他の操作は全項目+new_fields(TLS Version) (RE_FORMAT_V4)

履歴

  • 2019/03/05頃より RE_TEXT2 の項目が追加される
  • 2019/03/20頃より new_fieldsとしてTLS Versionらしき項目追加
  • 2019/03/23頃より undocumented_fieldがログから(ほぼ)除外
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.