Skip to content

Instantly share code, notes, and snippets.

@MarkRoddy
Last active February 28, 2024 18:42
Show Gist options
  • Star 16 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MarkRoddy/563b9194f5d196545eef4a59c913d8a6 to your computer and use it in GitHub Desktop.
Save MarkRoddy/563b9194f5d196545eef4a59c913d8a6 to your computer and use it in GitHub Desktop.
DuckDB: Query S3 Access Logs
/*
Usage: you'll want to search for the strings <bucket> and <prefix>, and insert the S3 bucket where your access
logs are being delivered. Use (or delete) <prefix> to filter to a subset of your logs.
*/
/*
These commented out configuration settings you can either run yourself in the REPL and source this file using
`.read parse_s3_access_logs.sql`, or you can uncomment them and supply values for yourself.
*/
-- install https;
-- load https;
-- SET s3_region='us-west-2';
-- SET s3_access_key_id='';
-- SET s3_secret_access_key='';
WITH parsed_logs AS (
SELECT
regexp_extract(col1, '^([0-9a-zA-Z]+)\s+([a-z0-9.\-]+)\s+\[([0-9/A-Za-z: +]+)\] ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ([^ ]+) ("[^"]*"|-) ([^ ]+) ([^ ]+) (\d+|-) (\d+|-) (\d+|-) (\d+|-) ("[^"]*"|-) ("[^"]*"|-) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+) (\S+)(.*)$',
[
'bucket_owner', 'bucket', 'timestamp', 'remote_ip', 'request', 'request_id',
'operation', 's3_key', 'request_uri', 'http_status', 's3_errorcode', 'bytes_sent',
'object_size', 'total_time', 'turn_around_time', 'referer', 'user_agent',
'version_id', 'host_id', 'sigver', 'cyphersuite', 'auth_type', 'host_header',
'tls_version', 'access_point_arn', 'acl_required', 'extra'
]) AS log_struct
FROM
-- Trick the CSV reader into reading as a single column
read_csv(
's3://<bucket>/<prefix>/*',
columns={'col1': 'VARCHAR'},
-- Use a *hopefully* nonsensical deliminator, so no ',' chars screw us up
delim='\0'
)
)
SELECT
-- Grab everything from the struct that we want as strings, exclude stuff we'll coersce to diff types
log_struct.* exclude (timestamp, bytes_sent, object_size, total_time, turn_around_time),
strptime(log_struct.timestamp, '%d/%b/%Y:%H:%M:%S %z') AS timestamp,
CASE
WHEN log_struct.bytes_sent = '-' THEN NULL
ELSE CAST(log_struct.bytes_sent AS INTEGER)
END AS bytes_sent,
CASE
WHEN log_struct.object_size = '-' THEN NULL
ELSE CAST(log_struct.object_size AS INTEGER)
END AS object_size,
CASE
WHEN log_struct.total_time = '-' THEN NULL
ELSE CAST(log_struct.total_time AS INTEGER)
END AS total_time,
CASE
WHEN log_struct.turn_around_time = '-' THEN NULL
ELSE CAST(log_struct.turn_around_time AS INTEGER)
END AS turn_around_time
FROM parsed_logs;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment