Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
s3logs_analysis via Athena
/* Create a database in Athena */
CREATE DATABASE s3_AccessLogsDB;
/*
Create a table within the Athena database
Replace s3://MY_S3_BUCKET/ with your S3 bucket
which has the access logs you want to query
*/
CREATE EXTERNAL TABLE IF NOT EXISTS s3_AccessLogs.Accesslogs(
BucketOwner string,
Bucket string,
RequestDateTime string,
RemoteIP string,
Requester string,
RequestID string,
Operation string,
Key string,
RequestURI_operation string,
RequestURI_key string,
RequestURI_httpProtoversion string,
HTTPstatus string,
ErrorCode string,
BytesSent string,
ObjectSize string,
TotalTime string,
TurnAroundTime string,
Referrer string,
UserAgent string,
VersionId string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1',
'input.regex' = '([^ ]*) ([^ ]*) \\[(.*?)\\] ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) \\\"([^ ]*) ([^ ]*) (- |[^ ]*)\\\" (-|[0-9]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) (\"[^\"]*\") ([^ ]*)$'
) LOCATION 's3://MY_S3_BUCKET/'
/*
Query for access outside of AWS and your VPC
Replace 172.31% with your VPC CIDR
*/
SELECT Key,
UserAgent,
RemoteIp,
count(*) AS cnt
FROM Accesslogs
WHERE regexp_like(RequestURI_operation, 'GET|HEAD')
AND Requester LIKE '-'
AND NOT regexp_like(UserAgent, 'Elastic|aws')
AND RemoteIp NOT LIKE '172.31%'
GROUP BY RemoteIp, Key, UserAgent, RemoteIp
ORDER BY cnt DESC LIMIT 10
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.