Skip to content

Instantly share code, notes, and snippets.

@eldondevcg
Created January 25, 2017 21:53
Show Gist options
  • Save eldondevcg/a5e1dac06f3f206807df636bb0287d21 to your computer and use it in GitHub Desktop.
Save eldondevcg/a5e1dac06f3f206807df636bb0287d21 to your computer and use it in GitHub Desktop.
Breaking out the data from bucket logs for Amazon Athena
CREATE EXTERNAL TABLE default.bucket_logs (
`bucketowner` string,
`bucket` string,
`datetime` string,
`sourceip` string,
`requestor_id` string,
`request_id` string,
`operation` string,
`key` string,
`http_line` string,
`response_code` string,
`s3_error` string,
`bytes_sent` string,
`object_size` string,
`total_time` string,
`turnaround_time` string,
`referrer` string,
`user_agent` string,
`rest` string
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
'serialization.format' = '1',
'input.regex' = '([^ ]*) ([^ ]*) ([^\]]*\]) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ("[^"]*") ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*) ("[^"]*") (.*)'
) LOCATION 's3://my-bucket-log-bucket/logs/'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment