Created
December 28, 2022 22:03
-
-
Save maiconbaum/0a2a98901e0577262de918843fe94a3a to your computer and use it in GitHub Desktop.
AWS Glue Catalog Table for AWS VPC Flow Logs using Apache Hive Compatible Format and Partition Projection.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE EXTERNAL TABLE IF NOT EXISTS aws_vpc_flow_logs ( | |
`version` int, | |
`account_id` string, | |
`interface_id` string, | |
`srcaddr` string, | |
`dstaddr` string, | |
`srcport` int, | |
`dstport` int, | |
`protocol` bigint, | |
`packets` bigint, | |
`bytes` bigint, | |
`start` bigint, | |
`end` bigint, | |
`action` string, | |
`log_status` string, | |
`vpc_id` string, | |
`subnet_id` string, | |
`instance_id` string, | |
`tcp_flags` int, | |
`type` string, | |
`pkt_srcaddr` string, | |
`pkt_dstaddr` string, | |
`az_id` string, | |
`sublocation_type` string, | |
`sublocation_id` string, | |
`pkt_src_aws_service` string, | |
`pkt_dst_aws_service` string, | |
`flow_direction` string, | |
`traffic_path` int | |
) | |
PARTITIONED BY ( | |
`region` string, | |
`year` string, | |
`month` int, | |
`day` int, | |
`hour` int | |
) | |
ROW FORMAT SERDE | |
'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' | |
STORED AS INPUTFORMAT | |
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' | |
OUTPUTFORMAT | |
'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' | |
LOCATION | |
's3://<BUCKET-NAME>/AWSLogs/aws-account-id=012345678912/aws-service=vpcflowlogs/' | |
TBLPROPERTIES ( | |
"skip.header.line.count"="1", | |
"projection.enabled" = "true", | |
"projection.region.type" = "enum", | |
"projection.region.values" = "sa-east-1,us-east-1", | |
"projection.year.type" = "date", | |
"projection.year.format" = "yyyy", | |
"projection.year.range" = "2022,NOW", | |
"projection.year.interval" = "1", | |
"projection.year.unit" = "YEARS", | |
"projection.month.type" = "integer", | |
"projection.month.range" = "01,12", | |
"projection.month.digits" = "2", | |
"projection.day.type" = "integer", | |
"projection.day.range" = "01,31", | |
"projection.day.digits" = "2", | |
"projection.hour.type" = "integer", | |
"projection.hour.range" = "00,23", | |
"projection.hour.digits" = "2", | |
"storage.location.template" = "s3://<BUCKET-NAME>/AWSLogs/aws-account-id=012345678912/aws-service=vpcflowlogs/aws-region=${region}/year=${year}/month=${month}/day=${day}/hour=${hour}/" | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
So, it's easier to understand how did I ended up here looking at this question in re:Post forum.
The above AWS Athena Query creates a table in AWS Glue Catalog for AWS VPC Flow Logs using Apache Hive Compatible Format (with per-hour partition) and using the AWS Athena Partition Projection feature in order to avoid running scheduled
MSCK REPAIR TABLE
orALTER TABLE ADD PARTITION
.Following is an example of the AWS VPC Flow Logs creation using Terraform: