@maiconbaum
Created December 28, 2022 22:03
AWS Glue Catalog Table for AWS VPC Flow Logs using Apache Hive Compatible Format and Partition Projection.
CREATE EXTERNAL TABLE IF NOT EXISTS aws_vpc_flow_logs (
  `version` int,
  `account_id` string,
  `interface_id` string,
  `srcaddr` string,
  `dstaddr` string,
  `srcport` int,
  `dstport` int,
  `protocol` bigint,
  `packets` bigint,
  `bytes` bigint,
  `start` bigint,
  `end` bigint,
  `action` string,
  `log_status` string,
  `vpc_id` string,
  `subnet_id` string,
  `instance_id` string,
  `tcp_flags` int,
  `type` string,
  `pkt_srcaddr` string,
  `pkt_dstaddr` string,
  `az_id` string,
  `sublocation_type` string,
  `sublocation_id` string,
  `pkt_src_aws_service` string,
  `pkt_dst_aws_service` string,
  `flow_direction` string,
  `traffic_path` int
)
PARTITIONED BY (
  `region` string,
  `year` string,
  `month` int,
  `day` int,
  `hour` int
)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe'
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
LOCATION
  's3://<BUCKET-NAME>/AWSLogs/aws-account-id=012345678912/aws-service=vpcflowlogs/'
TBLPROPERTIES (
  "projection.enabled" = "true",
  "projection.region.type" = "enum",
  "projection.region.values" = "sa-east-1,us-east-1",
  "projection.year.type" = "date",
  "projection.year.format" = "yyyy",
  "projection.year.range" = "2022,NOW",
  "projection.year.interval" = "1",
  "projection.year.unit" = "YEARS",
  "projection.month.type" = "integer",
  "projection.month.range" = "01,12",
  "projection.month.digits" = "2",
  "projection.day.type" = "integer",
  "projection.day.range" = "01,31",
  "projection.day.digits" = "2",
  "projection.hour.type" = "integer",
  "projection.hour.range" = "00,23",
  "projection.hour.digits" = "2",
  "storage.location.template" = "s3://<BUCKET-NAME>/AWSLogs/aws-account-id=012345678912/aws-service=vpcflowlogs/aws-region=${region}/year=${year}/month=${month}/day=${day}/hour=${hour}/"
)
@maiconbaum (Author)

To understand how I ended up here, start with this question in the AWS re:Post forum.

The Athena query above creates an AWS Glue Data Catalog table for VPC Flow Logs delivered in Apache Hive compatible format with per-hour partitions. Because it uses Athena's Partition Projection feature, Athena computes partition locations from the table properties at query time, so there is no need to run scheduled MSCK REPAIR TABLE or ALTER TABLE ADD PARTITION statements.
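For example, a query that filters on all of the projected partition keys reads only the objects under the matching hour's prefix; the values below are illustrative:

SELECT srcaddr, dstaddr, srcport, dstport, protocol, action, bytes
FROM aws_vpc_flow_logs
WHERE region = 'us-east-1'
  AND year = '2022'
  AND month = 12
  AND day = 28
  AND hour = 22
  AND action = 'REJECT'
LIMIT 100;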

The following is an example of creating the VPC Flow Log with Terraform:

resource "aws_flow_log" "aws_vpc" {
  log_destination      = "arn:aws:s3:::<BUCKET-NAME>"
  log_destination_type = "s3"
  traffic_type         = "ALL"
  vpc_id               = var.vpc_id
  log_format           = "$${version} $${account-id} $${action} $${interface-id} $${srcaddr} $${dstaddr} $${srcport} $${dstport} $${protocol} $${packets} $${bytes} $${start} $${end} $${log-status} $${vpc-id} $${subnet-id} $${instance-id} $${tcp-flags} $${type} $${pkt-srcaddr} $${pkt-dstaddr} $${region} $${az-id} $${sublocation-type} $${sublocation-id} $${pkt-src-aws-service} $${pkt-dst-aws-service} $${flow-direction} $${traffic-path}"
  tags                 = var.tags
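  # These options must match the table definition above:
  # Parquet files, Hive-compatible S3 prefixes, and hourly partitions
  # (compare with the storage.location.template in TBLPROPERTIES).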
  destination_options {
    file_format                = "parquet"
    hive_compatible_partitions = true
    per_hour_partition         = true
  }
}
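With hive_compatible_partitions and per_hour_partition enabled, the delivered objects land under keys that match the storage.location.template above. An illustrative object key follows; the file name is an assumed example of AWS's naming, not something you configure:

s3://<BUCKET-NAME>/AWSLogs/aws-account-id=012345678912/aws-service=vpcflowlogs/aws-region=us-east-1/year=2022/month=12/day=28/hour=22/012345678912_vpcflowlogs_us-east-1_fl-0123456789abcdef0_20221228T2200Z_a1b2c3d4.log.parquet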
