Skip to content

Instantly share code, notes, and snippets.

@frosforever
Last active February 21, 2024 14:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save frosforever/a86aff5961c631b391fb059448da1a40 to your computer and use it in GitHub Desktop.
Save frosforever/a86aff5961c631b391fb059448da1a40 to your computer and use it in GitHub Desktop.
Reading ALB logs using Spark
// Loads AWS Application Load Balancer access logs from S3 as raw text and splits every
// line into named columns. The regex pattern and the column names are taken from
// https://docs.aws.amazon.com/athena/latest/ug/application-load-balancer-logs.html
val raw = spark.read.text("s3://some_s3_path/albname/AWSLogs/accountId/elasticloadbalancing/region/year/month/day/")
// One capture group per output column (33 in total); kept on a single line so it can be
// compared directly against the pattern published in the page linked above.
val regex = """([^ ]*) ([^ ]*) ([^ ]*) ([^ ]*):([0-9]*) ([^ ]*)[:-]([0-9]*) ([-.0-9]*) ([-.0-9]*) ([-.0-9]*) (|[-0-9]*) (-|[-0-9]*) ([-0-9]*) ([-0-9]*) \"([^ ]*) (.*) (- |[^ ]*)\" \"([^\"]*)\" ([A-Z0-9-_]+) ([A-Za-z0-9.-]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^\"]*)\" ([-.0-9]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\" \"([^ ]*)\" \"([^\s]+?)\" \"([^\s]+)\" \"([^ ]*)\" \"([^ ]*)\""""
val albLogs = {
  // Output column name paired with the type it is cast to; `None` keeps the extracted
  // string as-is. Order is significant: the n-th entry is read from capture group n of
  // `regex`, so new fields must be inserted at the position of their capture group.
  val fields: Seq[(String, Option[String])] = Seq(
    "type"                     -> None,
    "time"                     -> None,
    "elb"                      -> None,
    "client_ip"                -> None,
    "client_port"              -> Some("int"),
    "target_ip"                -> None,
    "target_port"              -> Some("int"),
    "request_processing_time"  -> Some("double"),
    "target_processing_time"   -> Some("double"),
    "response_processing_time" -> Some("double"),
    "elb_status_code"          -> Some("int"),
    "target_status_code"       -> None,
    "received_bytes"           -> Some("bigint"),
    "sent_bytes"               -> Some("bigint"),
    "request_verb"             -> None,
    "request_url"              -> None,
    "request_proto"            -> None,
    "user_agent"               -> None,
    "ssl_cipher"               -> None,
    "ssl_protocol"             -> None,
    "target_group_arn"         -> None,
    "trace_id"                 -> None,
    "domain_name"              -> None,
    "chosen_cert_arn"          -> None,
    "matched_rule_priority"    -> None,
    "request_creation_time"    -> None,
    "actions_executed"         -> None,
    "redirect_url"             -> None,
    "lambda_error_reason"      -> None,
    "target_port_list"         -> None,
    "target_status_code_list"  -> None,
    "classification"           -> None,
    "classification_reason"    -> None
  )

  // Build one projection per field: extract the matching group, cast when a type is
  // given, then alias it to the field name.
  val projections = fields.zipWithIndex.map { case ((fieldName, sqlType), position) =>
    // Capture groups are 1-based, zipWithIndex is 0-based.
    val extracted = regexp_extract($"value", regex, position + 1)
    val typed = sqlType match {
      case Some(targetType) => extracted.cast(targetType)
      case None             => extracted
    }
    typed.as(fieldName)
  }

  raw.select(projections: _*)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment