Skip to content

Instantly share code, notes, and snippets.

@huard
Created October 4, 2024 20:19
Show Gist options
  • Save huard/25ca5be3479f72546f748da54f7097e7 to your computer and use it in GitHub Desktop.
Save huard/25ca5be3479f72546f748da54f7097e7 to your computer and use it in GitHub Desktop.
NGINX download log parser and Prometheus endpoint
"""
Prometheus exporter counting the size of THREDDS downloads recorded in NGINX access logs.
D. Huard, 2023
"""
import itertools
import logging
import re
from pathlib import Path
from typing import Iterator

import pandas as pd
from prometheus_client import Counter
# Nginx access-log line pattern.
# Named groups, in order of appearance on the line:
#   remote_addr            dotted IPv4 address
#   remote_usr             "-" or a lower-case user name
#   date_time              bracketed timestamp, split into `date` (YYYY-MM-DD) and
#                          `time` (hh:mm:ss); anything after the seconds is ignored
#   request                quoted request, split into `req_method`, `req_uri`, `http_ver`
#   status                 three-digit HTTP status code
#   body_byte_sent         number of bytes sent (digits only, still a string here)
#   http_referer, user_agent, forward_for   three trailing quoted fields
# NOTE(review): the timestamp must look like [YYYY-MM-DDThh:mm:ss...]; lines using
# another time format will not match -- confirm against the server's log_format.
nginx = re.compile(r'(?P<remote_addr>(?:^|\b(?<!\.))(?:1?\d\d?|2[0-4]\d|25[0-5])(?:\.(?:1?\d\d?|2[0-4]\d|25[0-5])){3}('
r'?=$|[^\w.]))\s-\s(?P<remote_usr>-|[a-z_][a-z0-9_]{0,30})\s(?P<date_time>\[(?P<date>\d\d\d\d-\d\d-\d\d)T(?P<time>\d\d:\d\d:\d\d).*\])\s(?P<request>\"(?P<req_method>GET|POST|HEAD|PUT|DELETE|CONNECT|OPTIONS|TRACE|PATCH)\s(?P<req_uri>\/[^\s]*)\s(?P<http_ver>HTTP/\d\.\d)\")\s(?P<status>\d{3})\s(?P<body_byte_sent>\d+)\s\"(?P<http_referer>[^\s]+)\"\s\"(?P<user_agent>[^\"]+)\"\s\"(?P<forward_for>[^\"]+)\"')
# THREDDS request URI pattern, applied to the `req_uri` group of an nginx match.
# Named groups:
#   tds_service   one of dodsC, fileServer, ncss
#   dataset       path following the service name
#   variable      word characters immediately following the "?" of the query string
# NOTE(review): the leading `.+` needs at least one character before "/thredds/",
# so a URI that starts directly with "/thredds/" does not match -- presumably the
# service sits behind a path prefix; confirm.
# NOTE(review): the "?variable" part is mandatory, so requests without a query
# string (e.g. a plain fileServer download) do not match -- confirm this is intended.
thredds = re.compile(r".+\/thredds\/(?P<tds_service>dodsC|fileServer|ncss)\/(?P<dataset>[^\s]*)(?:\?(?P<variable>\w+))")
# Keys kept in each parsed entry. The order matters: every name except the last
# is used as a Prometheus label, and the last one (`body_byte_sent`) is the
# quantity being counted.
fields = ["remote_addr", "date", "tds_service", "dataset", "variable", "body_byte_sent"]
def parse(paths):
    """Lazily parse several nginx log files, one after the other.

    Each path is handed to `_parse` only when the previous file has been
    exhausted; the result is a single iterator over all entries.

    Every entry is a dictionary with the keys:

    - remote_addr
    - date
    - body_byte_sent
    - tds_service (dodsC | fileServer | ncss)
    - dataset (path to dataset)
    - variable (variable name)
    """
    # One generator of entries per log file, created on demand.
    per_file_entries = (_parse(log_path) for log_path in paths)
    return itertools.chain.from_iterable(per_file_entries)
def _parse(path: Path) -> dict:
"""Parse a single log file, returns a generator of dict."""
with open(path) as fh:
for line in fh:
# Parse the Nginx log
m = nginx.match(line)
if m:
out = m.groupdict()
out["body_byte_sent"] = int(out["body_byte_sent"])
# Parse THREDDS requests to extract access method, dataset and variable
mt = thredds.match(out['req_uri'])
if mt:
out.update(mt.groupdict())
# Return only fields of interest
yield {k: out[k] for k in fields}
else:
# TODO: Convert to a logger command
print(line)
# Module-level Prometheus counter accumulating the transferred data size.
# Its labels are every entry of `fields` except the last one
# (`body_byte_sent`), which is the quantity added to the counter.
c = Counter(
    "thredds_transfer_size_kb",
    "THREDDS data transferred",
    labelnames=fields[:-1],
    unit="kb",
)
def for_prometheus(log):
    """Add the transfer sizes found in one log file to the counter `c`.

    Every entry produced by `_parse(log)` increments the counter child
    selected by the entry's label values. The byte count is divided by 1024
    before being added. Nothing is returned.
    """
    for entry in _parse(log):
        size_kb = entry["body_byte_sent"] / 1024
        # Everything except the byte count identifies the labelled series.
        label_values = {key: value for key, value in entry.items() if key != "body_byte_sent"}
        c.labels(**label_values).inc(size_kb)
if __name__ == '__main__':
    import time

    from prometheus_client import start_http_server

    logs = Path("/home/david/Downloads/nginx").glob("access_file.log*")

    # Start up the server to expose the metrics.
    start_http_server(8000)

    # Ingest every log file exactly once. `Path.glob` returns a one-shot
    # iterator, so looping over it again would find nothing, and re-reading
    # the same files would count the same downloads twice.
    for log in logs:
        for_prometheus(log)

    # start_http_server returns immediately, so keep the process alive for
    # the metrics endpoint. Sleeping avoids the busy loop that an empty
    # `while True` body would turn into.
    while True:
        time.sleep(60)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment