Created
October 4, 2024 20:19
-
-
Save huard/25ca5be3479f72546f748da54f7097e7 to your computer and use it in GitHub Desktop.
NGINX download log parser and Prometheus endpoint
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Prometheus server counting THREDDS download sizes parsed from NGINX access logs.
D. Huard, 2023 | |
""" | |
import itertools
import logging
import re
from pathlib import Path
from typing import Iterator

import pandas as pd
from prometheus_client import Counter
# NGINX access-log line pattern, assembled from one raw-string piece per field.
# The pieces concatenate to a single regular expression.
nginx = re.compile(
    # Dotted-quad client address, delimited so that it is not matched inside a
    # longer dotted or alphanumeric token.
    r'(?P<remote_addr>(?:^|\b(?<!\.))(?:1?\d\d?|2[0-4]\d|25[0-5])'
    r'(?:\.(?:1?\d\d?|2[0-4]\d|25[0-5])){3}(?=$|[^\w.]))'
    # " - " separator, then the remote user: "-" or a lowercase identifier.
    r'\s-\s(?P<remote_usr>-|[a-z_][a-z0-9_]{0,30})'
    # Bracketed timestamp of the form [YYYY-MM-DDTHH:MM:SS...]; the date and
    # time parts are also captured on their own.
    r'\s(?P<date_time>\[(?P<date>\d\d\d\d-\d\d-\d\d)T(?P<time>\d\d:\d\d:\d\d).*\])'
    # Quoted request line: method, URI and HTTP version.
    r'\s(?P<request>\"(?P<req_method>GET|POST|HEAD|PUT|DELETE|CONNECT|OPTIONS|TRACE|PATCH)'
    r'\s(?P<req_uri>\/[^\s]*)\s(?P<http_ver>HTTP/\d\.\d)\")'
    # Three-digit status code and number of body bytes sent.
    r'\s(?P<status>\d{3})\s(?P<body_byte_sent>\d+)'
    # Three quoted trailing fields: referer, user agent, forwarded-for.
    r'\s\"(?P<http_referer>[^\s]+)\"\s\"(?P<user_agent>[^\"]+)\"\s\"(?P<forward_for>[^\"]+)\"'
)

# THREDDS request URI pattern. It needs at least one character before
# "/thredds/", one of the three listed services, and a "?<variable>" query
# after the dataset path; URIs without a query do not match.
thredds = re.compile(
    r".+\/thredds\/"
    r"(?P<tds_service>dodsC|fileServer|ncss)\/"
    r"(?P<dataset>[^\s]*)"
    r"(?:\?(?P<variable>\w+))"
)

# Columns of interest to keep. "body_byte_sent" must stay last: the counter's
# label names are taken as fields[:-1].
fields = [
    "remote_addr",
    "date",
    "tds_service",
    "dataset",
    "variable",
    "body_byte_sent",
]
def parse(paths):
    """Parse several nginx log files lazily, as one stream of entries.

    The files are read one after the other, in the order given by `paths`.
    Each entry is a dictionary with the keys:

    - remote_addr
    - date
    - body_byte_sent
    - tds_service (dodsC | fileServer | ncss)
    - dataset (path to dataset)
    - variable (variable name)
    """
    # One generator per file; nothing is opened until the result is iterated.
    per_file = (_parse(log_path) for log_path in paths)
    return itertools.chain.from_iterable(per_file)
def _parse(path: Path) -> Iterator[dict]:
    """Parse a single nginx log file, yielding one dict per THREDDS request.

    A line produces an entry only when it matches the ``nginx`` pattern and
    its request URI matches the ``thredds`` pattern. The entry holds the keys
    listed in ``fields``, with ``body_byte_sent`` converted to ``int``.

    Lines that the ``nginx`` pattern cannot parse are reported as warnings
    through the ``logging`` module. Lines that parse but are not THREDDS
    requests are skipped silently.

    :param path: log file to read.
    :return: generator of dictionaries, one per matching line.
    """
    log = logging.getLogger(__name__)

    # Decode explicitly and replace undecodable bytes, so that a single odd
    # byte in a request does not abort the whole file.
    with open(path, encoding="utf-8", errors="replace") as fh:
        for line in fh:
            # Parse the Nginx log
            m = nginx.match(line)
            if m is None:
                log.warning("Could not parse log line: %s", line.rstrip("\n"))
                continue

            out = m.groupdict()
            out["body_byte_sent"] = int(out["body_byte_sent"])

            # Parse THREDDS requests to extract access method, dataset and variable
            mt = thredds.match(out["req_uri"])
            if mt is None:
                continue
            out.update(mt.groupdict())

            # Return only fields of interest
            yield {k: out[k] for k in fields}
# Prometheus counter accumulating the amount of data transferred.
# Its labels are every entry of `fields` except the last one (the byte
# count), which is the quantity being accumulated rather than a label.
c = Counter(
    "thredds_transfer_size_kb",
    "THREDDS data transferred",
    labelnames=fields[:-1],
    unit="kb",
)
def for_prometheus(log):
    """Add the transfers recorded in one log file to the Prometheus counter.

    Every entry parsed from `log` increments the module-level counter `c` by
    its body size divided by 1024, under the labels given by the entry's
    remaining fields. Nothing is returned.
    """
    for entry in _parse(log):
        # Remove the size from the entry so that only label values remain.
        kilobytes = entry.pop("body_byte_sent") / 1024
        c.labels(**entry).inc(kilobytes)
if __name__ == '__main__':
    import time

    from prometheus_client import start_http_server

    logs = Path("/home/david/Downloads/nginx").glob("access_file.log*")

    # Start up the server to expose the metrics.
    start_http_server(8000)

    # Ingest every log file exactly once. `glob` returns a one-shot generator,
    # so looping over it again would do nothing, and re-reading the same files
    # would count the same transfers twice.
    for log in logs:
        for_prometheus(log)

    # Keep the process (and the metrics server it hosts) alive without
    # spinning the CPU.
    while True:
        time.sleep(60)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment