Skip to content

Instantly share code, notes, and snippets.

@aodin
Last active April 28, 2024 09:10
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aodin/d774d92019288447dc48a05966fbe2b7 to your computer and use it in GitHub Desktop.
Save aodin/d774d92019288447dc48a05966fbe2b7 to your computer and use it in GitHub Desktop.
Parse an Nginx access.log file into a Pandas DataFrame
"""
Parse an Nginx access.log file into a Pandas DataFrame. Also works with gzipped files.
"""
import argparse
import pathlib
import pandas as pd
# strptime-style layout of the timestamps this script parses,
# e.g. "28/Apr/2024:09:10:00 +0000".
DATE_FORMAT = r"%d/%b/%Y:%H:%M:%S %z"

# Command-line interface: one positional argument naming the log file,
# converted to a pathlib.Path by argparse.
parser = argparse.ArgumentParser()
parser.add_argument(
    "path",
    help="path to access.log file",
    type=pathlib.Path,
)
def parse_nginx_timestamp(value):
    """Convert timestamp text to pandas datetimes using ``DATE_FORMAT``.

    ``value`` is handed straight to ``pd.to_datetime``. ``exact=False``
    lets the format match anywhere inside the text, so surrounding
    characters (such as enclosing brackets) do not have to be stripped
    first.
    """
    parsed = pd.to_datetime(value, exact=False, format=DATE_FORMAT)
    return parsed
def remove_quotes(value):
    """Strip leading and trailing double quotes from ``value``.

    Falsy inputs (empty string, ``None``) are returned unchanged.
    """
    if not value:
        return value
    return value.strip('"')
def main(log_file):
    """Print the number of requests per day and per IP found in a log file.

    The log is loaded into a DataFrame with the columns ``ip``, ``time``,
    ``request``, ``status``, ``size``, ``referer`` and ``agent``; the
    summary is written to stdout, newest day and highest count first.

    Args:
        log_file: Location of the access log; passed unchanged to
            ``pandas.read_csv``.
    """
    df = pd.read_csv(
        log_file,
        # Split on whitespace that is neither inside a double-quoted
        # string nor inside the [bracketed] timestamp.
        sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
        engine="python",
        # Fields 1 and 2 of each line are skipped.
        usecols=[0, 3, 4, 5, 6, 7, 8],
        names=["ip", "time", "request", "status", "size", "referer", "agent"],
        header=None,
        # The regex separator keeps the surrounding quotes on these
        # fields, so strip them while loading.
        converters={
            "request": remove_quotes,
            "referer": remove_quotes,
            "agent": remove_quotes,
        },
    )

    # Convert the timestamps after loading instead of using read_csv's
    # ``date_parser`` argument, which is deprecated since pandas 2.0.
    # Converting the column directly behaves the same on old and new
    # pandas releases.
    df["time"] = parse_nginx_timestamp(df["time"])

    # Aggregate views by day and IP
    summary = df.groupby([df["time"].dt.date, df["ip"]]).agg({"status": "count"})
    summary = summary.sort_values(by=["time", "status"], ascending=False)
    print(summary.to_string())
if __name__ == "__main__":
    # Script entry point: take the log path from the command line and
    # print its summary.
    main(parser.parse_args().path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment