Skip to content

Instantly share code, notes, and snippets.

@Tatsh
Created March 7, 2014 20:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Tatsh/9419392 to your computer and use it in GitHub Desktop.
Save Tatsh/9419392 to your computer and use it in GitHub Desktop.
Read a typical Apache access log and parse it into a consumable YAML format.
#!/usr/bin/env python
import re
import sys
import yaml
# Pattern for one line of an Apache access log laid out as:
#   ip ... [dd/Mon/yyyy:hh:mm:ss +zzzz] "METHOD path HTTP/x.y" status size "referer" "user agent"
# Compiled with re.X, so whitespace and `#` comments inside the pattern are
# ignored by the regex engine.
# Only GET and POST requests are recognised; a line with any other method
# does not match at all.
# Quoted fields accept backslash escapes, so an escaped quote inside the
# referer or user agent does not terminate the field early.
line_prog = re.compile(r"""^(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})  # IPv4 address
    [^\]]+                                          # Skipped fields before the timestamp (typically the identd and user columns)
    \[(?P<date>\d{1,2}/[^/]+/\d{4}):                # Date (day/month/year)
    (?P<time>(?:\d{2}:){2}\d{2}\s+[-+]\d{1,4})\]\s  # Time of day plus zone offset
    "(?P<method>GET|POST)\s+                        # Method
    (?P<path>\S+)\s+                                # Path
    HTTP/\d+\.\d+\s*"\s+                            # HTTP version (the dot is literal)
    (?P<status>\d+)\s+                              # Status
    (?P<size>\d+|-)\s+                              # Size (can be - for nothing)
    "(?P<referer>(?:[^"\\]|\\.)*)"\s+               # HTTP referer (backslash escapes allowed)
    "(?P<user_agent>(?:[^"\\]|\\.)*)"(\s+)?         # User agent (backslash escapes allowed)
""", re.X)
def stream_lines(handle):
    """Yield every non-blank line of *handle* with surrounding whitespace removed.

    Reading continues until the handle is exhausted. Blank or
    whitespace-only lines are skipped rather than treated as end of input,
    so a stray empty line in the middle of a log does not cut parsing short.

    Args:
        handle: An iterable of lines, such as an open text file.

    Yields:
        Each line stripped of leading and trailing whitespace; never an
        empty string.
    """
    for raw_line in handle:
        line = raw_line.strip()
        if line:
            yield line
def _collect_entries(lines):
    """Parse log lines into a list of dicts, one per unique successful request.

    Each line is matched against ``line_prog``. Diagnostics for every
    rejected line are written to stderr; nothing is raised for bad input.

    A line is dropped when:
      * it does not match ``line_prog`` (the raw line is echoed),
      * its method is not one of the tracked methods,
      * its status is not 200,
      * the same path was already recorded for the same method.

    Args:
        lines: Iterable of log lines (already stripped of newlines).

    Returns:
        A list of the ``line_prog`` named groups as dicts, with ``status``
        converted to int and ``size`` converted to int (``-`` becomes 0).
    """
    entries = []
    # Paths already recorded, per method. Sets keep the duplicate check O(1).
    seen_paths = {
        'GET': set(),
        'POST': set(),
    }
    # enumerate keeps the counter in step with the input even when a line is
    # skipped; it counts the lines handed to this function, starting at 1.
    for line_no, line in enumerate(lines, start=1):
        match = line_prog.match(line)
        if match is None:
            print(line, file=sys.stderr)
            continue

        entry = match.groupdict()
        entry['status'] = int(entry['status'])
        method = entry['method']

        # line_prog as written only captures GET or POST, so this guard only
        # fires if the pattern is widened to accept other methods.
        if method not in seen_paths:
            print('Interesting line: %s' % (line,), file=sys.stderr)
            continue

        if entry['status'] != 200:
            print('Skipping line %d due to status %d' % (line_no, entry['status']), file=sys.stderr)
            continue

        path = entry['path']
        if path in seen_paths[method]:
            print('Already logged %s (%s)' % (path, method,), file=sys.stderr)
            continue

        # A size of "-" means no body was sent; store it as 0.
        entry['size'] = 0 if entry['size'] == '-' else int(entry['size'])

        entries.append(entry)
        seen_paths[method].add(path)
    return entries


if __name__ == '__main__':
    if len(sys.argv) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit('Usage: %s ACCESS_LOG' % (sys.argv[0],))

    with open(sys.argv[1]) as f:
        matches = _collect_entries(stream_lines(f))

    print(yaml.dump(matches, indent=4, default_flow_style=False, width=9999))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment