Skip to content

Instantly share code, notes, and snippets.

@sergray
Created September 10, 2012 11:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sergray/3690384 to your computer and use it in GitHub Desktop.
Save sergray/3690384 to your computer and use it in GitHub Desktop.
Script for combining access logs and outputting their records sorted by time
#!/usr/bin/env python
"""
Combines provided access logs and outputs records sorted by
time of record.
Time of the record is expected to be the fourth element in the
record and must have +0000 timezone.
"""
import logging
from datetime import datetime
# strptime() pattern applied to the bracketed timestamp field; the "+0000"
# offset is matched literally, so only that timezone is accepted.
TIME_FORMAT = '%d/%b/%Y:%H:%M:%S +0000'

# Zero-based position of the timestamp within a parsed access log record.
TIME_IDX = 3

# Opening delimiter of a multi-word field -> the character that closes it.
NEXT_CHAR = {'[': ']', '"': '"'}
def parse_record(log_record):
    """Returns parts of log_record string as tuple.

    The record is split on single spaces.  A field opened with ``[`` or
    ``"`` runs until the next token ending with the matching ``]`` or ``"``;
    the surrounding delimiters are stripped and the inner spaces are kept.
    A bracketed field spanning several tokens (the timestamp) is converted
    to ``datetime`` using ``TIME_FORMAT``; every other field stays a string.
    Runs of spaces between fields are treated as a single separator and a
    trailing line terminator is ignored.

    Does not make any validation of provided log_record: a delimited field
    that is never closed is silently dropped, and ``ValueError`` from
    ``datetime.strptime`` propagates when a multi-token bracketed field does
    not match ``TIME_FORMAT``.
    """
    parts = []
    acc = []  # tokens of the delimited field currently being collected

    # Strip the line terminator first: otherwise it hides the closing
    # delimiter of the last field and that field is lost.
    for part in log_record.rstrip('\r\n').split(' '):
        if acc:
            # Already inside a delimited field.  This must be checked before
            # looking for an opening delimiter so that tokens such as "[en]"
            # inside a quoted user agent stay part of the quoted text.
            acc.append(part)
            opener = acc[0][0]
            if part.endswith(NEXT_CHAR[opener]):
                val = ' '.join(acc)[1:-1]
                if opener == '[':
                    val = datetime.strptime(val, TIME_FORMAT)
                parts.append(val)
                acc = []
        elif not part:
            # Empty token produced by consecutive spaces (or an empty line).
            continue
        elif part[0] in NEXT_CHAR:
            # A one-character token cannot be both opener and closer, so a
            # lone '"' starts a field instead of yielding an empty one.
            if len(part) > 1 and part.endswith(NEXT_CHAR[part[0]]):
                parts.append(part[1:-1])
            else:
                acc.append(part)
        else:
            parts.append(part)
    return tuple(parts)
def merged(*sources):
    """Generator of merged access log records sorted by time.

    Accepts file-like objects (any iterables of log lines) as positional
    arguments.  Yields tuples with index of the source and its line, the
    line being stripped of its trailing newline.

    Only the current head record of each source is compared, so the output
    is fully time-ordered only when every source is itself in time order.
    Records with equal timestamps keep coming from the source that is
    currently being drained before another source is selected.  Once a
    single source is left its remaining lines are passed through without
    being parsed.

    Errors raised while parsing a record (for example ``IndexError`` when a
    record has fewer than ``TIME_IDX + 1`` fields) propagate to the caller.
    """
    iterators = [iter(src) for src in sources]

    # One entry per non-exhausted source: (timestamp, source index, raw line).
    # Storing the raw line inside the entry keeps it tied to its source even
    # when earlier sources are empty and contribute no entry.
    heads = []
    for idx, lines in enumerate(iterators):
        try:
            line = next(lines)
        except StopIteration:
            continue  # empty source contributes nothing
        heads.append((parse_record(line)[TIME_IDX], idx, line))

    while len(heads) > 1:
        # list.sort is stable, so sources with equal timestamps keep their
        # relative order.
        heads.sort(key=lambda entry: entry[0])
        stamp, idx, line = heads[0]
        limit = heads[1][0]
        lines = iterators[idx]
        # Drain the earliest source until it passes the runner-up.
        while stamp <= limit:
            yield idx, line.rstrip('\n')
            try:
                line = next(lines)
            except StopIteration:
                del heads[0]
                break
            stamp = parse_record(line)[TIME_IDX]
        else:
            # Source moved past the runner-up: remember its new head.
            heads[0] = (stamp, idx, line)

    if heads:
        # Last remaining source: emit its buffered head first (it has been
        # read but not yet yielded), then stream the rest unparsed.
        _, idx, line = heads[0]
        yield idx, line.rstrip('\n')
        for line in iterators[idx]:
            yield idx, line.rstrip('\n')
if __name__ == '__main__':
    # Command line usage: pass access log paths as arguments; the merged
    # records are printed as "<path>:<record>".
    import sys

    sources = []
    # opened_paths[i] is the path of sources[i].  Unreadable paths are
    # skipped, so indexes reported by merged() must not be looked up in the
    # raw argument list.
    opened_paths = []
    paths = sys.argv[1:]
    for path in paths:
        try:
            fobj = open(path)
        except IOError as exc:
            logging.error('Ignore %s: %s', path, exc)
        else:
            sources.append(fobj)
            opened_paths.append(path)
    try:
        merged_logs = merged(*sources)
        for idx, rec in merged_logs:
            print("%s:%s" % (opened_paths[idx], rec))
    finally:
        # Release the file handles even if merging or printing fails.
        for fobj in sources:
            fobj.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment