@Mr0grog
Last active January 26, 2023 18:45
Summarize log files from EDGI Wayback imports
from datetime import timedelta
import dateutil.parser
from pathlib import Path
import re
START_LINE = re.compile(r'^\[([^\]]+)\] Starting Internet Archive Import')
END_LINE = re.compile(r'^\s*Internet Archive import completed at (.+)')
SUMMARY_START = re.compile(r'^\s*Loaded (\d+) CDX records:')
SUMMARY_ITEM = re.compile(r'^\s*(\d+)\s([\s\w\-]+)\s\(')
IMPORT_ERRORS = re.compile(r'^\s*Total:\s*(\d+)\serrors')
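# Quick sanity checks that the patterns match the log lines documented in
# summarize_log_file below (sample lines copied from those format notes):
assert START_LINE.match('[Sat Nov 14 03:53:31 UTC 2020] Starting Internet Archive Import')
assert SUMMARY_ITEM.match('  28491 successes (37.86%),').group(2) == 'successes'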
def cleanline(line):
    """Strip newlines, commas, and periods from the ends of a line."""
    return line.strip('\n,.')
def summarize_log_file(file):
    """Parse a single import log file and return a summary dict."""
    start_time = None
    end_time = None
    summary_lines = []
    import_errors = 0
    summary_lines_by_type = {'total': '', 'unknown errors': '', 'successes': ''}
    summary = {'total': 0, 'unknown errors': 0, 'successes': 0}
    ##
    # Logfiles start with a time:
    #   [Sat Nov 14 03:53:31 UTC 2020] Starting Internet Archive Import
    # Then have lots of logs, then a summary like:
    #   Loaded 75244 CDX records:
    #     28491 successes (37.86%),
    #     45655 could not be played back (60.68%),
    #     309 had no actual memento (0.41%),
    #     146 unknown errors (0.19%).
    # (Why did I put those commas and periods in there???)
    # Optionally followed by a list of import errors:
    #   Import job errors:
    #     77314: 3 errors ["Row 616: ..."]
    #     77315: 1 errors ["Row 409: ..."]
    #     77319: 2 errors ["Row 720: ..."]
    #   Total: 6 errors
    # And ending with a time:
    #   Internet Archive import completed at Sat Nov 14 10:31:06 UTC 2020
    mode = 'start'
    for line in file:
        if not start_time:
            start_match = START_LINE.match(line)
            if start_match:
                start_time = dateutil.parser.parse(start_match.group(1))
                mode = 'summary_search'
        elif mode == 'summary_search':
            start_match = SUMMARY_START.match(line)
            if start_match:
                summary_lines.append(cleanline(line))
                summary_lines_by_type['total'] = cleanline(line)
                summary['total'] = int(start_match.group(1))
                mode = 'summary'
        elif mode == 'summary':
            is_summary_line = SUMMARY_ITEM.match(line)
            if is_summary_line:
                summary_lines.append(cleanline(line))
                summary_type = is_summary_line.group(2)
                summary_lines_by_type[summary_type] = cleanline(line)
                summary[summary_type] = int(is_summary_line.group(1))
            else:
                mode = 'end'
        elif mode == 'end':
            total_match = IMPORT_ERRORS.match(line)
            if total_match:
                import_errors = int(total_match.group(1))
                continue
        ##
        # The completion line can appear as soon as the summary ends, so check
        # for it regardless of the current mode.
        end_match = END_LINE.match(line)
        if end_match:
            end_time = dateutil.parser.parse(end_match.group(1).strip())
            break
    ##
    if end_time and start_time:
        total_time = end_time - start_time
    else:
        total_time = timedelta(0)
    ##
    summary['time'] = total_time
    summary['lines'] = summary_lines
    summary['lines_by_type'] = summary_lines_by_type
    summary['import_errors'] = import_errors
    return summary
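# A quick sanity check of the parser against a made-up log; every count and
# timestamp below is invented for illustration, not taken from a real import:
from io import StringIO
_fake_log = StringIO(
    '[Sat Nov 14 03:53:31 UTC 2020] Starting Internet Archive Import\n'
    'Loaded 100 CDX records:\n'
    '  60 successes (60.00%),\n'
    '  40 unknown errors (40.00%).\n'
    'Internet Archive import completed at Sat Nov 14 04:00:00 UTC 2020\n'
)
_fake_summary = summarize_log_file(_fake_log)
assert _fake_summary['total'] == 100
assert _fake_summary['successes'] == 60
assert _fake_summary['time'] == timedelta(minutes=6, seconds=29)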
def summary_block(summary):
    """Format a complete summary as a single text block."""
    output_lines = [
        f'Time: {summary["time"]}',
        *summary["lines"],
        f'Import errors: {summary["import_errors"]}'
    ]
    return "\n".join(output_lines)
def summarize_dir(logdir):
    """Get summaries for each file in the directory."""
    files = [logfile for logfile in logdir.iterdir() if logfile.is_file()]
    files.sort()
    summaries = []
    for logfile in files:
        with logfile.open() as file:
            summaries.append((logfile, summarize_log_file(file)))
    return summaries
def table(headers, rows, delimiter=' '):
    """Print a nice table of data"""
    all_rows = [headers, *rows]
    sizes = [max(len(str(row[index])) for row in all_rows)
             for index in range(len(headers))]
    for row in all_rows:
        text = ''
        for index, value in enumerate(row):
            if index == 0:
                text += str(value).ljust(sizes[index])
            else:
                text += f'{delimiter}{str(value).rjust(sizes[index])}'
        print(text)
# A later revision of table() with optional Markdown output; it shadows the
# simpler definition above.
def table(headers, rows, delimiter=' ', markdown=False):
    """Print a nice table of data"""
    all_rows = [headers, *rows]
    sizes = [max(len(str(row[index])) for row in all_rows)
             for index in range(len(headers))]
    if markdown:
        delimiter = ' | '
        row = [((sizes[index] - 1) * '-') + (index == 0 and '-' or ':')
               for index, _ in enumerate(headers)]
        all_rows.insert(1, row)
    for row in all_rows:
        text = ''
        for index, value in enumerate(row):
            if index == 0:
                text += str(value).ljust(sizes[index])
            else:
                text += f'{delimiter}{str(value).rjust(sizes[index])}'
        if markdown:
            text = f'| {text} |'
        print(text)
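# A tiny demonstration of the Markdown mode, with made-up values:
table(['Name', 'Count'], [('a', 1), ('bb', 22)], markdown=True)
# Prints:
#   | Name | Count |
#   | ---- | ----: |
#   | a    |     1 |
#   | bb   |    22 |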
def normalized_bars(values, size=10):
    """Create a horizontal bar chart as a list of strings."""
    most = max(values) or 1  # Avoid dividing by zero when every value is 0.
    return [(round(size * x / most) * '█').ljust(size)
            for x in values]
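# A quick illustration with made-up values; bars scale to the largest value:
assert normalized_bars([1, 2, 4], size=4) == ['█   ', '██  ', '████']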
def summary_field(summary, key):
    """
    Get a nice string for one of the summary fields, e.g. '536 (3.21%)'.
    """
    total = summary['total']
    value = summary.get(key, 0)
    if total:
        ratio = 100 * value / total
        return f'{value} ({ratio:.2f}%)'
    else:
        return '-'
def percentage(summary, key):
    """Get a summary field as a percentage string, e.g. '3.21%'."""
    total = summary['total']
    value = summary.get(key, 0)
    if total:
        ratio = 100 * value / total
        return f'{ratio:.2f}%'
    else:
        return '-'
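# Both helpers fall back to '-' for empty logs (made-up example values):
assert summary_field({'total': 200, 'successes': 50}, 'successes') == '50 (25.00%)'
assert percentage({'total': 0}, 'successes') == '-'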
summaries = summarize_dir(Path('/var/log/cron-ia-import/'))

# Print summary blocks
for logfile, summary in summaries:
    print(logfile)
    print(summary_block(summary))
    print('')

# Print simple time summary
for logfile, summary in summaries:
    print(f'{logfile}: {summary["time"]}')
# Fancy table with bars and so on
total_bars = normalized_bars([s['total'] for f, s in summaries], 15)
time_bars = normalized_bars([s['time'].total_seconds() for f, s in summaries], 15)
table(['File', 'Time (bar)', 'Time', 'Total (bar)', 'Total', 'Skipped', 'Errors', 'No Playback', 'No Memento'],
      [(
          logfile.stem[7:],  # Keep just the date portion of the filename
          time_bars[index],
          summary['time'],
          total_bars[index],
          summary["total"],
          percentage(summary, 'skipped - already in DB'),
          percentage(summary, 'unknown errors'),
          percentage(summary, 'could not be played back'),
          percentage(summary, 'had no actual memento'),
      )
       for index, (logfile, summary) in enumerate(summaries)],
      markdown=True)
# Bar charts for everything!!!
# A) This is not really that helpful
# B) The way I made the percentage bars is too clever and hard to follow,
#    would not do again.
total_bars = normalized_bars([s['total'] for f, s in summaries], 15)
time_bars = normalized_bars([s['time'].total_seconds() for f, s in summaries], 15)
unknown_error_bars = [" ".join(item) for item in zip(
    [percentage(s, 'unknown errors') for f, s in summaries],
    normalized_bars([s.get('unknown errors', 0) for f, s in summaries])
)]
no_playback_bars = [" ".join(item) for item in zip(
    [percentage(s, 'could not be played back') for f, s in summaries],
    normalized_bars([s.get('could not be played back', 0) for f, s in summaries])
)]
no_memento_bars = [" ".join(item) for item in zip(
    [percentage(s, 'had no actual memento') for f, s in summaries],
    normalized_bars([s.get('had no actual memento', 0) for f, s in summaries])
)]
table(['File', 'Time (bar)', 'Time', 'Total (bar)', 'Total', 'Errors', 'No Playback', 'No Memento'],
      [(
          logfile.stem[7:],  # Keep just the date portion of the filename
          time_bars[index],
          summary['time'],
          total_bars[index],
          summary["total"],
          unknown_error_bars[index],  # percentage(summary, 'unknown errors'),
          no_playback_bars[index],  # percentage(summary, 'could not be played back'),
          no_memento_bars[index],  # percentage(summary, 'had no actual memento'),
      )
       for index, (logfile, summary) in enumerate(summaries)])