@Mr0grog
Last active January 26, 2023 18:45
Summarize log files from EDGI Wayback imports
from datetime import timedelta
import dateutil.parser
from pathlib import Path
import re
START_LINE = re.compile(r'^\[([^\]]+)\] Starting Internet Archive Import')
END_LINE = re.compile(r'^\s*Internet Archive import completed at (.+)')
SUMMARY_START = re.compile(r'^\s*Loaded (\d+) CDX records:')
SUMMARY_ITEM = re.compile(r'^\s*(\d+)\s([\s\w\-]+)\s\(')
IMPORT_ERRORS = re.compile(r'^\s*Total:\s*(\d+)\serrors')
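# Quick sanity checks that the patterns match the log lines documented in
# summarize_log_file below (sample lines copied from those format notes):
assert START_LINE.match('[Sat Nov 14 03:53:31 UTC 2020] Starting Internet Archive Import')
assert SUMMARY_ITEM.match('  28491 successes (37.86%),').group(2) == 'successes'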
def cleanline(line):
    """Strip newlines, commas, and periods from the ends of a line."""
    return line.strip('\n,.')
def summarize_log_file(file):
    """Parse a single import log file and return a summary dict."""
    start_time = None
    end_time = None
    summary_lines = []
    import_errors = 0
    summary_lines_by_type = {'total': '', 'unknown errors': '', 'successes': ''}
    summary = {'total': 0, 'unknown errors': 0, 'successes': 0}
    ##
    # Logfiles start with a time:
    #   [Sat Nov 14 03:53:31 UTC 2020] Starting Internet Archive Import
    # Then have lots of logs, then a summary like:
    #   Loaded 75244 CDX records:
    #     28491 successes (37.86%),
    #     45655 could not be played back (60.68%),
    #     309 had no actual memento (0.41%),
    #     146 unknown errors (0.19%).
    # (Why did I put those commas and periods in there???)
    # Optionally followed by a list of import errors:
    #   Import job errors:
    #     77314: 3 errors ["Row 616: ..."]
    #     77315: 1 errors ["Row 409: ..."]
    #     77319: 2 errors ["Row 720: ..."]
    #   Total: 6 errors
    # And ending with a time:
    #   Internet Archive import completed at Sat Nov 14 10:31:06 UTC 2020
    mode = 'start'
    for line in file:
        if not start_time:
            start_match = START_LINE.match(line)
            if start_match:
                start_time = dateutil.parser.parse(start_match.group(1))
                mode = 'summary_search'
        elif mode == 'summary_search':
            start_match = SUMMARY_START.match(line)
            if start_match:
                summary_lines.append(cleanline(line))
                summary_lines_by_type['total'] = cleanline(line)
                summary['total'] = int(start_match.group(1))
                mode = 'summary'
        elif mode == 'summary':
            is_summary_line = SUMMARY_ITEM.match(line)
            if is_summary_line:
                summary_lines.append(cleanline(line))
                summary_type = is_summary_line.group(2)
                summary_lines_by_type[summary_type] = cleanline(line)
                summary[summary_type] = int(is_summary_line.group(1))
            else:
                mode = 'end'
        elif mode == 'end':
            total_match = IMPORT_ERRORS.match(line)
            if total_match:
                import_errors = int(total_match.group(1))
                continue
        ##
        # The completion line can appear as soon as the summary ends, so check
        # for it regardless of the current mode.
        end_match = END_LINE.match(line)
        if end_match:
            end_time = dateutil.parser.parse(end_match.group(1).strip())
            break
    ##
    if end_time and start_time:
        total_time = end_time - start_time
    else:
        total_time = timedelta(0)
    ##
    summary['time'] = total_time
    summary['lines'] = summary_lines
    summary['lines_by_type'] = summary_lines_by_type
    summary['import_errors'] = import_errors
    return summary
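# A quick sanity check of the parser against a made-up log; every count and
# timestamp below is invented for illustration, not taken from a real import:
from io import StringIO
_fake_log = StringIO(
    '[Sat Nov 14 03:53:31 UTC 2020] Starting Internet Archive Import\n'
    'Loaded 100 CDX records:\n'
    '  60 successes (60.00%),\n'
    '  40 unknown errors (40.00%).\n'
    'Internet Archive import completed at Sat Nov 14 04:00:00 UTC 2020\n'
)
_fake_summary = summarize_log_file(_fake_log)
assert _fake_summary['total'] == 100
assert _fake_summary['successes'] == 60
assert _fake_summary['time'] == timedelta(minutes=6, seconds=29)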
def summary_block(summary):
    """Format a complete summary as a single text block."""
    output_lines = [
        f'Time: {summary["time"]}',
        *summary["lines"],
        f'Import errors: {summary["import_errors"]}'
    ]
    return "\n".join(output_lines)
def summarize_dir(logdir):
    """Get summaries for each file in the directory."""
    files = [logfile for logfile in logdir.iterdir() if logfile.is_file()]
    files.sort()
    summaries = []
    for logfile in files:
        with logfile.open() as file:
            summaries.append((logfile, summarize_log_file(file)))
    return summaries
def table(headers, rows, delimiter=' '):
    """Print a nice table of data"""
    all_rows = [headers, *rows]
    sizes = [max(len(str(row[index])) for row in all_rows)
             for index in range(len(headers))]
    for row in all_rows:
        text = ''
        for index, value in enumerate(row):
            if index == 0:
                text += str(value).ljust(sizes[index])
            else:
                text += f'{delimiter}{str(value).rjust(sizes[index])}'
        print(text)
# A later revision of table() with optional Markdown output; it shadows the
# simpler definition above.
def table(headers, rows, delimiter=' ', markdown=False):
    """Print a nice table of data"""
    all_rows = [headers, *rows]
    sizes = [max(len(str(row[index])) for row in all_rows)
             for index in range(len(headers))]
    if markdown:
        delimiter = ' | '
        row = [((sizes[index] - 1) * '-') + (index == 0 and '-' or ':')
               for index, _ in enumerate(headers)]
        all_rows.insert(1, row)
    for row in all_rows:
        text = ''
        for index, value in enumerate(row):
            if index == 0:
                text += str(value).ljust(sizes[index])
            else:
                text += f'{delimiter}{str(value).rjust(sizes[index])}'
        if markdown:
            text = f'| {text} |'
        print(text)
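# A tiny demonstration of the Markdown mode, with made-up values:
table(['Name', 'Count'], [('a', 1), ('bb', 22)], markdown=True)
# Prints:
#   | Name | Count |
#   | ---- | ----: |
#   | a    |     1 |
#   | bb   |    22 |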
def normalized_bars(values, size=10):
    """Create a horizontal bar chart as a list of strings."""
    most = max(values) or 1  # Avoid dividing by zero when every value is 0.
    return [(round(size * x / most) * '█').ljust(size)
            for x in values]
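# A quick illustration with made-up values; bars scale to the largest value:
assert normalized_bars([1, 2, 4], size=4) == ['█   ', '██  ', '████']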
def summary_field(summary, key):
    """
    Get a nice string for one of the summary fields, e.g. '536 (3.21%)'.
    """
    total = summary['total']
    value = summary.get(key, 0)
    if total:
        ratio = 100 * value / total
        return f'{value} ({ratio:.2f}%)'
    else:
        return '-'
def percentage(summary, key):
    """Get a summary field as a percentage string, e.g. '3.21%'."""
    total = summary['total']
    value = summary.get(key, 0)
    if total:
        ratio = 100 * value / total
        return f'{ratio:.2f}%'
    else:
        return '-'
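# Both helpers fall back to '-' for empty logs (made-up example values):
assert summary_field({'total': 200, 'successes': 50}, 'successes') == '50 (25.00%)'
assert percentage({'total': 0}, 'successes') == '-'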
summaries = summarize_dir(Path('/var/log/cron-ia-import/'))

# Print summary blocks
for logfile, summary in summaries:
    print(logfile)
    print(summary_block(summary))
    print('')

# Print simple time summary
for logfile, summary in summaries:
    print(f'{logfile}: {summary["time"]}')
# Fancy table with bars and so on
total_bars = normalized_bars([s['total'] for f, s in summaries], 15)
time_bars = normalized_bars([s['time'].total_seconds() for f, s in summaries], 15)
table(['File', 'Time (bar)', 'Time', 'Total (bar)', 'Total', 'Skipped', 'Errors', 'No Playback', 'No Memento'],
      [(
          logfile.stem[7:],  # Keep just the date portion of the filename
          time_bars[index],
          summary['time'],
          total_bars[index],
          summary["total"],
          percentage(summary, 'skipped - already in DB'),
          percentage(summary, 'unknown errors'),
          percentage(summary, 'could not be played back'),
          percentage(summary, 'had no actual memento'),
      )
       for index, (logfile, summary) in enumerate(summaries)],
      markdown=True)
# Bar charts for everything!!!
# A) This is not really that helpful
# B) The way I made the percentage bars is too clever and hard to follow,
#    would not do again.
total_bars = normalized_bars([s['total'] for f, s in summaries], 15)
time_bars = normalized_bars([s['time'].total_seconds() for f, s in summaries], 15)
unknown_error_bars = [" ".join(item) for item in zip(
    [percentage(s, 'unknown errors') for f, s in summaries],
    normalized_bars([s.get('unknown errors', 0) for f, s in summaries])
)]
no_playback_bars = [" ".join(item) for item in zip(
    [percentage(s, 'could not be played back') for f, s in summaries],
    normalized_bars([s.get('could not be played back', 0) for f, s in summaries])
)]
no_memento_bars = [" ".join(item) for item in zip(
    [percentage(s, 'had no actual memento') for f, s in summaries],
    normalized_bars([s.get('had no actual memento', 0) for f, s in summaries])
)]
table(['File', 'Time (bar)', 'Time', 'Total (bar)', 'Total', 'Errors', 'No Playback', 'No Memento'],
      [(
          logfile.stem[7:],  # Keep just the date portion of the filename
          time_bars[index],
          summary['time'],
          total_bars[index],
          summary["total"],
          unknown_error_bars[index],  # percentage(summary, 'unknown errors'),
          no_playback_bars[index],  # percentage(summary, 'could not be played back'),
          no_memento_bars[index],  # percentage(summary, 'had no actual memento'),
      )
       for index, (logfile, summary) in enumerate(summaries)])