ssokolow/mass_diff.py

## mass_diff.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Simple tool to identify bad bits in a cartridge by comparing many dumps

--snip--

Requires numpy.

"""

# Silence Pylint/Flake8 complaints if run under Python 2
from __future__ import (absolute_import, division, print_function,
                        with_statement, unicode_literals)

__author__ = "Stephan Sokolow (deitarion/SSokolow)"
__appname__ = "Mass Binary Diff"
__version__ = "0.0pre0"
__license__ = "MIT"

import logging, sys
from itertools import groupby

import numpy

log = logging.getLogger(__name__)

def compare(paths, skip_head=0):
    """Identify bits which vary across multiple copies of the same file.

    The first file in ``paths`` is the reference.  Every other file is XORed
    against it and the per-byte results are ORed together, so a bit is set in
    a reported mask if any compared copy disagreed with the reference there.
    A file whose length (after ``skip_head``) differs from the reference's is
    skipped with a logged warning and is not counted.

    Arguments:
        paths: Sequence of paths to the dumps.  ``paths[0]`` is the
            reference, so at least one entry is required.
        skip_head: Number of leading bytes to drop from every file before
            comparing.  Reported offsets are relative to the first kept byte.

    Returns:
        A ``(bad_bytes, stats)`` tuple.  ``bad_bytes`` is a list of
        ``(offset, mask)`` pairs of plain ints in ascending offset order.
        ``stats`` is a dict with ``'file_count'`` (files actually used,
        including the reference) and ``'file_len'`` (bytes examined per file).
    """
    reference = numpy.fromfile(paths[0], numpy.uint8)[skip_head:]

    # One mask byte per examined byte, accumulated in place.  ORing each XOR
    # result into it merges the "which bits are bad" masks across all copies
    # without building and re-grouping a Python-level collection.
    unstable = numpy.zeros_like(reference)

    file_count = len(paths)
    for path in paths[1:]:
        candidate = numpy.fromfile(path, numpy.uint8)[skip_head:]

        if reference.size != candidate.size:
            log.warning("WARNING: Size mismatch (%s != %s). "
                        "Skipping %r...", reference.size, candidate.size, path)
            file_count -= 1
            continue

        unstable |= reference ^ candidate

    # nonzero() yields indices in ascending order for a 1-D array, and
    # tolist() converts numpy scalars to plain Python ints for the caller.
    offsets = numpy.nonzero(unstable)[0]
    bad_bytes = list(zip(offsets.tolist(), unstable[offsets].tolist()))

    return bad_bytes, {
        'file_count': file_count,
        'file_len': reference.size
    }

# TODO: This visualization helps to identify potential bad cells or noisy
#       data lines, but I should also make it multi-column to show patterns
#       in the addresses where problems are showing up.

def print_table(rows, stats):
    """Render the output of compare() for human inspection.

    Prints an "Offset | Bad Bits" table to stdout, one line per row, followed
    by summary lines for bytes examined, copies correlated and unstable bits.

    Arguments:
        rows: Sequence of ``(offset, mask)`` integer pairs.  It is traversed
            more than once, so it must not be a one-shot iterator.
        stats: Dict providing ``'file_len'`` and ``'file_count'``.

    Raises:
        ValueError: If any row's offset is not below ``stats['file_len']``.
    """
    file_len = stats['file_len']

    # The offset column is sized from the hex form of the file length, which
    # is an upper bound on every valid offset.
    col1_len = len(hex(file_len))

    # Sanity check that the rows belong to a file of this length.  It is
    # skipped when there are no rows so that a clean comparison (including one
    # of zero-length files) still gets its summary printed.
    if rows:
        max_offset = max(row[0] for row in rows)
        if max_offset >= file_len:
            raise ValueError(
                "Offset {:#x} lies outside the {} bytes examined".format(
                    max_offset, file_len))

    # Each set bit in a mask is one unstable bit.
    bad_bit_count = sum(bin(row[1]).count('1') for row in rows)

    stats_lines = [
        " {}k bytes examined ".format(file_len // 1024),
        " {} copies correlated ".format(stats['file_count']),
        " {} unstable bits found ".format(bad_bit_count)
    ]
    max_line_len = max(len(x) for x in stats_lines)

    # Pre-render a divider row matching the table's columns, stretched if
    # necessary so it also looks good above the longest of `stats_lines`.
    divider = "-{}-+----------".format("-" * col1_len)
    if len(divider) < max_line_len:
        divider += '-' * (max_line_len - len(divider))

    print(" {:>{col1_len}} | Bad Bits ".format("Offset", col1_len=col1_len))
    print(divider)
    for line in rows:
        print(" {:#{col1_len}x} | {:08b} ".format(*line, col1_len=col1_len))
    print(divider)
    print('\n'.join(stats_lines))

def main():
    """Command-line entry point, usable as a setuptools entry point.

    Parses the arguments, configures logging to match the requested
    verbosity, then runs compare() on the given paths and prints the result
    with print_table().  Exits with status 1 if fewer than two paths are given.
    """
    from argparse import ArgumentParser, RawDescriptionHelpFormatter

    # Only the part of the module docstring above the "--snip--" marker is
    # shown as the --help description.
    summary = __doc__.replace('\r\n', '\n').split('\n--snip--\n')[0]
    version_text = "%%(prog)s v%s" % __version__

    cli = ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
                         description=summary)
    cli.add_argument('--version', action='version', version=version_text)
    cli.add_argument(
        '-v', '--verbose', action="count", default=2,
        help="Increase the verbosity. Use twice for extra effect.")
    cli.add_argument(
        '-q', '--quiet', action="count", default=0,
        help="Decrease the verbosity. Use twice for extra effect.")
    cli.add_argument(
        '--skip-head', action="store", type=int, default=0,
        help="Specify the number of bytes at the beginning of the "
             "file to exclude from comparison. (eg. specify `16` "
             "to omit the iNES header when calculating offsets in "
             "an NES ROM.)")
    cli.add_argument(
        'path', action='store', nargs='+', default=[],
        help="Path to a ROM to be diffed")

    opts = cli.parse_args()

    # Map (verbose - quiet) onto a logging level, clamped to the valid
    # indices of the table below.  Logging goes to stderr.
    levels = [logging.CRITICAL, logging.ERROR, logging.WARNING,
              logging.INFO, logging.DEBUG]
    verbosity = max(min(opts.verbose - opts.quiet, len(levels) - 1), 0)
    logging.basicConfig(level=levels[verbosity],
                        format='%(levelname)s: %(message)s')

    if len(opts.path) < 2:
        log.critical("A minimum of two input files are required. Exiting.")
        sys.exit(1)

    rows, stats = compare(opts.path, skip_head=opts.skip_head)
    print_table(rows, stats)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

# vim: set sw=4 sts=4 expandtab :
	#!/usr/bin/env python3
	# -- coding: utf-8 --
	"""Simple tool to identify bad bits in a cartridge by comparing many dumps

	--snip--

	Requires numpy.

	"""

	# Silence Pylint/Flake8 complaints if run under Python 2
	from __future__ import (absolute_import, division, print_function,
	with_statement, unicode_literals)

	__author__ = "Stephan Sokolow (deitarion/SSokolow)"
	__appname__ = "Mass Binary Diff"
	__version__ = "0.0pre0"
	__license__ = "MIT"

	import logging, sys
	from itertools import groupby

	import numpy

	log = logging.getLogger(__name__)

	def compare(paths, skip_head=0):
	    """Identify bits which vary across multiple copies of the same file.

	    The first file in ``paths`` is the reference.  Every other file is
	    XORed against it and the per-byte results are ORed together, so a bit
	    is set in a reported mask if any compared copy disagreed with the
	    reference there.  A file whose length (after ``skip_head``) differs
	    from the reference's is skipped with a logged warning and not counted.

	    Arguments:
	        paths: Sequence of paths to the dumps.  ``paths[0]`` is the
	            reference, so at least one entry is required.
	        skip_head: Number of leading bytes to drop from every file before
	            comparing.  Reported offsets are relative to the first kept
	            byte.

	    Returns:
	        A ``(bad_bytes, stats)`` tuple.  ``bad_bytes`` is a list of
	        ``(offset, mask)`` pairs of plain ints in ascending offset order.
	        ``stats`` is a dict with ``'file_count'`` (files actually used,
	        including the reference) and ``'file_len'`` (bytes examined per
	        file).
	    """
	    reference = numpy.fromfile(paths[0], numpy.uint8)[skip_head:]

	    # One mask byte per examined byte, accumulated in place.  ORing each
	    # XOR result into it merges the masks across all copies.
	    unstable = numpy.zeros_like(reference)

	    file_count = len(paths)
	    for path in paths[1:]:
	        candidate = numpy.fromfile(path, numpy.uint8)[skip_head:]

	        if reference.size != candidate.size:
	            log.warning("WARNING: Size mismatch (%s != %s). "
	                        "Skipping %r...",
	                        reference.size, candidate.size, path)
	            file_count -= 1
	            continue

	        unstable |= reference ^ candidate

	    # nonzero() yields indices in ascending order for a 1-D array, and
	    # tolist() converts numpy scalars to plain Python ints.
	    offsets = numpy.nonzero(unstable)[0]
	    bad_bytes = list(zip(offsets.tolist(), unstable[offsets].tolist()))

	    return bad_bytes, {
	        'file_count': file_count,
	        'file_len': reference.size
	    }

	# TODO: This visualization helps to identify potential bad cells or noisy
	# data lines, but I should also make it multi-column to show patterns
	# in the addresses where problems are showing up.

	def print_table(rows, stats):
	    """Render the output of compare() for human inspection.

	    Prints an "Offset | Bad Bits" table to stdout, one line per row,
	    followed by summary lines for bytes examined, copies correlated and
	    unstable bits.

	    Arguments:
	        rows: Sequence of ``(offset, mask)`` integer pairs.  It is
	            traversed more than once, so it must not be a one-shot
	            iterator.
	        stats: Dict providing ``'file_len'`` and ``'file_count'``.

	    Raises:
	        ValueError: If any row's offset is not below
	            ``stats['file_len']``.
	    """
	    file_len = stats['file_len']

	    # The offset column is sized from the hex form of the file length,
	    # which is an upper bound on every valid offset.
	    col1_len = len(hex(file_len))

	    # Sanity check that the rows belong to a file of this length.  It is
	    # skipped when there are no rows so that a clean comparison (including
	    # one of zero-length files) still gets its summary printed.
	    if rows:
	        max_offset = max(row[0] for row in rows)
	        if max_offset >= file_len:
	            raise ValueError(
	                "Offset {:#x} lies outside the {} bytes examined".format(
	                    max_offset, file_len))

	    # Each set bit in a mask is one unstable bit.
	    bad_bit_count = sum(bin(row[1]).count('1') for row in rows)

	    stats_lines = [
	        " {}k bytes examined ".format(file_len // 1024),
	        " {} copies correlated ".format(stats['file_count']),
	        " {} unstable bits found ".format(bad_bit_count)
	    ]
	    max_line_len = max(len(x) for x in stats_lines)

	    # Pre-render a divider row matching the table's columns, stretched if
	    # necessary so it also looks good above the longest of `stats_lines`.
	    divider = "-{}-+----------".format("-" * col1_len)
	    if len(divider) < max_line_len:
	        divider += '-' * (max_line_len - len(divider))

	    print(" {:>{col1_len}} | Bad Bits ".format("Offset", col1_len=col1_len))
	    print(divider)
	    for line in rows:
	        print(" {:#{col1_len}x} | {:08b} ".format(*line, col1_len=col1_len))
	    print(divider)
	    print('\n'.join(stats_lines))

	def main():
	    """Command-line entry point, usable as a setuptools entry point.

	    Parses the arguments, configures logging to match the requested
	    verbosity, then runs compare() on the given paths and prints the
	    result with print_table().  Exits with status 1 if fewer than two
	    paths are given.
	    """
	    from argparse import ArgumentParser, RawDescriptionHelpFormatter
	    parser = ArgumentParser(
	        formatter_class=RawDescriptionHelpFormatter,
	        # Only the part of the module docstring above the "--snip--"
	        # marker is shown as the --help description.
	        description=__doc__.replace('\r\n', '\n').split('\n--snip--\n')[0])
	    parser.add_argument('--version', action='version',
	        version="%%(prog)s v%s" % __version__)
	    parser.add_argument('-v', '--verbose', action="count",
	        default=2, help="Increase the verbosity. Use twice for extra effect.")
	    parser.add_argument('-q', '--quiet', action="count",
	        default=0, help="Decrease the verbosity. Use twice for extra effect.")
	    parser.add_argument('--skip-head', action="store", type=int,
	        default=0, help="Specify the number of bytes at the beginning of the "
	                        "file to exclude from comparison. (eg. specify `16` "
	                        "to omit the iNES header when calculating offsets in "
	                        "an NES ROM.)")
	    parser.add_argument('path', action='store', nargs='+',
	        default=[], help="Path to a ROM to be diffed")

	    args = parser.parse_args()

	    # Set up clean logging to stderr.  (verbose - quiet) is clamped to the
	    # valid indices of the level table below.
	    log_levels = [logging.CRITICAL, logging.ERROR, logging.WARNING,
	                  logging.INFO, logging.DEBUG]
	    args.verbose = min(args.verbose - args.quiet, len(log_levels) - 1)
	    args.verbose = max(args.verbose, 0)
	    logging.basicConfig(level=log_levels[args.verbose],
	                        format='%(levelname)s: %(message)s')

	    if len(args.path) < 2:
	        log.critical("A minimum of two input files are required. Exiting.")
	        sys.exit(1)

	    bad_bits, stats = compare(args.path, skip_head=args.skip_head)
	    print_table(bad_bits, stats)

	# Run the command-line interface only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == '__main__':
	    main()

	# vim: set sw=4 sts=4 expandtab :