Created
March 3, 2019 16:57
-
-
Save risicle/b9548ce4b5552b1542e39c475cc5f8e9 to your computer and use it in GitHub Desktop.
Collection of utility functions to implement rudimentary template-matching based OCR using OpenCV, intended for use with large, uniform blocks of monospaced, non-language-correlated text (effectively resulting in a "character grid") where a 100.0% accuracy is required. This could be useful if one, for example, found themselves needing to recover a hardcopy-printed backup of an OpenPGP key.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Collection of utility functions to implement rudimentary template-matching based OCR using OpenCV, | |
intended for use with large, uniform blocks of monospaced, non-language-correlated text (effectively | |
resulting in a "character grid") where a 100.0% accuracy is required. This could be useful if one, for | |
example, found themselves needing to recover a hardcopy-printed backup of an OpenPGP key. | |
Modern general-purpose OCR software doesn't tend to do that well with these, relying heavily on | |
language-context-based guessing, often trying to be too clever in layout auto-detection, all the while | |
being extremely tricky to configure precisely enough to allow us to impart our existing knowledge | |
about the constraints of the target text (expected alphabet, line length, expected font...). | |
""" | |
from itertools import chain, product | |
import cv2 | |
import numpy as np | |
# | |
# following numpy convention, coordinates and dimensions are specified in (y, x) order. | |
# | |
default_alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" | |
default_template_dims = (36, 24,) | |
default_search_padding = (4, 4,) | |
def get_perspective_transform( | |
px_coords, | |
grid_dims, | |
): | |
""" | |
Return a perspective transform matrix which will transform from character-grid coordinates | |
to pixel coordinates of an image. | |
:param px_coords: sequence of pixel coordinates defining the four corners of the character grid | |
in the image - ordered top left, top right, bottom left, bottom right | |
:param grid_dims: dimensions of character-grid | |
""" | |
return cv2.getPerspectiveTransform( | |
np.float32(tuple(product(*((0, a) for a in grid_dims)))), | |
np.float32(px_coords), | |
) | |
def get_char_representatives( | |
initial_lines, | |
grid_dims, | |
alphabet=default_alphabet, | |
): | |
""" | |
Given some initial_lines of a character grid, will generate a sequence of sequences of | |
character-grid coordinates of positions that can be used as representatives of a particular | |
character for comparisons. | |
:param initial_lines: the (known) initial lines of this character grid, the more the better. at | |
least enough will be needed so that each character in the alphabet has | |
at least one example appearance. | |
:param grid_dims: dimensions of character-grid | |
""" | |
if any(len(line) != grid_dims[1] for line in initial_lines[:-1]): | |
raise ValueError("All non-final initial_lines must be of length grid_dims[1]") | |
representatives = tuple([] for _ in alphabet) | |
for y, line in enumerate(initial_lines): | |
for x, char in enumerate(line): | |
representatives[alphabet.index(char)].append((y, x)) | |
for i, rprs in enumerate(representatives): | |
if not rprs: | |
raise ValueError(f"Alphabet character {alphabet.index(i)!r} missing from initial_lines") | |
return tuple(tuple(rprs) for rprs in representatives) | |
def get_correspondences( | |
img, | |
representatives, | |
pmat, | |
grid_dims, | |
template_dims=default_template_dims, | |
search_padding=default_search_padding, | |
method=cv2.TM_SQDIFF_NORMED, | |
score_finalizer=np.amin, | |
): | |
""" | |
Build an array of shape (len(representatives), grid_dims[0], grid_dims[1]) denoting the "scores" | |
of correspondence between each character/grid-position combination. | |
:param img: two-dimensional numpy array of monochrome image | |
:param representatives: character-grid coordinates of representatives for each character in | |
alphabet, as returned by ``get_char_representatives`` | |
:param pmat: perspective transform matrix from character-grid space to pixel space, as | |
returned by ``get_perspective_transform`` | |
:param grid_dims: dimensions of character-grid | |
""" | |
correspondences = np.zeros((len(representatives), *grid_dims), "float32") | |
# pre-allocate matchTemplate result array for heavy reuse | |
match_result = np.zeros(((2*search_padding[0])+1, (2*search_padding[1])+1, 1,), "float32") | |
for gridpos in product(range(grid_dims[0]), range(grid_dims[1])): | |
pxpos = np.int32(cv2.perspectiveTransform(np.float32(((gridpos,),)), pmat)[0][0]) | |
sample = img[ | |
pxpos[0]-search_padding[0]:pxpos[0]+template_dims[0]+search_padding[0], | |
pxpos[1]-search_padding[1]:pxpos[1]+template_dims[1]+search_padding[0], | |
0, | |
] | |
for char_i, char_rprs in enumerate(representatives): | |
finalized_results = [] | |
for t_gridpos in char_rprs: | |
t_pxpos = np.int32(cv2.perspectiveTransform(np.float32(((t_gridpos,),)), pmat)[0][0]) | |
template = img[ | |
t_pxpos[0]:t_pxpos[0]+template_dims[0], | |
t_pxpos[1]:t_pxpos[1]+template_dims[1], | |
0, | |
] | |
cv2.matchTemplate( | |
sample, | |
template, | |
method, | |
match_result, | |
) | |
finalized_results.append(score_finalizer(match_result)) | |
correspondences[(char_i, *gridpos,)] = score_finalizer(finalized_results) | |
return correspondences | |
def get_lines_from_correspondences_simple( | |
correspondences, | |
alphabet=default_alphabet, | |
selector=np.argmin, | |
): | |
""" | |
Return probable lines of target text guessed using extremely naive "best score wins" | |
:param correspondences: numpy array of character correspondences, as returned by | |
``get_correspondences`` | |
""" | |
return tuple( | |
"".join( | |
alphabet[selector(correspondences[:, y, x])] | |
for x in range(correspondences.shape[2]) | |
) for y in range(correspondences.shape[1]) | |
) | |
def get_contentions(correspondences, score_inverse=True): | |
""" | |
Return sequence of grid-coordinates, ordered by how small the score difference was in determining | |
the top match in ``correspondences``. | |
""" | |
return sorted( | |
(abs(sc[1]-sc[0]), gridpos) | |
for gridpos, sc in ( | |
(gridpos, sorted(correspondences[:,gridpos[0],gridpos[1]], reverse=(not score_inverse))) | |
for gridpos in product( | |
range(correspondences.shape[1]), | |
range(correspondences.shape[2]), | |
) | |
) | |
) | |
def occurrence_comparison( | |
img, | |
lines, | |
pmat, | |
alphabet=default_alphabet, | |
template_dims=default_template_dims, | |
search_padding=default_search_padding, | |
method=cv2.TM_SQDIFF_NORMED, | |
score_finalizer=np.amin, | |
): | |
""" | |
Compares each occurence of a character in the guessed output to all other occurrences. This output | |
can be used to try to identify the "odd ones out" in a character family, those presumably being | |
among the ones with the most dissimilarity. | |
:param img: two-dimensional numpy array of monochrome image | |
:param lines: sequence of strings of guessed contents of character grid | |
:param pmat: perspective transform matrix from character-grid space to pixel space, as returned | |
by ``get_perspective_transform`` | |
:returns: sequence of sequences of grid-position combinations and their respective similarity | |
scores, each entry being (score, grid_position0, grid_position1). grid-position | |
combinations are de-duplicated. | |
""" | |
char_locs = tuple([] for _ in alphabet) | |
similarities = tuple([] for _ in alphabet) | |
# pre-allocate matchTemplate result array for heavy reuse | |
match_result = np.zeros(((2*search_padding[0])+1, (2*search_padding[1])+1, 1,), "float32") | |
for y, line in enumerate(lines): | |
for x, char in enumerate(line): | |
char_i = alphabet.index(char) | |
pxpos = np.int32(cv2.perspectiveTransform(np.float32((((y, x,),),)), pmat)[0][0]) | |
sample = img[ | |
pxpos[0]-search_padding[0]:pxpos[0]+template_dims[0]+search_padding[0], | |
pxpos[1]-search_padding[1]:pxpos[1]+template_dims[1]+search_padding[0], | |
0, | |
] | |
new_similarities = [] | |
for t_gridpos in char_locs[char_i]: | |
t_pxpos = np.int32(cv2.perspectiveTransform(np.float32(((t_gridpos,),)), pmat)[0][0]) | |
template = img[ | |
t_pxpos[0]:t_pxpos[0]+template_dims[0], | |
t_pxpos[1]:t_pxpos[1]+template_dims[1], | |
0, | |
] | |
cv2.matchTemplate( | |
sample, | |
template, | |
method, | |
match_result, | |
) | |
new_similarities.append((score_finalizer(match_result), t_gridpos, (y, x),)) | |
similarities[char_i].extend(new_similarities) | |
char_locs[char_i].append((y, x)) | |
return similarities | |
def get_char_atlas( | |
img, | |
lines, | |
pmat, | |
alphabet=default_alphabet, | |
template_dims=default_template_dims, | |
search_padding=default_search_padding, | |
): | |
""" | |
Generates a "character atlas" image of all character samples in an image, grouped by character. | |
This allows "odd ones out" to be more easily identified by eye. | |
:param img: two-dimensional numpy array of monochrome image | |
:param lines: sequence of strings of guessed contents of character grid | |
:param pmat: perspective transform matrix from character-grid space to pixel space, as returned | |
by ``get_perspective_transform`` | |
:returns: tuple of output image followed by an index of character positions each sample in | |
output represents | |
""" | |
char_locs = tuple([] for _ in alphabet) | |
for y, line in enumerate(lines): | |
for x, char in enumerate(line): | |
char_locs[alphabet.index(char)].append((y, x)) | |
sample_dims = (template_dims[0] + (search_padding[0] * 2), template_dims[1] + (search_padding[1] * 2),) | |
atlas_width = max(len(locs) for locs in char_locs) * sample_dims[1] | |
atlas = np.zeros((sample_dims[0] * len(alphabet), atlas_width,), "uint8") | |
for atlas_grid_y, locs in enumerate(char_locs): | |
for atlas_grid_x, loc in enumerate(locs): | |
pxpos = np.int32(cv2.perspectiveTransform(np.float32(((loc,),)), pmat)[0][0]) | |
sample = img[ | |
pxpos[0]-search_padding[0]:pxpos[0]+template_dims[0]+search_padding[0], | |
pxpos[1]-search_padding[1]:pxpos[1]+template_dims[1]+search_padding[0], | |
0, | |
] | |
atlas[ | |
atlas_grid_y * sample_dims[0]:(atlas_grid_y * sample_dims[0]) + sample.shape[0], | |
atlas_grid_x * sample_dims[1]:(atlas_grid_x * sample_dims[1]) + sample.shape[1], | |
] = sample | |
return atlas, char_locs |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment