gigamonkey/canonical.py

## canonical.py
#!/usr/bin/env python3

#
# Find the best matches between a set of dirty names and a canonical set.
#
# Is O(N^2) in the number of names.
#

import json
from lcs import lcs

def similarity(a, b):
    "Similarity of two strings in terms of their longest common subsequence."
    return (2 * len(lcs(a.lower(), b.lower()))) / (len(a) + len(b))


def similarities(canonical, dirty):
    "Return generator of similarity score, canonical, and dirty strings."
    return ((similarity(c, d), c, d) for c in canonical for d in dirty)


def match(canonical, dirty):
    "Map each dirty string to the best canonical string."
    canonical = set(canonical)
    dirty     = set(dirty)

    mapping = dict()

    # Map identical elements and remove them from futher consideration
    for s in canonical & dirty:
        mapping[s] = s
        canonical.remove(s)
        dirty.remove(s)

    # Now find the most similar pairs and map them.
    for _, c, d in reversed(sorted(similarities(canonical, dirty))):
        if c in canonical and d in dirty:
            mapping[d] = c
            canonical.remove(c)
            dirty.remove(d)

    return mapping

if __name__ == '__main__':

    from sys import argv, stdout

    with open(argv[1]) as f:
        canonical = [ line[:-1] for line in f.readlines() ]

    with open(argv[2]) as f:
        dirty = [ line[:-1] for line in f.readlines() ]

    json.dump(match(canonical, dirty), stdout, indent=2)

## lcs.py
#
# Compute the longest common subsequence
# (https://en.wikipedia.org/wiki/Longest_common_subsequence_problem)
# of pairs of strings. Uses an efficient dynamic programming
# implementation.
#

def lcs(a, b):
    return lcs_reconstruct(lcs_matrix(a, b), a, b)

def lcs_matrix(a, b):
    matrix = [ [ 0 for _ in range(len(a) + 1) ] for _ in range(len(b) + 1) ]

    for i in range(len(a) - 1, -1, -1):
        for j in range(len(b) - 1, -1, -1):
            (_, right, down, diag) = neighbors(matrix, i, j)
            matrix[j][i] = max(1 + diag if a[i] == b[j] else 0, right, down)

    return matrix

def lcs_reconstruct(matrix, a, b):
    result = []
    j = 0
    i = 0

    while j < len(b) and i < len(a):

        here, right, down, diag = neighbors(matrix, i, j)

        if right == down == diag == (here - 1):
            result.append(a[i])
            i += 1
            j += 1

        elif down == here:
            j += 1

        elif right == here:
            i += 1

    return ''.join(result)

def neighbors(m, i, j):
    return (m[j][i], m[j][i+1], m[j+1][i], m[j+1][i+1])
	#!/usr/bin/env python3

	#
	# Find the best matches between a set of dirty names and a canonical set.
	#
	# Is O(N^2) in the number of names.
	#

	import json
	from lcs import lcs

	def similarity(a, b):
	"Similarity of two strings in terms of their longest common subsequence."
	return (2 * len(lcs(a.lower(), b.lower()))) / (len(a) + len(b))


	def similarities(canonical, dirty):
	"Return generator of similarity score, canonical, and dirty strings."
	return ((similarity(c, d), c, d) for c in canonical for d in dirty)


	def match(canonical, dirty):
	"Map each dirty string to the best canonical string."
	canonical = set(canonical)
	dirty = set(dirty)

	mapping = dict()

	# Map identical elements and remove them from futher consideration
	for s in canonical & dirty:
	mapping[s] = s
	canonical.remove(s)
	dirty.remove(s)

	# Now find the most similar pairs and map them.
	for _, c, d in reversed(sorted(similarities(canonical, dirty))):
	if c in canonical and d in dirty:
	mapping[d] = c
	canonical.remove(c)
	dirty.remove(d)

	return mapping

	if __name__ == '__main__':

	from sys import argv, stdout

	with open(argv[1]) as f:
	canonical = [ line[:-1] for line in f.readlines() ]

	with open(argv[2]) as f:
	dirty = [ line[:-1] for line in f.readlines() ]

	json.dump(match(canonical, dirty), stdout, indent=2)
	#
	# Compute the longest common subsequence
	# (https://en.wikipedia.org/wiki/Longest_common_subsequence_problem)
	# of pairs of strings. Uses an efficient dynamic programming
	# implementation.
	#

	def lcs(a, b):
	return lcs_reconstruct(lcs_matrix(a, b), a, b)

	def lcs_matrix(a, b):
	matrix = [ [ 0 for _ in range(len(a) + 1) ] for _ in range(len(b) + 1) ]

	for i in range(len(a) - 1, -1, -1):
	for j in range(len(b) - 1, -1, -1):
	(_, right, down, diag) = neighbors(matrix, i, j)
	matrix[j][i] = max(1 + diag if a[i] == b[j] else 0, right, down)

	return matrix

	def lcs_reconstruct(matrix, a, b):
	result = []
	j = 0
	i = 0

	while j < len(b) and i < len(a):

	here, right, down, diag = neighbors(matrix, i, j)

	if right == down == diag == (here - 1):
	result.append(a[i])
	i += 1
	j += 1

	elif down == here:
	j += 1

	elif right == here:
	i += 1

	return ''.join(result)

	def neighbors(m, i, j):
	return (m[j][i], m[j][i+1], m[j+1][i], m[j+1][i+1])