Skip to content

Instantly share code, notes, and snippets.

@gigamonkey
Last active November 10, 2017 01:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gigamonkey/4853ee34233416182567013c864b66ef to your computer and use it in GitHub Desktop.
Save gigamonkey/4853ee34233416182567013c864b66ef to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
#
# Find the best matches between a set of dirty names and a canonical set.
#
# Is O(N^2) in the number of names.
#
import json
from lcs import lcs
def similarity(a, b):
"Similarity of two strings in terms of their longest common subsequence."
return (2 * len(lcs(a.lower(), b.lower()))) / (len(a) + len(b))
def similarities(canonical, dirty):
"Return generator of similarity score, canonical, and dirty strings."
return ((similarity(c, d), c, d) for c in canonical for d in dirty)
def match(canonical, dirty):
"Map each dirty string to the best canonical string."
canonical = set(canonical)
dirty = set(dirty)
mapping = dict()
# Map identical elements and remove them from futher consideration
for s in canonical & dirty:
mapping[s] = s
canonical.remove(s)
dirty.remove(s)
# Now find the most similar pairs and map them.
for _, c, d in reversed(sorted(similarities(canonical, dirty))):
if c in canonical and d in dirty:
mapping[d] = c
canonical.remove(c)
dirty.remove(d)
return mapping
if __name__ == '__main__':
from sys import argv, stdout
with open(argv[1]) as f:
canonical = [ line[:-1] for line in f.readlines() ]
with open(argv[2]) as f:
dirty = [ line[:-1] for line in f.readlines() ]
json.dump(match(canonical, dirty), stdout, indent=2)
#
# Compute the longest common subsequence
# (https://en.wikipedia.org/wiki/Longest_common_subsequence_problem)
# of pairs of strings. Uses an efficient dynamic programming
# implementation.
#
def lcs(a, b):
return lcs_reconstruct(lcs_matrix(a, b), a, b)
def lcs_matrix(a, b):
matrix = [ [ 0 for _ in range(len(a) + 1) ] for _ in range(len(b) + 1) ]
for i in range(len(a) - 1, -1, -1):
for j in range(len(b) - 1, -1, -1):
(_, right, down, diag) = neighbors(matrix, i, j)
matrix[j][i] = max(1 + diag if a[i] == b[j] else 0, right, down)
return matrix
def lcs_reconstruct(matrix, a, b):
result = []
j = 0
i = 0
while j < len(b) and i < len(a):
here, right, down, diag = neighbors(matrix, i, j)
if right == down == diag == (here - 1):
result.append(a[i])
i += 1
j += 1
elif down == here:
j += 1
elif right == here:
i += 1
return ''.join(result)
def neighbors(m, i, j):
return (m[j][i], m[j][i+1], m[j+1][i], m[j+1][i+1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment