Skip to content

Instantly share code, notes, and snippets.

@claczny
Created January 6, 2017 15:45
Show Gist options
  • Save claczny/83c402176b6ab2683ecb4540412c00e4 to your computer and use it in GitHub Desktop.
Save claczny/83c402176b6ab2683ecb4540412c00e4 to your computer and use it in GitHub Desktop.
Python code to fuzzy-match two files of titles (A and B) in order to find titles that are missing from B, i.e., "multiplications" in A: cases where several titles in A map to the same closest title in B. Not very efficient, but it does the job.
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from collections import Counter
# Input files holding the titles, one per line.
A_title_file = "/tmp/A_titles.txt"
B_title_file = "/tmp/B_titles.txt"

# Load both files the same way: every line is stored exactly as read
# (line endings included), A first and then B.
A_titles, B_titles = [], []
for path, sink in ((A_title_file, A_titles), (B_title_file, B_titles)):
    with open(path) as f:
        sink.extend(f)
# Collects the best B match found for every A title. A B title that ends up in
# this list more than once was the closest match for several A titles, which
# indicates a missing title in B, i.e., a multiplication in A.
best_matches_t = list()

# Clean the B titles once up front instead of once per A title (the result is
# the same for every pass of the outer loop), and drop blank lines, which are
# not titles.
B_candidates = [B_t.strip() for B_t in B_titles]
B_candidates = [B_t for B_t in B_candidates if B_t]

# For each title in A, find the closest match in B
for A_t in A_titles:
    A_t = A_t.strip()
    if not A_t:
        # Blank line (e.g. an empty line at the end of the file): not a title,
        # so it must not be reported or counted.
        continue

    max_simil = 0
    max_match_t = ""
    for B_t in B_candidates:
        # Fuzzy matching
        simil = fuzz.token_sort_ratio(A_t, B_t)
        # Store only the title in B with max. similarity; the strict '>' keeps
        # the first such title on ties and ignores candidates scoring 0.
        if simil > max_simil:
            max_simil = simil
            max_match_t = B_t

    print("%i: %s | %s" % (max_simil, A_t, max_match_t))

    # Record only real matches. When no candidate scored above 0 the
    # placeholder "" is still in place, and counting it would let an empty
    # string show up among the "most common matches in B".
    if max_match_t:
        best_matches_t.append(max_match_t)
# Number of most frequently matched B titles to report.
top_n_most_common = 3

# Tally how often each B title was chosen as a best match; titles chosen
# more than once make multiplications easy to spot.
c = Counter()
c.update(best_matches_t)
most_common = c.most_common(top_n_most_common)

# Report the header line followed by the (title, count) pairs.
header = "\nTop %i most common matches in B: " % top_n_most_common
print(header)
print(most_common)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment