# claczny/fuzzymatch_titles.py

## fuzzymatch_titles.py
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from collections import Counter

# Input files, one title per line.
A_title_file = "/tmp/A_titles.txt"
B_title_file = "/tmp/B_titles.txt"


def _read_titles(path):
    """Return the whitespace-stripped, non-empty lines of the file at *path*.

    Blank lines are dropped so that they are never treated as titles
    (a blank line would otherwise be reported as a 0-score match and
    counted as an empty "best match").
    """
    with open(path) as f:
        stripped = (line.strip() for line in f)
        return [title for title in stripped if title]


def _best_match(title, candidates):
    """Return ``(score, candidate)`` for the candidate closest to *title*.

    The score is whatever ``fuzz.token_sort_ratio`` returns for the pair.
    On ties the earliest candidate wins, because only a strictly greater
    score replaces the current best.  If no candidate scores above 0
    (including when *candidates* is empty) the result is ``(0, "")``.
    """
    max_simil = 0
    max_match_t = ""
    for candidate in candidates:
        simil = fuzz.token_sort_ratio(title, candidate)
        if simil > max_simil:
            max_simil = simil
            max_match_t = candidate
    return max_simil, max_match_t


# Titles are stripped once here instead of on every comparison below.
A_titles = _read_titles(A_title_file)
B_titles = _read_titles(B_title_file)

# Best match in B for every title in A, in A's order.  A title of B that
# shows up here several times was chosen by several titles of A, which
# hints at a title missing from B (i.e. a multiplication in A).
best_matches_t = []

# For each title in A, report and remember the closest match in B
for A_t in A_titles:
    max_simil, max_match_t = _best_match(A_t, B_titles)
    print("%i: %s | %s" % (max_simil, A_t, max_match_t))
    best_matches_t.append(max_match_t)

top_n_most_common = 3
c = Counter(best_matches_t)
# Get the most common hits; easily identify multiplications
most_common = c.most_common(top_n_most_common)
print("\nTop %i most common matches in B: " % top_n_most_common)
print(most_common)
	from fuzzywuzzy import fuzz
	from fuzzywuzzy import process
	from collections import Counter

	# Input files, one title per line.
	A_title_file = "/tmp/A_titles.txt"
	B_title_file = "/tmp/B_titles.txt"

	# Read the titles: strip each line once and drop blank lines so they
	# are never treated as titles.
	with open(A_title_file) as f:
		A_titles = [t for t in (line.strip() for line in f) if t]

	with open(B_title_file) as f:
		B_titles = [t for t in (line.strip() for line in f) if t]

	# Best match in B for every title in A.  A title of B that occurs here
	# multiple times indicates a missing title in B, i.e., a multiplication in A.
	best_matches_t = []

	# For each title in A, find the closest match in B
	for A_t in A_titles:
		max_simil = 0
		max_match_t = ""
		for B_t in B_titles:
			# Fuzzy matching
			simil = fuzz.token_sort_ratio(A_t, B_t)
			# Keep only the title in B with max. similarity; the strict
			# comparison means the earliest title wins on ties.
			if simil > max_simil:
				max_simil = simil
				max_match_t = B_t
		print("%i: %s | %s" % (max_simil, A_t, max_match_t))
		best_matches_t.append(max_match_t)

	top_n_most_common = 3
	c = Counter(best_matches_t)
	# Get the most common hits; easily identify multiplications
	most_common = c.most_common(top_n_most_common)
	print("\nTop %i most common matches in B: " % top_n_most_common)
	print(most_common)