Skip to content

Instantly share code, notes, and snippets.

@companje
Created June 15, 2022 15:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save companje/93f6061629ac27a2027a77888effd6ad to your computer and use it in GitHub Desktop.
Save companje/93f6061629ac27a2027a77888effd6ad to your computer and use it in GitHub Desktop.
Fuzzy matchen in strings
#!/usr/bin/env python3
import json
from fuzzysearch import find_near_matches
from fuzzywuzzy import process
json_file = "A152153-pagina5tm20/page/NL-UtHUA_A152153_000005-f.json"
def fuzzy_extract(qs, ls, threshold):
    """Yield approximate occurrences of ``qs`` inside ``ls``.

    ``ls`` is first scored as a whole against ``qs`` with
    ``process.extractBests``; when that score is below ``threshold``
    nothing is yielded.  Otherwise ``find_near_matches`` is run over it
    with ``max_l_dist=1`` and every hit is reported.

    Args:
        qs: The phrase to look for.
        ls: The text to search in.
        threshold: Value passed to ``extractBests`` as ``score_cutoff``.

    Yields:
        ``(matched_text, index)`` tuples, where ``matched_text`` is the
        slice of ``ls`` that matched and ``index`` is the offset in
        ``ls`` at which that slice starts.
    """
    for candidate, _score in process.extractBests(
        qs, (ls,), score_cutoff=threshold
    ):
        # ``ls`` is the only choice handed to extractBests, so ``candidate``
        # is ``ls`` and offsets into one are offsets into the other.
        for hit in find_near_matches(qs, candidate, max_l_dist=1):
            # Report the matcher's own start offset.  Looking the matched
            # text up again with ``ls.find`` would return the first
            # occurrence, which is wrong when the same text is hit twice.
            yield candidate[hit.start:hit.end], hit.start
def get_page_text(data):
    """Yield the non-empty text of every line on a page.

    Walks ``data["PcGts"]["Page"]["TextRegion"]``, then each region's
    ``"TextLine"`` entries, and reads ``["TextEquiv"]["Unicode"]`` from
    each line.  Falsy values (empty string, ``None``) are skipped.

    Args:
        data: The decoded page document (nested dicts/lists).

    Yields:
        The text of each line, in document order.
    """
    for text_region in data["PcGts"]["Page"]["TextRegion"]:
        for text_line in text_region["TextLine"]:
            txt = text_line["TextEquiv"]["Unicode"]
            if txt:
                yield txt
#############################################
# Decode explicitly as UTF-8 so the result does not depend on the
# platform's default text encoding.
with open(json_file, encoding="utf-8") as f:
    data = json.load(f)

# Flatten the page into one string so a phrase can match across line breaks.
text = " ".join(get_page_text(data))

# Phrases tried so far.  Earlier revisions assigned each of these to ``q``
# in turn, so only the last assignment was ever searched for:
#   "op heden den"
#   "is voor ons ondergeteekende"
#   "oud"
#   "wonende"
#   "die ons verklaard heeft, dat op den "
#   "maand des"
#   "middags te"
#   "uur"
#   "bevallen is van een kind van het "
q = "geslacht, waaraan men goven"

for match, index in fuzzy_extract(q, text, 10):
    print(f"match: {match}\nindex: {index}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment