Skip to content

Instantly share code, notes, and snippets.

@rrr3try
Last active September 28, 2020 16:31
Show Gist options
  • Save rrr3try/489d7a0faf63b0449de4f8cacd99a04a to your computer and use it in GitHub Desktop.
Save rrr3try/489d7a0faf63b0449de4f8cacd99a04a to your computer and use it in GitHub Desktop.
fuzzywuzzy example
# pip install fuzzywuzzy[speedup]
# pip install tqdm
from fuzzywuzzy import fuzz
# Two texts count as duplicates when their fuzz.ratio score is strictly
# greater than this value.
THRESHOLD = 90

# NOTE: `data` is expected to be defined before this point (not shown here);
# it must be a sized iterable of hashable items.
# Size of the input before any de-duplication, reported at the end.
length = len(data)

# Drop exact duplicates while keeping first-seen order. A plain set() would
# also remove them, but its iteration order is unspecified, which makes the
# later choice of surviving near-duplicates irreproducible between runs.
data = list(dict.fromkeys(data))
def compare_fuzzy(compare_text, data_full):
    """Return the index of the first entry of ``data_full`` similar to ``compare_text``.

    Only the first 256 characters of each text are compared, case-insensitively.
    An entry is considered similar when it is equal to ``compare_text`` after
    that normalisation, or when ``fuzz.ratio`` scores the pair strictly above
    ``THRESHOLD``.

    Args:
        compare_text: The text to look up.
        data_full: Sequence of candidate texts to scan, in order.

    Returns:
        The position in ``data_full`` of the first similar entry.

    Raises:
        KeyError: If no entry of ``data_full`` is similar to ``compare_text``.
    """
    needle = compare_text[:256].lower()
    for index, text in enumerate(data_full):
        candidate = text[:256].lower()
        # Equality is checked first: it is cheaper than the fuzzy score and
        # guarantees that a text always matches itself, however the scorer
        # treats degenerate input such as empty strings.
        if candidate == needle or fuzz.ratio(candidate, needle) > THRESHOLD:
            # The first hit is a deterministic representative; there is no
            # need to scan the rest of the data or pick one at random.
            return index
    raise KeyError("no entry in data_full is similar to compare_text")
# Map every text to the index compare_fuzzy returns for it (an entry of
# `data` that it fuzzy-matches). Texts that resolve to the same index
# collapse into a single element of the set.
real_indexes = {compare_fuzzy(text, data) for text in tqdm_notebook(data)}

# Iterate the indexes in sorted order so the surviving texts keep their
# relative order from `data`; iterating the set directly gives an
# unspecified order.
real_data = [data[i] for i in sorted(real_indexes)]
print(f"before {length}|\n after {len(real_data)}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment