Created
July 7, 2021 09:38
-
-
Save renaud/c0f0052572b9921387a64c895ab34113 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# %%
# Edit distance is taken from the python-Levenshtein package instead of a
# hand-written implementation.
# Install with: pip install python-Levenshtein
from Levenshtein import distance

# Sanity check of the library: "ah" -> "aho" is expected to cost one edit.
assert distance("ah", "aho") == 1
# %%
# Load the dog-name dataset and take a quick look at it.
import pandas as pd

# Data source:
# https://opendata.swiss/en/dataset/hundenamen-aus-dem-hundebestand-der-stadt-zurich2/resource/3e48403f-1ca2-434f-8766-cf0a73d7c2a1
# NOTE: the path below is a local download location; adjust it to wherever
# the CSV was saved.
h_namen = pd.read_csv('/Users/ren/Desktop/20210103_hundenamen.csv')
print(h_namen.columns)
# Bare expression: displays the first rows when run as a notebook cell.
h_namen.head()
# %%
# Collect every distinct dog name within Levenshtein distance 1 of 'Luca'.
#
# - unique() gives distinct values, so each name is tested only once.
# - dropna() skips missing names: unique() would otherwise yield NaN (a
#   float), and len() on it raises TypeError.
# - The length test is a cheap pre-filter: a name at distance <= 1 from the
#   4-letter target can only be 3 to 5 characters long.
matching_names = [
    n
    for n in h_namen.HUNDENAME.dropna().unique()
    if 3 <= len(n) <= 5 and distance('Luca', n) <= 1
]
# Bare expression: displays the comma-separated result in a notebook cell.
', '.join(matching_names)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment