Last active
July 3, 2021 21:00
-
-
Save pdMa2s/14e1dacac2d3cf2a67723230425601de to your computer and use it in GitHub Desktop.
Levenshtein_distance.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "Levenshtein_distance.ipynb", | |
"provenance": [], | |
"collapsed_sections": [], | |
"authorship_tag": "ABX9TyOp7lbtHBpYOZLPbjrJ/gA+", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/pdMa2s/14e1dacac2d3cf2a67723230425601de/levenshtein_distance.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "MgElaHp44_qE" | |
}, | |
"source": [ | |
"This is an example of an exercice with goal of filtering dog names in Zurich using the edit distance, also known as the Levenshtein distance.\n", | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "QE5TVEuCCdrY" | |
}, | |
"source": [ | |
"# 1. A typical Levenshtein distance implementation using Dynamic programing:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "_kf3SExv48og" | |
}, | |
"source": [ | |
"import numpy as np\n", | |
"\n", | |
"def levenshtein_distance(token1: str, token2: str):\n", | |
" n_rows = len(token1) +1\n", | |
" n_cols = len(token2) +1\n", | |
"\n", | |
" distance_matrix = np.zeros((n_rows, n_cols), dtype=int)\n", | |
"\n", | |
" for i in range(n_rows):\n", | |
" distance_matrix[i][0] = i\n", | |
"\n", | |
" for i in range(n_cols):\n", | |
" distance_matrix[0][i] = i\n", | |
"\n", | |
" for r in range(1, n_rows):\n", | |
" for c in range(1, n_cols):\n", | |
" if token1[r-1] == token2[c-1]:\n", | |
" distance_matrix[r][c] = distance_matrix[r-1][c-1]\n", | |
" else:\n", | |
" distance_matrix[r][c] = min(distance_matrix[r-1][c], \n", | |
" distance_matrix[r-1][c-1], \n", | |
" distance_matrix[r][c-1]) + 1\n", | |
" return distance_matrix[-1][-1]\n" | |
], | |
"execution_count": 2, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "_ZzWpr8qF_Rq" | |
}, | |
"source": [ | |
"# 2. Load the dataset\n", | |
"\n", | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "PsiRl3ANHBVT", | |
"outputId": "fd7390c4-3ec4-46fe-c572-5e0c362817b1" | |
}, | |
"source": [ | |
"import pandas as pd\n", | |
"import requests\n", | |
"from io import BytesIO\n", | |
"\n", | |
"\n", | |
"url = 'https://data.stadt-zuerich.ch/dataset/sid_stapo_hundenamen/download/20210103_hundenamen.csv'\n", | |
"response = requests.get(url, allow_redirects=True).content\n", | |
"\n", | |
"hundenamen = pd.read_csv(BytesIO(response))\n", | |
"\n", | |
"hundenamen.info()\n", | |
"\n", | |
"print(hundenamen.head())" | |
], | |
"execution_count": 3, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"RangeIndex: 8574 entries, 0 to 8573\n", | |
"Data columns (total 3 columns):\n", | |
" # Column Non-Null Count Dtype \n", | |
"--- ------ -------------- ----- \n", | |
" 0 HUNDENAME 8574 non-null object\n", | |
" 1 GEBURTSJAHR_HUND 8574 non-null int64 \n", | |
" 2 GESCHLECHT_HUND 8574 non-null object\n", | |
"dtypes: int64(1), object(2)\n", | |
"memory usage: 201.1+ KB\n", | |
" HUNDENAME GEBURTSJAHR_HUND GESCHLECHT_HUND\n", | |
"0 Ituma 2011 w\n", | |
"1 \"Bo\" Bendy of Treegarden 2020 m\n", | |
"2 \"Bobby\" Lord Sinclair 2009 m\n", | |
"3 \"Buddy\" Fortheringhay's J. 2011 m\n", | |
"4 \"Fly\" Showring i fly for you 2015 w\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "0AIH3dvwrSGc" | |
}, | |
"source": [ | |
"# 3. Now lets filter dog names with a Levenshtein distance of 1 to 'Luca'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "uTWLrU_Tr0UU", | |
"outputId": "f16ee722-7521-48bb-f3e7-00a56d30197b" | |
}, | |
"source": [ | |
"dog_name_column = 'HUNDENAME'\n", | |
"reference_name = 'Luca'\n", | |
"target_distance = 1\n", | |
"\n", | |
"related_names = hundenamen[dog_name_column].loc[hundenamen[dog_name_column].apply(\n", | |
" lambda n: levenshtein_distance(n, reference_name) == target_distance)].to_list()\n", | |
" \n", | |
"print(\"Number of related names:\", len(related_names))\n", | |
"print(','.join(related_names))" | |
], | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Number of related names: 136\n", | |
"Cuca,Lua,Lua,Lua,Lua,Lua,Lua,Luba,Lucas,Luce,Luce,Luce,Lucia,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lucy,Lula,Lula,Luma,Luma,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Luna,Lupa,Lupa,Lupa,Lupa,Lupa,Yuca\n" | |
], | |
"name": "stdout" | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment