Skip to content

Instantly share code, notes, and snippets.

@jarnaldich
Created March 18, 2023 08:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jarnaldich/24ece34b6fb441c3ef8878a39a265b82 to your computer and use it in GitHub Desktop.
Save jarnaldich/24ece34b6fb441c3ef8878a39a265b82 to your computer and use it in GitHub Desktop.
[Near Duplicate Detection] #data #qc #jupyter #python #nltk #jaccard #levenshtein
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "python",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8"
},
"kernelspec": {
"name": "python",
"display_name": "Python (Pyodide)",
"language": "python"
}
},
"nbformat_minor": 4,
"nbformat": 4,
"cells": [
{
"cell_type": "code",
"source": "import numbers\nimport pandas as pd\nfrom pandas.api.types import is_string_dtype\nfrom js import fetch\nfrom collections import defaultdict\nimport nltk\nimport matplotlib",
"metadata": {
"trusted": true
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": "%%javascript\n// Download `urlString` through JSONP (an injected script tag) and store the\n// received payload as a file in the 'JupyterLite Storage' IndexedDB database.\n//   urlString - endpoint URL; a `callback` query parameter is appended to it\n//   file_path - key and `path` of the stored file record\n//   mime_type - value recorded as the file's `mimetype`\n//   binary    - when true the record's `format` is 'binary', otherwise 'text'\nwindow.saveJSONP = async (urlString, file_path, mime_type='text/json', binary=false) => {\n  const sc = document.createElement('script');\n  const url = new URL(urlString);\n  url.searchParams.append('callback', 'window.corsCallBack');\n  sc.src = url.toString();\n\n  // The script element is only a transport: remove it once it has run or failed\n  // so repeated calls do not pile up script tags in the document head.\n  sc.onload = () => sc.remove();\n  sc.onerror = () => {\n    sc.remove();\n    console.error('JSONP request failed: ' + sc.src);\n  };\n\n  // Called by the JSONP response script with the decoded payload.\n  window.corsCallBack = async (data) => {\n    console.log(data);\n\n    // Open the file storage database\n    const open = indexedDB.open('JupyterLite Storage');\n\n    // The database is expected to exist already with its schema in place.\n    open.onupgradeneeded = function() {\n      throw Error('Error opening IndexedDB. Should not ever need to upgrade JupyterLite Storage Schema');\n    };\n\n    open.onerror = function() {\n      console.error('Could not open JupyterLite Storage', open.error);\n    };\n\n    open.onsuccess = function() {\n      const db = open.result;\n      const tx = db.transaction(\"files\", \"readwrite\");\n      const store = tx.objectStore(\"files\");\n\n      const timestamp = new Date().toISOString();\n\n      const value = {\n        'name': file_path.split(/[\\\\/]/).pop(),\n        'path': file_path,\n        'format': binary ? 'binary' : 'text',\n        'created': timestamp,\n        'last_modified': timestamp,\n        'content': JSON.stringify(data),\n        'mimetype': mime_type,\n        'type': 'file',\n        'writable': true\n      };\n\n      // put() inserts the record or replaces an existing one stored under the\n      // same key, so no count()/add() branching is needed.\n      store.put(value, file_path);\n\n      // Close the db when the transaction finishes, successfully or not\n      tx.oncomplete = function() {\n        db.close();\n      };\n      tx.onabort = function() {\n        console.error('Could not save ' + file_path, tx.error);\n        db.close();\n      };\n    };\n  };\n\n  document.getElementsByTagName('head')[0].appendChild(sc);\n}\n",
"metadata": {
"trusted": true
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": "%%javascript\n// Request the datastore_search resource through JSONP and save the response as data/menors.json\nvar url = 'https://opendata-ajuntament.barcelona.cat/data/es/api/3/action/datastore_search?resource_id=69ae574f-adfc-4660-8f81-73103de169ff';\nwindow.saveJSONP(url, 'data/menors.json');\n",
"metadata": {
"trusted": true
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": "import io\nimport json\nimport pandas as pd\n\n# Read with an explicit encoding: the saved records contain non-ASCII text.\nwith open('data/menors.json', 'r', encoding='utf-8') as f:\n    data = json.load(f)\n\n# The rows live under result -> records in the saved response.\n# read_json is kept for its dtype inference; the JSON text is wrapped in\n# StringIO because passing a literal JSON string is deprecated in recent pandas.\nrecords_json = json.dumps(data['result']['records'])\ndf = pd.read_json(io.StringIO(records_json))",
"metadata": {
"trusted": true
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": "def near_duplicates(factors, min_jaccard: float, max_levenshtein: int):\n    \"\"\"Find pairs of near-duplicate strings among `factors`.\n\n    A pair (i, j) with i < j is reported when the Jaccard similarity of the\n    two strings' character-trigram sets is at least `min_jaccard` and their\n    Levenshtein (edit) distance is at most `max_levenshtein`.\n\n    Args:\n        factors: sequence of strings to compare pairwise.\n        min_jaccard: minimum trigram Jaccard similarity (inclusive).\n        max_levenshtein: maximum edit distance (inclusive).\n\n    Returns:\n        A DataFrame with one row per matching pair and the columns\n        i, j, factor_i, factor_j, jaccard_ij, levenshtein_ij.\n    \"\"\"\n    from itertools import combinations\n\n    factors = list(factors)\n    trigrams = [{''.join(g) for g in nltk.ngrams(f, 3)} for f in factors]\n\n    acum = []\n    for i, j in combinations(range(len(factors)), 2):\n        union_size = len(trigrams[i] | trigrams[j])\n        if union_size == 0:\n            # Neither string has a trigram (both are shorter than 3\n            # characters): the similarity is undefined, so the pair can\n            # never reach the threshold.\n            continue\n        jaccard = len(trigrams[i] & trigrams[j]) / union_size\n        if not jaccard >= min_jaccard:\n            continue\n        # Edit distance is the expensive step, so it is only computed for\n        # pairs that already passed the trigram filter.\n        levenshtein = nltk.edit_distance(factors[i], factors[j])\n        if levenshtein <= max_levenshtein:\n            acum.append([i, j, factors[i], factors[j], jaccard, levenshtein])\n\n    return pd.DataFrame(acum, columns=['i', 'j', 'factor_i', 'factor_j', 'jaccard_ij', 'levenshtein_ij'])",
"metadata": {
"trusted": true
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": "def df_dups(df, cols=None, except_cols=(), min_jaccard=0.3, max_levenshtein=4):\n    \"\"\"Run `near_duplicates` over the string columns of a DataFrame.\n\n    Args:\n        df: DataFrame to inspect.\n        cols: columns to check; defaults to every column of `df`.\n        except_cols: columns to skip even when they appear in `cols`.\n        min_jaccard: a single number used for every column, or a mapping\n            indexed by column name.\n        max_levenshtein: a single number used for every column, or a\n            mapping indexed by column name.\n\n    Returns:\n        The per-column results of `near_duplicates` concatenated, with an\n        extra `col` column naming the source column. When no string column\n        was checked, an empty DataFrame with those same columns.\n    \"\"\"\n    if cols is None:\n        cols = df.columns\n\n    # Normalise both thresholds to objects that can be indexed by column name.\n    if isinstance(min_jaccard, numbers.Number):\n        mj = defaultdict(lambda: min_jaccard)\n    else:\n        mj = min_jaccard\n\n    if isinstance(max_levenshtein, numbers.Number):\n        ml = defaultdict(lambda: max_levenshtein)\n    else:\n        ml = max_levenshtein\n\n    acum = []\n    for c in cols:\n        # Excluded and non-string columns are not compared.\n        if c in except_cols or not is_string_dtype(df[c]):\n            continue\n\n        # Echo the column currently being processed.\n        print(c)\n\n        # factorize() returns (codes, uniques); only the uniques are compared.\n        factors = df[c].factorize()[1]\n        col_dups = near_duplicates(factors, mj[c], ml[c])\n        col_dups['col'] = c\n        acum.append(col_dups)\n\n    if not acum:\n        # pd.concat raises ValueError on an empty list, so build the empty\n        # result explicitly.\n        return pd.DataFrame(columns=['i', 'j', 'factor_i', 'factor_j', 'jaccard_ij', 'levenshtein_ij', 'col'])\n\n    return pd.concat(acum)",
"metadata": {
"trusted": true
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": "# Look for near-duplicate values in the selected columns\ndf_dups(\n    df,\n    cols=['Proveïdor', 'Objecte del contracte', 'Tipus Contracte'],\n)",
"metadata": {
"trusted": true
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": "",
"metadata": {},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment