Approximate Dictionary Matching
# -*- coding: utf-8 -*-
"""
This is the "Approximate dictionary matching" algorithm described in
http://www.aclweb.org/anthology/C10-1096.

Given a query string x, a collection of strings V, and a similarity
threshold α, the algorithm computes the size range (line 3) given by Table 1
of the paper. For each size l in the range, the algorithm computes the
minimum number of overlaps τ (line 4). The function overlapjoin (line 5)
finds similar strings by solving the following problem (τ-overlap join):
given a list of features of the query string X and the minimum number of
overlaps τ, enumerate strings of size l in the collection V such that they
have at least τ feature overlaps with X.
Algorithm:
    Input: V: collection of strings
    Input: x: query string
    Input: α: threshold for the similarity
    Output: Y: list of strings similar to the query

    1. X ← string_to_feature(x);
    2. Y ← [];
    3. for l ← min_y(|X|, α) to max_y(|X|, α) do
    4.     τ ← min_overlap(|X|, l, α);
    5.     R ← overlapjoin(X, τ, V, l);
    6.     foreach r ∈ R do append r to Y;
    7. end
    8. return Y;
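
For concreteness, with cosine similarity (Table 1 of the paper) and a
hypothetical query of |X| = 10 features at α = 0.7:
    size range:   l from ⌈α²·|X|⌉ = ⌈4.9⌉ = 5 to ⌊|X|/α²⌋ = ⌊20.4⌋ = 20
    min. overlap: at l = 10, τ = α·√(|X|·l) = 0.7·√100 = 7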
Some legalese:

The MIT License (MIT)

Copyright (c) 2016 Liling Tan

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
"""
from math import sqrt, ceil
from collections import defaultdict

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
def get_V(y, n):
    """
    Extract an n-gram feature matrix for string comparison for a specified
    order n.

    To extract the vectorized feature matrix, we could use the DIY way of
    (i) extracting n-grams, (ii) then creating a numpy array for the matrix,
    (iii) then fitting new queries to the matrix. But since scikit-learn
    already has those functions optimized for speed, we should use them.
    """
    # Note: ngram_range=(n-1, n) extracts both (n-1)-grams and n-grams as
    # features, not n-grams alone.
    ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(n-1, n), min_df=1)
    V = ngram_vectorizer.fit_transform(y)
    return V, ngram_vectorizer
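# A minimal usage sketch for get_V (the toy vocabulary is illustrative, not
# part of the original demo): V is a sparse count matrix with one row per
# vocabulary word and one column per character n-gram seen during fitting.
#
#   V, vectorizer = get_V(['apple', 'apples'], n=3)
#   V.shape                                    # (2, no. of distinct 2/3-grams)
#   vec = vectorizer.transform(['apply']).toarray()[0]   # query count vector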
def get_V_by_size(V):
    """
    Get an "indexed" view of the matrix V, where the key is the size and the
    values are the indices of the rows of V whose size corresponds to the key.

    This is an optimization trick. For this to scale, we might have to query
    a proper dataframe/database if our V is very large; otherwise, if it fits
    in RAM, we can simply prune the sequential queries by restricting the
    size.
    """
    V_by_size = defaultdict(list)
    for i, y_vec in enumerate(V):
        # The "size" of a string is its total feature count, i.e. the sum of
        # its n-gram counts.
        size_y = np.sum(y_vec.toarray())
        V_by_size[size_y].append(i)
    return V_by_size
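# Sketch of the resulting index (values are illustrative): if rows 0 and 3 of
# V have 7 features each and row 1 has 9, then
#
#   V_by_size == {7: [0, 3], 9: [1]}
#
# so a query that only needs candidates of size 7 touches two rows, not all.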
def min_max_y(size_X, alpha):
    """
    Computes the size range of Y given the size of X and alpha, following
    Table 1 of the paper for the cosine coefficient: the minimum size is
    ceil(alpha^2 * |X|) and the maximum size is floor(|X| / alpha^2).
    """
    return int(ceil(alpha * alpha * size_X)), int(size_X / (alpha * alpha))
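# Worked example: with |X| = 10 and alpha = 0.7, candidate sizes run from
# ceil(0.49 * 10) = 5 up to floor(10 / 0.49) = 20, i.e.
#
#   assert min_max_y(10, 0.7) == (5, 20)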
def overlapjoin(vec_x, tau, sub_V, l):
    """
    Enumerate the strings in the candidate subset of V that share at least
    *tau* overlapping n-grams with the query *vec_x*.
    """
    for i, _y in sub_V:
        # Possibly this can be optimized by checking through only the
        # non-zeros in the sparse matrix, but there might be some caveats
        # when synchronizing the non-zeros of *vec_x* and *_y*.
        # The feature overlap |X ∩ Y| takes the smaller of the two counts
        # for each feature.
        num_overlaps = sum(min(x_fi, y_fi) for x_fi, y_fi in zip(vec_x, _y))
        # "At least tau" overlaps, hence >= rather than >.
        if num_overlaps >= tau:
            yield i
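# For instance, if the query vector is [1, 2, 0, 1] and a candidate row is
# [1, 1, 1, 0], the overlap is min(1,1) + min(2,1) + min(0,1) + min(1,0) = 2,
# so the candidate survives only when tau <= 2.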
def approx_dict_matching(x, y, V=None, vectorizer=None, V_by_size=None,
                         n=3, alpha=0.7):
    """
    The "approximate dictionary matching" algorithm as described in
    http://www.aclweb.org/anthology/C10-1096

    :param x: The query word.
    :type x: str
    :param y: A list of words in the vocabulary.
    :type y: list(str)
    :param n: The order of ngrams, e.g. n=3 uses trigrams, n=2 uses bigrams.
    :type n: int
    :param alpha: A parameter to specify the length threshold of similar words.
    :type alpha: float
    """
    # Optimization hook: V, vectorizer and V_by_size can be precomputed once,
    # outside of approx_dict_matching(), and passed in for repeated queries.
    if V is None and vectorizer is None and V_by_size is None:
        V, vectorizer = get_V(y, n)
        # Optimization trick:
        # Pre-compute a dictionary to index the sizes of the rows in V.
        V_by_size = get_V_by_size(V)
    # Note that sklearn assumes a list of queries, thus the [x].
    # Step 1. X ← string_to_feature(x);
    vec_x = vectorizer.transform([x]).toarray()[0]
    # print(V.todense())  # Show feature matrix V in human-readable format.
    # print(vectorizer.transform([x]).toarray()[0])  # Show the query vector.
    # Find the size of X.
    size_X = sum(vec_x)
    # Get the min and max size of y, given the alpha tolerance parameter.
    min_y, max_y = min_max_y(size_X, alpha)
    # Step 2. Y ← [];
    output = set()
    # Step 3. for l ← min_y(|X|, α) to max_y(|X|, α) do
    # (The paper's pseudocode treats max_y as inclusive, hence the +1.)
    for l in range(min_y, max_y + 1):
        # Step 4: τ ← min_overlap(|X|, l, α);
        tau = alpha * sqrt(size_X * l)
        # A subset of V where the words are of size l.
        sub_V_indices = V_by_size.get(l)
        if sub_V_indices:
            sub_V = [(i, V[i].toarray()[0]) for i in sub_V_indices]
            # Step 5: R ← overlapjoin(X, τ, V, l);
            R = list(overlapjoin(vec_x, tau, sub_V, l))
            # Step 6: foreach r ∈ R do append r to Y;
            output.update(R)
    return set(y[i] for i in output)
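# When matching many queries against the same vocabulary, the indexing cost
# can be amortized by building the matrix once and passing it in explicitly.
# A minimal sketch (`vocab` and `queries` are hypothetical names):
#
#   V, vectorizer = get_V(vocab, n=3)
#   V_by_size = get_V_by_size(V)
#   for q in queries:
#       matches = approx_dict_matching(q, vocab, V=V, vectorizer=vectorizer,
#                                      V_by_size=V_by_size)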
def demo_adam():
    x, y = 'israel', ['rafael', 'israeli', 'notreal', 'quantico', 'foo', 'bar',
                      'geauflaufen', 'israelite', 'isrel', 'irsael', 'israeling']
    print(approx_dict_matching(x, y))
    x, y = 'apple', ['apple-pie', 'zappel', 'trippel',
                     'grapple', 'apples', 'appleing']
    print(approx_dict_matching(x, y))
    x, y = 'tame', ['lame', 'tamer', 'lamer', 'gamer', 'taming']
    print(approx_dict_matching(x, y))
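# Run the demo when this file is executed as a script.
if __name__ == '__main__':
    demo_adam()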