Approximate Dictionary Matching
# -*- coding: utf-8 -*-
"""
This is the "approximate dictionary matching" algorithm described in
http://www.aclweb.org/anthology/C10-1096.

Given a query string x, a collection of strings V, and a similarity
threshold α, the algorithm computes the size range (line 3) given by Table 1.
For each size l in the range, the algorithm computes the minimum number of
overlaps τ (line 4). The function overlapjoin (line 5) finds similar strings
by solving the following problem (τ-overlap join): given the list of features
X of the query string and the minimum number of overlaps τ, enumerate strings
of size l in the collection V such that they have at least τ feature overlaps
with X.
Algorithm:
    Input:  V: collection of strings
    Input:  x: query string
    Input:  α: threshold for the similarity
    Output: Y: list of strings similar to the query

    1. X ← string_to_feature(x);
    2. Y ← [];
    3. for l ← min_y(|X|, α) to max_y(|X|, α) do
    4.     τ ← min_overlap(|X|, l, α);
    5.     R ← overlapjoin(X, τ, V, l);
    6.     foreach r ∈ R do append r to Y;
    7. end
    8. return Y;
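
This implementation assumes the cosine-similarity bounds of Table 1,
matching min_max_y() and the τ computation below:

    min_y(|X|, α)          = α² · |X|
    max_y(|X|, α)          = |X| / α²
    min_overlap(|X|, l, α) = α · sqrt(|X| · l)

(the size bounds are truncated to integers with int()).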
Some legalese:

The MIT License (MIT)
Copyright (c) 2016 Liling Tan

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is furnished to do
so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
"""
from math import sqrt
from collections import defaultdict

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def get_V(y, n):
    """
    Extract a character ngram feature matrix for string comparison, for a
    specified order n. Note that ngram_range=(n-1, n) makes the vectorizer
    extract both (n-1)-grams and n-grams as features.

    To extract the vectorized feature matrix, we could do it the DIY way:
    (i) extract the ngrams, (ii) create a numpy array for the matrix, and
    (iii) fit new queries to the matrix. But since scikit-learn already has
    these functions optimized for speed, we should use them.
    """
    ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(n-1, n), min_df=1)
    V = ngram_vectorizer.fit_transform(y)
    return V, ngram_vectorizer
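
# A small usage sketch for get_V() (hypothetical two-word vocabulary):
#
#   >>> V, vec = get_V(['apple', 'apples'], n=3)
#   >>> V.shape[0]  # one row per vocabulary word
#   2
#
# Each column of V counts one character bigram or trigram (for n=3) observed
# somewhere in the vocabulary.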

def get_V_by_size(V):
    """
    Get an "indexed" view of the matrix V, where each key is a size and the
    values are the indices of the rows in V whose size corresponds to the key.

    This is an optimization trick. For this to scale, we might have to query
    a proper dataframe/database if our V is very, very large; otherwise, if
    it fits in RAM, we can simply prune the sequential queries by restricting
    the size.
    """
    V_by_size = defaultdict(list)
    for i, y_vec in enumerate(V):
        size_y = np.sum(y_vec.toarray())
        V_by_size[size_y].append(i)
    return V_by_size
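
# For example (hypothetical sizes): if rows 0 and 2 of V each sum to 7 ngram
# occurrences, then V_by_size[7] == [0, 2], and a query restricted to size 7
# only needs to be compared against those two rows.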

def min_max_y(size_X, alpha):
    """
    Computes the size range of y given the size of X and alpha, using the
    cosine-similarity bounds [α² · |X|, |X| / α²] from Table 1 of the paper.
    """
    return int(alpha * alpha * size_X), int(size_X / (alpha * alpha))
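
# A quick worked example of the bounds, assuming alpha=0.7:
#
#   min_max_y(10, 0.7) == (4, 20)
#
# since int(0.49 * 10) == 4 and int(10 / 0.49) == 20 (both truncated).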

def overlapjoin(vec_x, tau, sub_V, l):
    """
    Count the overlapping ngrams between the query *vec_x* and the candidate
    subset of V, and yield the indices of candidates that have at least *tau*
    feature overlaps with the query.
    """
    for i, _y in sub_V:
        # Possibly this can be optimized by iterating through only the
        # non-zeros in the sparse matrix, but there might be some caveats
        # when synchronizing the non-zeros of *vec_x* and *_y*.
        # The overlap of two count vectors is the sum of the elementwise
        # minima, i.e. the size of the multiset intersection.
        num_overlaps = sum(min(x_fi, y_fi) for x_fi, y_fi in zip(vec_x, _y))
        # The algorithm asks for *at least* tau overlaps, hence >=.
        if num_overlaps >= tau:
            yield i
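
# For instance (a made-up pair of count vectors): with vec_x = [1, 0, 2] and
# _y = [1, 1, 1], the overlap is min(1,1) + min(0,1) + min(2,1) = 2, so the
# candidate index is yielded whenever tau <= 2.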

def approx_dict_matching(x, y, V=None, vectorizer=None, V_by_size=None,
                         n=3, alpha=0.7):
    """
    The "approximate dictionary matching" algorithm as described in
    http://www.aclweb.org/anthology/C10-1096

    :param x: The query word.
    :type x: str
    :param y: A list of words in the vocabulary.
    :type y: list(str)
    :param n: The order of ngrams, e.g. n=3 uses trigrams, n=2 uses bigrams.
    :type n: int
    :param alpha: A parameter to specify the length threshold of similar words.
    :type alpha: float
    """
    # Optimization: V can be instantiated once outside of approx_dict_matching()
    # and passed in, so that repeated queries do not re-vectorize the vocabulary.
    if V is None and vectorizer is None and V_by_size is None:
        V, vectorizer = get_V(y, n)
        # Optimization trick:
        # Pre-compute a dictionary indexing the sizes of the non-zero values in V.
        V_by_size = get_V_by_size(V)

    # Note that sklearn assumes a list of queries, thus the [x].
    # Step 1. X ← string_to_feature(x);
    vec_x = vectorizer.transform([x]).toarray()[0]
    # print(V.todense())  # Show feature matrix V in human-readable format.
    # print(vectorizer.transform([x]).toarray()[0])  # Show vector for query string, x.

    # Find the size of X.
    size_X = sum(vec_x)
    # Get the min and max size of y, given the alpha tolerance parameter.
    min_y, max_y = min_max_y(size_X, alpha)
    # Step 2. Y ← [];
    output = set()
    # Step 3. for l ← min_y(|X|, α) to max_y(|X|, α) do
    # (the range in the pseudocode is inclusive of max_y, hence the +1)
    for l in range(min_y, max_y + 1):
        # Step 4. τ ← min_overlap(|X|, l, α);
        tau = alpha * sqrt(size_X * l)
        # A subset of V where the words are of size l.
        sub_V_indices = V_by_size.get(l)
        if sub_V_indices:
            sub_V = [(i, V[i].toarray()[0]) for i in sub_V_indices]
            # Step 5. R ← overlapjoin(X, τ, V, l);
            R = list(overlapjoin(vec_x, tau, sub_V, l))
            # Step 6. foreach r ∈ R do append r to Y;
            output.update(R)
    return set(y[i] for i in output)
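
# To amortize the vectorization cost over many queries, build V once and pass
# it in explicitly (a sketch using the functions above, with made-up words):
#
#   words = ['israel', 'israeli', 'israelite']
#   V, vec = get_V(words, n=3)
#   V_by_size = get_V_by_size(V)
#   approx_dict_matching('israel', words, V=V, vectorizer=vec,
#                        V_by_size=V_by_size)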

def demo_adam():
    x, y = 'israel', ['rafael', 'israeli', 'notreal', 'quantico', 'foo', 'bar',
                      'geauflaufen', 'israelite', 'isrel', 'irsael', 'israeling']
    print(approx_dict_matching(x, y))
    x, y = 'apple', ['apple-pie', 'zappel', 'trippel',
                     'grapple', 'apples', 'appleing']
    print(approx_dict_matching(x, y))
    x, y = 'tame', ['lame', 'tamer', 'lamer', 'gamer', 'taming']
    print(approx_dict_matching(x, y))
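
if __name__ == '__main__':
    # Run the demos when this file is executed as a script.
    demo_adam()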