Skip to content

Instantly share code, notes, and snippets.

@maheshakya
Last active November 26, 2015 10:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save maheshakya/b22f640f67d7b574fd56 to your computer and use it in GitHub Desktop.
Save maheshakya/b22f640f67d7b574fd56 to your computer and use it in GitHub Desktop.
This is a rough implementation of LSH forest using sorted arrays and binary search for queries. (Still incomplete)
import numpy as np
from sklearn.metrics import euclidean_distances
#Re-implementation of bisect functions of bisect module to suit the application
def bisect_left(a, x):
lo = 0
hi = len(a)
while lo < hi:
mid = (lo+hi)//2
if a[mid] < x:
lo = mid + 1
else:
hi = mid
return lo
def bisect_right(a, x):
lo = 0
hi = len(a)
while lo < hi:
mid = (lo+hi)//2
if x < a[mid] and not a[mid][:len(x)]==x:
hi = mid
else:
lo = mid + 1
return lo
#function which accepts an sorted array of bit strings, a query string
#This returns an array containing all indices which share the first h bits of the query
def simpleFunctionBisectReImplemented(sorted_array, item, h):
left_index = bisect_left(sorted_array, item[:h])
right_index = bisect_right(sorted_array, item[:h])
return np.arange(left_index, right_index)
def find_longest_prefix_match(bit_string_list, query):
hi = len(query)
lo = 0
if len(simpleFunctionBisectReImplemented(bit_string_list, query, hi)) > 0:
return hi
while lo < hi:
mid = (lo+hi)//2
k = len(simpleFunctionBisectReImplemented(bit_string_list, query, mid))
if k > 0:
lo = mid + 1
res = mid
else:
hi = mid
return res
class LSH_forest(object):
"""
LSH forest implementation using numpy sorted arrays and
binary search. This is an initial effor for a hack.
NOT the final version.
attributes
----------
max_label_lengh: maximum length of hash
number_of_trees: number of trees build in indexing
"""
def __init__(self, max_label_length = 32, number_of_trees = 5):
self.max_label_length = max_label_length
self.number_of_trees = number_of_trees
self.min_label_length = 20
self.random_state = np.random.RandomState(seed=1)
def _get_random_hyperplanes(self, hash_size = None, dim = None):
"""
Generates hyperplanes from standard normal distribution and return
it as a 2D numpy array. This is g(p,x) for a particular tree.
"""
if hash_size == None or dim == None:
raise ValueError("hash_size or dim(number of dimensions) cannot be None.")
return self.random_state.randn(hash_size, dim)
def _hash(self, input_point = None, hash_function = None):
"""
Does hash on the data point with the provided hash_function: g(p,x).
"""
if input_point == None or hash_function == None:
raise ValueError("input_point or hash_function cannot be None.")
projections = np.dot(hash_function, input_point)
return "".join(['1' if i > 0 else '0' for i in projections])
def _create_tree(self, input_array = None, hash_function = None):
"""
Builds a single tree (in this case creates a sorted array of
binary hashes).
"""
if input_array == None or hash_function == None:
raise ValueError("input_array or hash_funciton cannot be None.")
number_of_points = input_array.shape[0]
binary_hashes = []
for i in range(number_of_points):
binary_hashes.append(self._hash(input_array[i], hash_function))
binary_hashes = np.array(binary_hashes)
o_i = np.argsort(binary_hashes)
return o_i, np.sort(binary_hashes)
def _compute_distances(self, query, candidates):
distances = euclidean_distances(query, self.input_array[candidates])
return np.argsort(distances), distances
def build_index(self, input_array = None):
"""
Builds index.
"""
if input_array == None:
raise ValueError("input_array cannot be None")
self.input_array = np.array(input_array)
number_of_points = input_array.shape[0]
dim = input_array.shape[1]
#Creates a g(p,x) for each tree
self.hash_functions = []
self.trees = []
self.original_indices = []
for i in range(self.number_of_trees):
""""
hash_size = self.random_state.randint(self.min_label_length,
self.max_label_length+1)
"""
hash_size = self.max_label_length
hash_function = self._get_random_hyperplanes(hash_size = hash_size, dim = dim)
o_i, bin_hashes = self._create_tree(input_array, hash_function)
self.original_indices.append(o_i)
self.trees.append(bin_hashes)
self.hash_functions.append(hash_function)
self.hash_functions = np.array(self.hash_functions)
self.trees = np.array(self.trees)
self.original_indices = np.array(self.original_indices)
def query(self, query = None, c = 1, m = 10):
"""
returns the number of neighbors for a given query.
"""
if query == None:
raise ValueError("query cannot be None.")
query = np.array(query)
#descend phase
max_depth = 0
for i in range(len(self.trees)):
bin_query = self._hash(query, self.hash_functions[i])
k = find_longest_prefix_match(self.trees[i], bin_query)
if k > max_depth:
max_depth = k
#Asynchronous ascend phase
candidates = []
number_of_candidates = c*len(self.trees)
while max_depth > 0 and (len(candidates) < number_of_candidates or len(set(candidates)) < m):
for i in range(len(self.trees)):
bin_query = self._hash(query, self.hash_functions[i])
candidates.extend(self.original_indices[i,simpleFunctionBisectReImplemented(self.trees[i],
bin_query, max_depth)].tolist())
#candidates = list(OrderedSet(candidates)) #this keeps the order inserted into the list
max_depth = max_depth - 1
#print max_depth, len(candidates) ,len(set(candidates))
candidates = np.array(list(set(candidates)))
ranks, distances = self._compute_distances(query, candidates)
#print ranks[0,:m]
print candidates.shape
return candidates[ranks[0,:m]]
def query_num_candidates(self, query = None, c = 1, m = 10):
"""
returns the number of neighbors for a given query.
"""
if query == None:
raise ValueError("query cannot be None.")
query = np.array(query)
#descend phase
max_depth = 0
for i in range(len(self.trees)):
bin_query = self._hash(query, self.hash_functions[i])
k = find_longest_prefix_match(self.trees[i], bin_query)
if k > max_depth:
max_depth = k
#Asynchronous ascend phase
candidates = []
number_of_candidates = c*len(self.trees)
while max_depth > 0 and (len(candidates) < number_of_candidates or len(set(candidates)) < m):
for i in range(len(self.trees)):
bin_query = self._hash(query, self.hash_functions[i])
candidates.extend(self.original_indices[i,simpleFunctionBisectReImplemented(self.trees[i],
bin_query, max_depth)].tolist())
#candidates = list(OrderedSet(candidates)) #this keeps the order inserted into the list
max_depth = max_depth - 1
#print max_depth, len(candidates) ,len(set(candidates))
candidates = np.array(list(set(candidates)))
ranks, distances = self._compute_distances(query, candidates)
#print ranks[0,:m]
return candidates[ranks[0,:m]], candidates.shape[0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment