Skip to content

Instantly share code, notes, and snippets.

@moretea
Created June 4, 2018 15:49
Show Gist options
  • Save moretea/4b8620e6ce49849788956db0660c8b68 to your computer and use it in GitHub Desktop.
Save moretea/4b8620e6ce49849788956db0660c8b68 to your computer and use it in GitHub Desktop.
Memory mapped file example
from annoy import AnnoyIndex
import struct
# Number of components every vector in the input file must have.
VECTOR_LENGTH = 300

# "angular" is the metric Annoy used when none was given; naming it explicitly
# keeps the behaviour and avoids the FutureWarning newer Annoy versions emit.
annoy_index = AnnoyIndex(VECTOR_LENGTH, metric="angular")

nr_lines = 0  # Number of lines read so far.
words = []    # (word, vector) pairs, one per input line.

print("Reading dataset")
# The words are written to the index as UTF-8 later on, so read them as UTF-8
# explicitly rather than relying on the platform's default encoding.
with open("glove.6B.300d.txt", encoding="utf-8") as glove_file:
    for nr_lines, line in enumerate(glove_file, start=1):
        # Each line is "<word> <v1> <v2> ... <vN>", separated by single spaces.
        parts = line.split(" ")
        word = parts[0]
        vector = [float(x) for x in parts[1:]]
        # Raise instead of assert: asserts disappear under ``python -O``.
        if len(vector) != VECTOR_LENGTH:
            raise ValueError(
                "line %d: expected %d vector components, found %d"
                % (nr_lines, VECTOR_LENGTH, len(vector))
            )
        words.append((word, vector))
        # For quick experiments, stop after a prefix of the dataset:
        # if nr_lines >= 5000:
        #     break
# Order the entries by their text. The item number handed to Annoy below is the
# position in this sorted list, which is what makes a binary search over the
# words possible later on.
print("Sorting dataset")
words.sort(key=lambda entry: entry[0])

# Register every vector under its position in the sorted list, then build and
# persist the k-NN structure.
print("Build k-NN datastructure")
for item_nr, entry in enumerate(words):
    annoy_index.add_item(item_nr, entry[1])
annoy_index.build(20)  # Number of trees; this value still needs testing/tuning.
annoy_index.save("vectors.ann")
# Build the lookup table from item number to word text.
#
# Layout of "vectors.idx" (all integers little-endian unsigned 32-bit):
#   1. the number of words;
#   2. one entry per word holding the absolute file offset of its text
#      (fixed width, so entry i can be found without scanning);
#   3. the words themselves as NUL-terminated UTF-8 strings, so a reader can
#      treat each offset as the start of a C-style string.
print("Build lookup table")

# Encode every word exactly once; both the offsets and the string section
# need the encoded form.
encoded_words = [entry[0].encode("UTF-8") for entry in words]

with open("vectors.idx", "wb") as vi:
    # Write how big the lookup list is going to be.
    vi.write(struct.pack("<I", len(encoded_words)))

    # The strings start right after the header: one 4-byte slot for the
    # length plus one 4-byte slot per word.
    word_idx = (1 + len(encoded_words)) * 4

    # Write the offset of each word's text.
    for encoded in encoded_words:
        vi.write(struct.pack("<I", word_idx))
        word_idx += len(encoded) + 1  # Length of the word plus its NUL byte.

    # Write the NUL-terminated strings in the same order as the offsets.
    for encoded in encoded_words:
        vi.write(encoded + b"\x00")
from annoy import AnnoyIndex
import struct
import mmap
import ctypes
# Number of components per vector; passed to AnnoyIndex below before
# "vectors.ann" is loaded.
VECTOR_LENGTH = 300
# Start-up marker ("INGELADEN" is Dutch for "loaded").
print("INGELADEN")
class WordIndex:
    """Read-only, memory-mapped table mapping item numbers to words and back.

    Expected file layout (all integers little-endian unsigned 32-bit):

    * bytes 0-3: the number of words ``n``;
    * the next ``4 * n`` bytes: for each word, the absolute file offset at
      which its text starts;
    * the rest: the words as NUL-terminated UTF-8 strings.

    ``find_index`` performs a binary search, so the words must be stored in
    ascending order of their UTF-8 bytes.

    The object can be used as a context manager; otherwise call ``close()``
    when done to release the mapping and the file handle.
    """

    def __init__(self, index_file):
        """Open and memory-map ``index_file``.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if the file is empty (an empty file cannot be mapped).
        """
        self.file = open(index_file, "rb")
        try:
            self.mm = mmap.mmap(self.file.fileno(), 0, access=mmap.ACCESS_READ)
        except Exception:
            # Do not leak the file handle when mapping fails.
            self.file.close()
            raise
        self.nr_words = self._get_long(0)

    def close(self):
        """Release the memory map and close the underlying file."""
        self.mm.close()
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def find_index(self, word):
        """Return the item number of ``word`` (a ``str``), or ``None`` if absent.

        Uses a binary search over the stored words, comparing UTF-8 bytes.
        """
        target = word.encode("utf-8")
        low = 0
        # Valid item numbers are 0 .. nr_words - 1; starting at nr_words would
        # let the search probe one entry past the end of the offset table.
        high = self.nr_words - 1
        while low <= high:
            midpoint = (low + high) // 2
            midword = self.get_word(midpoint)
            if midword == target:
                return midpoint
            if target < midword:
                high = midpoint - 1
            else:
                low = midpoint + 1
        return None

    def get_word(self, idx):
        """Return the word with item number ``idx`` as UTF-8 encoded ``bytes``.

        Raises:
            IndexError: if ``idx`` is not in ``range(self.nr_words)``.
        """
        if not 0 <= idx < self.nr_words:
            raise IndexError("word index out of range: %r" % (idx,))
        # Slot 0 holds the word count, so the offset of word idx is in slot idx + 1.
        start = self._get_long((1 + idx) * 4)
        # The text runs up to (not including) the terminating NUL byte.
        end = self.mm.find(b"\x00", start)
        if end == -1:
            # Missing terminator: fall back to the end of the file.
            end = len(self.mm)
        return self.mm[start:end]

    def _get_long(self, addr):
        """Read a little-endian unsigned 32-bit integer at byte offset ``addr``."""
        return struct.unpack_from("<I", self.mm, addr)[0]
# Open the word lookup table and load the Annoy index from disk.
word_index = WordIndex("vectors.idx")
# "angular" is the metric Annoy used when none was given; naming it explicitly
# keeps the behaviour and avoids the FutureWarning newer Annoy versions emit.
annoy_index = AnnoyIndex(VECTOR_LENGTH, metric="angular")
annoy_index.load("vectors.ann")

# Item numbers of a few example words; find_index yields None for a word that
# is not in the index.
apple_nr = word_index.find_index("apple")
company_nr = word_index.find_index("company")
fruit_nr = word_index.find_index("fruit")
def print_close(what_nr):
    """Print the words for the 10 nearest neighbours Annoy returns for ``what_nr``.

    ``what_nr`` is an item number in the module-level ``annoy_index``; the
    neighbours are translated back to words through ``word_index``.
    """
    neighbour_nrs = annoy_index.get_nns_by_item(what_nr, 10)
    print("close to", list(map(word_index.get_word, neighbour_nrs)))
# Show the neighbourhood of each example word.
for example_nr in (apple_nr, fruit_nr, company_nr):
    print_close(example_nr)

# Debugging aid: uncomment to dump every word stored in the index.
# for i in range(0, word_index.nr_words):
#     print(word_index.get_word(i).decode("utf-8"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment