moretea/build.py

## build.py
from annoy import AnnoyIndex
import struct

# Number of float components expected after the word on every dataset line.
VECTOR_LENGTH = 300

# "angular" is Annoy's default metric; it is passed explicitly so the index
# does not depend on a library default.
annoy_index = AnnoyIndex(VECTOR_LENGTH, "angular")

nr_lines = 0
words = []  # (word, vector) pairs, in file order until sorted further down
print("Reading dataset")
# The words are written back out as UTF-8 further down, so decode the input as
# UTF-8 too instead of relying on the platform's default encoding.
with open("glove.6B.300d.txt", encoding="utf-8") as dataset:
    for line in dataset:
        # Line format: the word, then the vector components, space separated.
        parts = line.split(" ")
        word = parts[0]
        vector = [float(x) for x in parts[1:]]
        # Explicit check rather than `assert`, which is stripped under -O.
        if len(vector) != VECTOR_LENGTH:
            raise ValueError(
                "line {}: expected {} vector components, got {}".format(
                    nr_lines + 1, VECTOR_LENGTH, len(vector)
                )
            )
        words.append((word, vector))
        nr_lines += 1
        # For quick experiments, stop early here (e.g. once nr_lines >= 5000).

# Order the entries by word so the lookup side can binary-search the word list.
print("Sorting dataset")
words.sort(key=lambda entry: entry[0])

# Feed every vector to Annoy; an item's id is its position in the sorted list.
print("Build k-NN datastructure")
for idx, word in enumerate(words):
    vector = word[1]
    annoy_index.add_item(idx, vector)
annoy_index.build(20)  # this value still needs testing/tuning
annoy_index.save("vectors.ann")

# Write the word -> index lookup file.
# Layout (every integer is a little-endian uint32):
#   [count][offset of word 0 .. offset of word count-1][word 0 NUL word 1 NUL ...]
print("Build lookup table")
with open("vectors.idx", "wb") as vi:
    # Header: how many entries the offset table holds.
    vi.write(struct.pack("<I", len(words)))

    # The string area starts right after the header integer and the
    # fixed-width offset table (4 bytes per entry), which keeps index
    # lookups cheap.
    word_idx = 4 * (len(words) + 1)

    # Offset table: absolute file position of each word's first byte.
    for word in words:
        vi.write(struct.pack("<I", word_idx))
        encoded = word[0].encode("UTF-8")
        word_idx += len(encoded) + 1  # +1 for the terminating NUL byte

    # String area: each word as UTF-8 followed by a NUL byte, so a reader can
    # compute the address and treat it as a C string. No alignment padding is
    # written between words.
    for word in words:
        vi.write(word[0].encode("UTF-8") + b"\x00")

## lookup.py
from annoy import AnnoyIndex
import struct
import mmap
import ctypes
# Vector length handed to AnnoyIndex when the k-NN index is loaded below.
VECTOR_LENGTH = 300

# Start-up marker printed when the script begins; "INGELADEN" is Dutch for "LOADED".
print("INGELADEN")

class WordIndex:
    """Read-only, mmap-backed view of the word lookup file.

    File layout as read by this class (every integer is a little-endian
    uint32): bytes 0-3 hold the number of words, the next ``nr_words``
    integers hold the absolute file offset of each word, and each word is a
    NUL-terminated byte string. ``find_index`` performs a binary search, so
    the words must be stored in ascending byte order.

    Words are sliced directly out of the mapping; no raw pointers into the
    mapped memory are taken. The instance can be used as a context manager,
    or released explicitly with ``close()``.
    """

    def __init__(self, index_file):
        """Map ``index_file`` into memory and read the word count.

        Raises whatever ``open`` / ``mmap.mmap`` raise (e.g. ``OSError`` for a
        missing file, ``ValueError`` for an empty one).
        """
        self.file = open(index_file, "rb")
        try:
            self.mm = mmap.mmap(self.file.fileno(), 0, access=mmap.ACCESS_READ)
        except Exception:
            # Do not leak the file handle when the mapping cannot be created.
            self.file.close()
            raise
        self.nr_words = self._get_long(0)

    def close(self):
        """Release the memory mapping and the underlying file."""
        self.mm.close()
        self.file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def find_index(self, word):
        """Binary-search for ``word`` and return its index.

        ``word`` is a ``str``; it is compared as UTF-8 bytes against the
        stored words. Returns the integer index, or ``None`` when the word is
        not present.
        """
        target = word.encode("utf-8")
        low = 0
        # Last valid index; starting at nr_words would probe one slot past
        # the end of the offset table.
        high = self.nr_words - 1

        while low <= high:
            midpoint = (low + high) // 2
            midword = self.get_word(midpoint)
            if midword == target:
                return midpoint
            if target < midword:
                high = midpoint - 1
            else:
                low = midpoint + 1
        return None

    def get_word(self, idx):
        """Return the word stored at index ``idx`` as ``bytes`` (no NUL).

        Raises ``IndexError`` when ``idx`` is outside ``[0, nr_words)`` and
        ``ValueError`` when the stored string has no NUL terminator.
        """
        if not 0 <= idx < self.nr_words:
            raise IndexError("word index out of range: {}".format(idx))
        # Slot 0 of the file is the word count, so entry idx lives at slot idx + 1.
        start = self._get_long((1 + idx) * 4)
        end = self.mm.find(b"\x00", start)
        if end == -1:
            raise ValueError("unterminated word at offset {}".format(start))
        return self.mm[start:end]

    def _get_long(self, addr):
        """Read the little-endian uint32 stored at byte offset ``addr``."""
        return struct.unpack_from("<I", self.mm, addr)[0]

# Open the word lookup table from "vectors.idx".
word_index = WordIndex("vectors.idx")

# Load the k-NN index. "angular" is Annoy's default metric; it is passed
# explicitly so loading does not depend on a library default.
annoy_index = AnnoyIndex(VECTOR_LENGTH, "angular")
annoy_index.load("vectors.ann")

# Resolve the demo words to their item ids.
apple_nr   = word_index.find_index("apple")
company_nr = word_index.find_index("company")
fruit_nr   = word_index.find_index("fruit")

def print_close(what_nr):
    """Print the words for the 10 nearest items Annoy returns for `what_nr`."""
    close_words = []
    for neighbour in annoy_index.get_nns_by_item(what_nr, 10):
        close_words.append(word_index.get_word(neighbour))
    print("close to", close_words)

# Show the neighbours of each demo word: apple, then fruit, then company.
for demo_nr in (apple_nr, fruit_nr, company_nr):
    print_close(demo_nr)


#for i in range(0, word_index.nr_words):
#    print(word_index.get_word(i).decode("utf-8"))

## requirements.txt
annoy>=1.12.0
	from annoy import AnnoyIndex
	import struct

	# Number of float components expected after the word on every dataset line.
	VECTOR_LENGTH = 300

	# "angular" is Annoy's default metric; it is passed explicitly so the index
	# does not depend on a library default.
	annoy_index = AnnoyIndex(VECTOR_LENGTH, "angular")

	nr_lines = 0
	words = []  # (word, vector) pairs, in file order until sorted further down
	print("Reading dataset")
	# The words are written back out as UTF-8 further down, so decode the input
	# as UTF-8 too instead of relying on the platform's default encoding.
	with open("glove.6B.300d.txt", encoding="utf-8") as dataset:
	    for line in dataset:
	        # Line format: the word, then the vector components, space separated.
	        parts = line.split(" ")
	        word = parts[0]
	        vector = [float(x) for x in parts[1:]]
	        # Explicit check rather than `assert`, which is stripped under -O.
	        if len(vector) != VECTOR_LENGTH:
	            raise ValueError(
	                "line {}: expected {} vector components, got {}".format(
	                    nr_lines + 1, VECTOR_LENGTH, len(vector)
	                )
	            )
	        words.append((word, vector))
	        nr_lines += 1
	        # For quick experiments, stop early here (e.g. once nr_lines >= 5000).

	# Order the entries by word so the lookup side can binary-search the word list.
	print("Sorting dataset")
	words.sort(key=lambda entry: entry[0])

	# Feed every vector to Annoy; an item's id is its position in the sorted list.
	print("Build k-NN datastructure")
	for idx, word in enumerate(words):
	    vector = word[1]
	    annoy_index.add_item(idx, vector)
	annoy_index.build(20)  # this value still needs testing/tuning
	annoy_index.save("vectors.ann")

	# Write the word -> index lookup file.
	# Layout (every integer is a little-endian uint32):
	#   [count][offset of word 0 .. offset of word count-1][word 0 NUL word 1 NUL ...]
	print("Build lookup table")
	with open("vectors.idx", "wb") as vi:
	    # Header: how many entries the offset table holds.
	    vi.write(struct.pack("<I", len(words)))

	    # The string area starts right after the header integer and the
	    # fixed-width offset table (4 bytes per entry), which keeps index
	    # lookups cheap.
	    word_idx = 4 * (len(words) + 1)

	    # Offset table: absolute file position of each word's first byte.
	    for word in words:
	        vi.write(struct.pack("<I", word_idx))
	        encoded = word[0].encode("UTF-8")
	        word_idx += len(encoded) + 1  # +1 for the terminating NUL byte

	    # String area: each word as UTF-8 followed by a NUL byte, so a reader
	    # can compute the address and treat it as a C string. No alignment
	    # padding is written between words.
	    for word in words:
	        vi.write(word[0].encode("UTF-8") + b"\x00")
	from annoy import AnnoyIndex
	import struct
	import mmap
	import ctypes
	# Vector length handed to AnnoyIndex when the k-NN index is loaded below.
	VECTOR_LENGTH = 300

	# Start-up marker printed when the script begins; "INGELADEN" is Dutch for "LOADED".
	print("INGELADEN")

	class WordIndex:
	    """Read-only, mmap-backed view of the word lookup file.

	    File layout as read by this class (every integer is a little-endian
	    uint32): bytes 0-3 hold the number of words, the next ``nr_words``
	    integers hold the absolute file offset of each word, and each word is
	    a NUL-terminated byte string. ``find_index`` performs a binary search,
	    so the words must be stored in ascending byte order.

	    Words are sliced directly out of the mapping; no raw pointers into the
	    mapped memory are taken. The instance can be used as a context
	    manager, or released explicitly with ``close()``.
	    """

	    def __init__(self, index_file):
	        """Map ``index_file`` into memory and read the word count.

	        Raises whatever ``open`` / ``mmap.mmap`` raise (e.g. ``OSError``
	        for a missing file, ``ValueError`` for an empty one).
	        """
	        self.file = open(index_file, "rb")
	        try:
	            self.mm = mmap.mmap(self.file.fileno(), 0, access=mmap.ACCESS_READ)
	        except Exception:
	            # Do not leak the file handle when the mapping cannot be created.
	            self.file.close()
	            raise
	        self.nr_words = self._get_long(0)

	    def close(self):
	        """Release the memory mapping and the underlying file."""
	        self.mm.close()
	        self.file.close()

	    def __enter__(self):
	        return self

	    def __exit__(self, exc_type, exc_value, traceback):
	        self.close()

	    def find_index(self, word):
	        """Binary-search for ``word`` and return its index.

	        ``word`` is a ``str``; it is compared as UTF-8 bytes against the
	        stored words. Returns the integer index, or ``None`` when the word
	        is not present.
	        """
	        target = word.encode("utf-8")
	        low = 0
	        # Last valid index; starting at nr_words would probe one slot past
	        # the end of the offset table.
	        high = self.nr_words - 1

	        while low <= high:
	            midpoint = (low + high) // 2
	            midword = self.get_word(midpoint)
	            if midword == target:
	                return midpoint
	            if target < midword:
	                high = midpoint - 1
	            else:
	                low = midpoint + 1
	        return None

	    def get_word(self, idx):
	        """Return the word stored at index ``idx`` as ``bytes`` (no NUL).

	        Raises ``IndexError`` when ``idx`` is outside ``[0, nr_words)`` and
	        ``ValueError`` when the stored string has no NUL terminator.
	        """
	        if not 0 <= idx < self.nr_words:
	            raise IndexError("word index out of range: {}".format(idx))
	        # Slot 0 of the file is the word count, so entry idx lives at slot idx + 1.
	        start = self._get_long((1 + idx) * 4)
	        end = self.mm.find(b"\x00", start)
	        if end == -1:
	            raise ValueError("unterminated word at offset {}".format(start))
	        return self.mm[start:end]

	    def _get_long(self, addr):
	        """Read the little-endian uint32 stored at byte offset ``addr``."""
	        return struct.unpack_from("<I", self.mm, addr)[0]

	# Open the word lookup table from "vectors.idx".
	word_index = WordIndex("vectors.idx")

	# Load the k-NN index. "angular" is Annoy's default metric; it is passed
	# explicitly so loading does not depend on a library default.
	annoy_index = AnnoyIndex(VECTOR_LENGTH, "angular")
	annoy_index.load("vectors.ann")

	# Resolve the demo words to their item ids.
	apple_nr = word_index.find_index("apple")
	company_nr = word_index.find_index("company")
	fruit_nr = word_index.find_index("fruit")

	def print_close(what_nr):
	    """Print the words for the 10 nearest items Annoy returns for `what_nr`."""
	    close_words = []
	    for neighbour in annoy_index.get_nns_by_item(what_nr, 10):
	        close_words.append(word_index.get_word(neighbour))
	    print("close to", close_words)

	# Show the neighbours of each demo word: apple, then fruit, then company.
	for demo_nr in (apple_nr, fruit_nr, company_nr):
	    print_close(demo_nr)


	#for i in range(0, word_index.nr_words):
	# print(word_index.get_word(i).decode("utf-8"))