Analogies task using word embeddings and annoy library
import numpy as np
from annoy import AnnoyIndex


class PretrainedEmbeddings(object):
    def __init__(self, word_to_index, word_vectors):
        """
        Args:
            :param word_to_index: dict mapping from word to integers
            :param word_vectors: list of numpy arrays
        """
        self.word_to_index = word_to_index
        self.word_vectors = word_vectors
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}
        self.index = AnnoyIndex(len(word_vectors[0]), metric='euclidean')
        for _, i in self.word_to_index.items():
            self.index.add_item(i, self.word_vectors[i])
        self.index.build(50)
    @classmethod
    def from_embeddings_file(cls, embedding_file):
        """
        Instantiate from a pretrained vector file.
        The vector file should be of the format:
            word0 x0_0 x0_1 x0_2 x0_3 ... x0_N
            word1 x1_0 x1_1 x1_2 x1_3 ... x1_N
        :param embedding_file: str, location of the file
        :return: instance of PretrainedEmbeddings
        """
        word_to_index = {}
        word_vectors = []
        with open(embedding_file) as fb:
            for line in fb.readlines():
                line = line.split(" ")
                word = line[0]
                vec = np.array([float(x) for x in line[1:]])
                word_to_index[word] = len(word_to_index)
                word_vectors.append(vec)
        return cls(word_to_index, word_vectors)
    def get_embedding(self, word):
        """
        Args:
            :param word: str, word whose embedding is returned
        :return: embedding np.array
        """
        return self.word_vectors[self.word_to_index[word]]
    def get_closest_to_vector(self, vector, n=1):
        """
        Given a vector, return its n nearest neighbors.
        :param vector: ndarray, should match the size of the vectors in the Annoy index
        :param n: the number of neighbors to return
        :return: list of words nearest to the given vector. The words are not ordered by distance.
        """
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]
    def compute_and_print_analogy(self, word1, word2, word3):
        """
        Prints the solutions to analogies using word embeddings.
        Analogies are: word1 is to word2 as word3 is to __.
        :param word1: str
        :param word2: str
        :param word3: str
        """
        vec1 = self.get_embedding(word1)
        vec2 = self.get_embedding(word2)
        vec3 = self.get_embedding(word3)

        # The offset vec2 - vec1 captures the word1 -> word2 relationship;
        # adding it to vec3 gives the point where the analogous word should lie.
        spatial_relationship = vec2 - vec1
        vec4 = vec3 + spatial_relationship

        closest_words = self.get_closest_to_vector(vec4, n=4)
        existing_words = {word1, word2, word3}
        closest_words = [word for word in closest_words if word not in existing_words]

        if len(closest_words) == 0:
            print("Could not find nearest neighbors for the vector!")
            return

        for word4 in closest_words:
            print("{} : {} :: {} : {}".format(word1, word2, word3, word4))


embedding = PretrainedEmbeddings.from_embeddings_file('../data/glove.6B/glove.6B.50d.txt')
embedding.compute_and_print_analogy('man', 'he', 'woman')
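A short usage sketch, assuming the GloVe 6B 50-dimensional vectors are available at the path used above; the extra analogy pair and the 'king' query below are illustrative word choices, not part of the original snippet:

# Illustrative usage; requires glove.6B.50d.txt at the (assumed) path above.
embeddings = PretrainedEmbeddings.from_embeddings_file('../data/glove.6B/glove.6B.50d.txt')

# word1 : word2 :: word3 : ?  -- each call prints candidate completions
embeddings.compute_and_print_analogy('paris', 'france', 'rome')

# Nearest neighbors of a single word's vector can also be queried directly.
print(embeddings.get_closest_to_vector(embeddings.get_embedding('king'), n=5))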