Analogies task using word embeddings and annoy library
import numpy as np
from annoy import AnnoyIndex


class PretrainedEmbeddings(object):
    def __init__(self, word_to_index, word_vectors):
        """
        Args:
            :param word_to_index: dict mapping from word to integers
            :param word_vectors: list of numpy arrays
        """
        self.word_to_index = word_to_index
        self.word_vectors = word_vectors
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}
        self.index = AnnoyIndex(len(word_vectors[0]), metric='euclidean')
        for _, i in self.word_to_index.items():
            self.index.add_item(i, self.word_vectors[i])
        self.index.build(50)
    @classmethod
    def from_embeddings_file(cls, embedding_file):
        """
        Instantiate from a pretrained vector file.
        The vector file should be of the format:
            word0 x0_0 x0_1 x0_2 x0_3 ... x0_N
            word1 x1_0 x1_1 x1_2 x1_3 ... x1_N
        :param embedding_file: str, location of the file
        :return: instance of PretrainedEmbeddings
        """
        word_to_index = {}
        word_vectors = []
        with open(embedding_file) as fb:
            for line in fb.readlines():
                line = line.split(" ")
                word = line[0]
                vec = np.array([float(x) for x in line[1:]])
                word_to_index[word] = len(word_to_index)
                word_vectors.append(vec)
        return cls(word_to_index, word_vectors)
    def get_embedding(self, word):
        """
        Args:
            :param word: str, word whose embedding is returned
        :return: embedding np.array
        """
        return self.word_vectors[self.word_to_index[word]]
    def get_closest_to_vector(self, vector, n=1):
        """
        Given a vector, return its n nearest neighbors.
        :param vector: ndarray, should match the size of the vectors in the Annoy index
        :param n: the number of neighbors to return
        :return: list of words nearest to the given vector. The words are not ordered by distance.
        """
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]
    def compute_and_print_analogy(self, word1, word2, word3):
        """
        Prints the solutions to analogies using word embeddings.
        Analogies are: word1 is to word2 as word3 is to __.
        :param word1: str
        :param word2: str
        :param word3: str
        """
        vec1 = self.get_embedding(word1)
        vec2 = self.get_embedding(word2)
        vec3 = self.get_embedding(word3)

        # The offset vec2 - vec1 captures the word1 -> word2 relationship;
        # adding it to vec3 gives the point where the analogous word should lie.
        spatial_relationship = vec2 - vec1
        vec4 = vec3 + spatial_relationship

        closest_words = self.get_closest_to_vector(vec4, n=4)
        existing_words = {word1, word2, word3}
        closest_words = [word for word in closest_words if word not in existing_words]

        if len(closest_words) == 0:
            print("Could not find nearest neighbors for the vector!")
            return

        for word4 in closest_words:
            print("{} : {} :: {} : {}".format(word1, word2, word3, word4))


embedding = PretrainedEmbeddings.from_embeddings_file('../data/glove.6B/glove.6B.50d.txt')
embedding.compute_and_print_analogy('man', 'he', 'woman')
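A short usage sketch, assuming the GloVe 6B 50-dimensional vectors are available at the path used above; the extra analogy pair and the 'king' query below are illustrative word choices, not part of the original snippet:

# Illustrative usage; requires glove.6B.50d.txt at the (assumed) path above.
embeddings = PretrainedEmbeddings.from_embeddings_file('../data/glove.6B/glove.6B.50d.txt')

# word1 : word2 :: word3 : ?  -- each call prints candidate completions
embeddings.compute_and_print_analogy('paris', 'france', 'rome')

# Nearest neighbors of a single word's vector can also be queried directly.
print(embeddings.get_closest_to_vector(embeddings.get_embedding('king'), n=5))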