Skip to content

Instantly share code, notes, and snippets.

@nguyenvulebinh
Created May 18, 2019 10:16
Show Gist options
  • Save nguyenvulebinh/001b18a68243ba0ab1981816c3af7011 to your computer and use it in GitHub Desktop.
Save nguyenvulebinh/001b18a68243ba0ab1981816c3af7011 to your computer and use it in GitHub Desktop.
Word analogy task using word embeddings and the Annoy library
import numpy as np
from annoy import AnnoyIndex
class PretrainedEmbeddings(object):
    """Pretrained word vectors plus an Annoy index for nearest-neighbour lookup."""

    def __init__(self, word_to_index, word_vectors):
        """
        Store the vocabulary and build the Annoy index over its vectors.

        :param word_to_index: dict mapping from word to integers
        :param word_vectors: list of numpy arrays, addressed by the integers
            stored in word_to_index
        :raises IndexError: if word_vectors is empty, because the vector
            dimension is read from its first entry
        """
        self.word_to_index = word_to_index
        self.word_vectors = word_vectors
        self.index_to_word = {idx: word for word, idx in word_to_index.items()}

        self.index = AnnoyIndex(len(word_vectors[0]), metric='euclidean')
        # Only the indices are needed here, so iterate the values directly.
        for idx in word_to_index.values():
            self.index.add_item(idx, word_vectors[idx])
        self.index.build(50)

    @classmethod
    def from_embeddings_file(cls, embedding_file):
        """
        Instantiate from a pretrained vector file.

        The vector file should be of the format:
            word0 x0_0 x0_1 x0_2 x0_3 ... x0_N
            word1 x1_0 x1_1 x1_2 x1_3 ... x1_N

        Lines that are blank or carry no vector components are skipped. If a
        word occurs more than once, only its first vector is kept, so that the
        position of every vector always equals the index stored for its word.

        :param embedding_file: str, location of the file
        :return: instance of PretrainedEmbeddings
        """
        word_to_index = {}
        word_vectors = []
        # Read as UTF-8 explicitly instead of relying on the platform default,
        # and stream line by line rather than loading the whole file at once.
        with open(embedding_file, encoding="utf-8") as fb:
            for line in fb:
                # Split on single spaces only, as the original format expects.
                parts = line.rstrip().split(" ")
                if len(parts) < 2:
                    continue
                word = parts[0]
                if word in word_to_index:
                    # A repeated word must not append a second vector,
                    # otherwise later indices would point at the wrong rows.
                    continue
                word_to_index[word] = len(word_vectors)
                word_vectors.append(np.array([float(x) for x in parts[1:]]))
        return cls(word_to_index, word_vectors)

    def get_embedding(self, word):
        """
        Look up the stored vector for a word.

        :param word: str, a word present in word_to_index
        :return: embedding np.array
        :raises KeyError: if the word is not in the vocabulary
        """
        return self.word_vectors[self.word_to_index[word]]

    def get_closest_to_vector(self, vector, n=1):
        """
        Given a vector, return its n nearest neighbors.

        :param vector: ndarray, should match the size of the vectors in the Annoy index
        :param n: the number of neighbors to return
        :return: list of words nearest to the given vector, in the order
            returned by the Annoy index
        """
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def compute_and_print_analogy(self, word1, word2, word3):
        """
        Print the solutions to an analogy using word embeddings.

        Analogies are: word1 is to word2 as word3 is to _

        :param word1: str, first word of the known pair
        :param word2: str, second word of the known pair
        :param word3: str, first word of the pair to complete
        :return: None; results are printed
        :raises KeyError: if any of the three words is not in the vocabulary
        """
        vec1 = self.get_embedding(word1)
        vec2 = self.get_embedding(word2)
        vec3 = self.get_embedding(word3)

        # Apply the word1 -> word2 offset to word3 to get the target point.
        spatial_relationship = vec2 - vec1
        vec4 = vec3 + spatial_relationship

        # Ask for four neighbours so that something can remain after the
        # three input words are filtered out of the result.
        existing_words = {word1, word2, word3}
        closest_words = [
            word
            for word in self.get_closest_to_vector(vec4, n=4)
            if word not in existing_words
        ]

        if not closest_words:
            print("Could not find nearest neighbors for the vector!")
            return

        for word4 in closest_words:
            print("{} : {} :: {} : {}".format(word1, word2, word3, word4))
# Driver: load the pretrained vectors from disk, then print the completions
# of the analogy "man : he :: woman : ?".
glove_path = '../data/glove.6B/glove.6B.50d.txt'
embedding = PretrainedEmbeddings.from_embeddings_file(glove_path)
embedding.compute_and_print_analogy(word1='man', word2='he', word3='woman')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment