Created
June 7, 2019 16:16
-
-
Save oborchers/4992136123df5e20527276488c4223be to your computer and use it in GitHub Desktop.
Baseline implementation for SIF embeddings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
REAL = np.float32  # dtype used for the accumulators and for the returned matrix


def sif_embeddings(sentences, model, alpha=1e-3):
    """Compute the SIF embeddings for a list of sentences.

    Each sentence embedding is the average of the vectors of its in-vocabulary
    words, where every word vector is first scaled by
    ``alpha / (alpha + p(w))`` with ``p(w) = count(w) / Z`` and ``Z`` the sum
    of all word counts in the vocabulary. Only this weighted average is
    computed; no further post-processing is applied to the result.

    Parameters
    ----------
    sentences : list
        The sentences to compute the embeddings for. Each sentence is an
        iterable of words (tokens).
    model : `~gensim.models.base_any2vec.BaseAny2VecModel`
        A gensim model that contains the word vectors and the vocabulary.
        It must expose ``model.wv.vocab`` (entries with a ``count``
        attribute), ``model.wv[word]`` and ``model.vector_size``.
    alpha : float, optional
        Parameter which is used to weigh each individual word based on its
        probability p(w).

    Returns
    -------
    numpy.ndarray
        SIF sentence embedding matrix of dim len(sentences) * dimension.
        A sentence without any in-vocabulary word yields a zero row; an empty
        `sentences` yields an array of shape ``(0, dimension)``.

    """
    # NOTE(review): `wv.vocab` looks like the gensim 3.x vocabulary API --
    # confirm against the gensim version actually in use.
    vlookup = model.wv.vocab  # Gives us access to word index and count
    vectors = model.wv  # Gives us access to word vectors
    size = model.vector_size  # Embedding size

    # Normalization constant Z: total count over the whole vocabulary.
    Z = sum(vlookup[k].count for k in vlookup)

    output = []

    # Iterate all sentences
    for s in sentences:
        count = 0
        v = np.zeros(size, dtype=REAL)  # Summary vector

        # Iterate all words
        for w in s:
            # A word must be present in the vocabulary
            if w not in vlookup:
                continue
            # The weight depends only on the word, so compute it once and
            # apply it to the whole vector instead of once per component.
            weight = alpha / (alpha + (vlookup[w].count / Z))
            v += weight * np.asarray(vectors[w], dtype=REAL)
            count += 1

        # Average over the words that were actually found; a sentence with no
        # known word keeps its all-zero vector.
        if count > 0:
            v /= count
        output.append(v)

    # np.vstack raises ValueError on an empty sequence, so handle the
    # "no sentences" case explicitly.
    if not output:
        return np.zeros((0, size), dtype=REAL)
    return np.vstack(output).astype(REAL)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment