@oborchers
Created June 7, 2019 16:16
Baseline implementation for SIF embeddings
import numpy as np

REAL = np.float32

def sif_embeddings(sentences, model, alpha=1e-3):
    """Compute the SIF embeddings for a list of sentences.

    Parameters
    ----------
    sentences : list
        The sentences to compute the embeddings for.
    model : `~gensim.models.base_any2vec.BaseAny2VecModel`
        A gensim model that contains the word vectors and the vocabulary.
    alpha : float, optional
        Parameter used to weigh each individual word based on its probability p(w).

    Returns
    -------
    numpy.ndarray
        SIF sentence embedding matrix of shape (len(sentences), model.vector_size).
    """
    vlookup = model.wv.vocab  # Gives us access to word index and count
    vectors = model.wv        # Gives us access to word vectors
    size = model.vector_size  # Embedding size

    # Compute the normalization constant Z (total word count in the corpus)
    Z = 0
    for k in vlookup:
        Z += vlookup[k].count

    output = []
    # Iterate over all sentences
    for s in sentences:
        count = 0
        v = np.zeros(size, dtype=REAL)  # Summary vector
        # Iterate over all words in the sentence
        for w in s:
            # A word must be present in the vocabulary
            if w in vlookup:
                # Weight each word vector by alpha / (alpha + p(w)), where p(w) = count(w) / Z
                for i in range(size):
                    v[i] += (alpha / (alpha + (vlookup[w].count / Z))) * vectors[w][i]
                count += 1
        # Average the weighted word vectors
        if count > 0:
            for i in range(size):
                v[i] *= 1 / count
        output.append(v)
    return np.vstack(output).astype(REAL)
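A minimal usage sketch for the function above. It assumes a gensim version below 4.0 (where `model.wv.vocab` still exists) and uses a tiny toy corpus purely for illustration; in practice you would pass a pre-trained model and your own tokenized sentences.

# Usage sketch: toy corpus and model, for illustration only (assumes gensim < 4.0).
from gensim.models import Word2Vec

sentences = [
    ["this", "is", "a", "sentence"],
    ["another", "example", "sentence"],
]
model = Word2Vec(sentences, min_count=1)  # toy model; use a pre-trained model in practice

emb = sif_embeddings(sentences, model, alpha=1e-3)
print(emb.shape)  # (2, model.vector_size)

Note that the element-wise Python loops over the vector dimensions are deliberately naive; this is the baseline implementation, and the inner loops can be replaced by vectorized numpy operations for speed.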