# oborchers/sif_baseline.py

## sif_baseline.py
import numpy as np
REAL = np.float32  # dtype of the sentence vectors and of the returned matrix


def sif_embeddings(sentences, model, alpha=1e-3):
    """Compute SIF-weighted average embeddings for a list of sentences.

    Every in-vocabulary word vector is scaled by ``alpha / (alpha + p(w))``,
    where ``p(w)`` is the word's count divided by the total count over the
    whole vocabulary, and the scaled vectors are then averaged per sentence.
    Only this weighted average is computed here; no further post-processing
    of the sentence vectors takes place in this function.

    Parameters
    ----------
    sentences : list
        The sentences to compute the embeddings for. Each sentence is an
        iterable of tokens; tokens missing from the vocabulary are skipped.
    model : `~gensim.models.base_any2vec.BaseAny2VecModel`
        A gensim model that contains the word vectors and the vocabulary.
        It is accessed through ``model.wv.vocab`` (entries carrying a
        ``count`` attribute), ``model.wv[word]`` and ``model.vector_size``.
    alpha : float, optional
        Parameter which is used to weigh each individual word based on its
        probability p(w).

    Returns
    -------
    numpy.ndarray
        SIF sentence embedding matrix of shape ``(len(sentences), dimension)``
        with dtype ``REAL``. A sentence without any in-vocabulary word yields
        an all-zero row; an empty `sentences` yields a ``(0, dimension)``
        matrix.
    """
    vlookup = model.wv.vocab  # Gives us access to the word counts
    vectors = model.wv        # Gives us access to word vectors
    size = model.vector_size  # Embedding size

    # Normalization constant Z: total count over the vocabulary, so that
    # count / Z is the word probability p(w).
    Z = sum(vlookup[k].count for k in vlookup)

    output = []
    for s in sentences:
        count = 0                       # Number of in-vocabulary words seen
        v = np.zeros(size, dtype=REAL)  # Summary vector
        for w in s:
            # A word must be present in the vocabulary
            if w in vlookup:
                # The weight depends only on the word, so compute it once
                # and apply it to the whole vector in a single numpy op.
                weight = alpha / (alpha + vlookup[w].count / Z)
                v += weight * vectors[w]
                count += 1

        if count > 0:
            v /= count  # Average over the words that contributed
        output.append(v)

    if not output:
        # np.vstack raises ValueError on an empty sequence; return an empty
        # matrix that still has the right number of columns instead.
        return np.zeros((0, size), dtype=REAL)
    return np.vstack(output).astype(REAL)
	import numpy as np
	REAL = np.float32  # dtype of the sentence vectors and of the returned matrix


	def sif_embeddings(sentences, model, alpha=1e-3):
		"""Compute SIF-weighted average embeddings for a list of sentences.

		Every in-vocabulary word vector is scaled by ``alpha / (alpha + p(w))``,
		where ``p(w)`` is the word's count divided by the total count over the
		whole vocabulary, and the scaled vectors are then averaged per sentence.
		Only this weighted average is computed here; no further post-processing
		of the sentence vectors takes place in this function.

		Parameters
		----------
		sentences : list
			The sentences to compute the embeddings for. Each sentence is an
			iterable of tokens; tokens missing from the vocabulary are skipped.
		model : `~gensim.models.base_any2vec.BaseAny2VecModel`
			A gensim model that contains the word vectors and the vocabulary.
			It is accessed through ``model.wv.vocab`` (entries carrying a
			``count`` attribute), ``model.wv[word]`` and ``model.vector_size``.
		alpha : float, optional
			Parameter which is used to weigh each individual word based on its
			probability p(w).

		Returns
		-------
		numpy.ndarray
			SIF sentence embedding matrix of shape
			``(len(sentences), dimension)`` with dtype ``REAL``. A sentence
			without any in-vocabulary word yields an all-zero row; an empty
			`sentences` yields a ``(0, dimension)`` matrix.
		"""
		vlookup = model.wv.vocab  # Gives us access to the word counts
		vectors = model.wv  # Gives us access to word vectors
		size = model.vector_size  # Embedding size

		# Normalization constant Z: total count over the vocabulary, so that
		# count / Z is the word probability p(w).
		Z = sum(vlookup[k].count for k in vlookup)

		output = []
		for s in sentences:
			count = 0  # Number of in-vocabulary words seen
			v = np.zeros(size, dtype=REAL)  # Summary vector
			for w in s:
				# A word must be present in the vocabulary
				if w in vlookup:
					# The weight depends only on the word, so compute it once
					# and apply it to the whole vector in a single numpy op.
					weight = alpha / (alpha + vlookup[w].count / Z)
					v += weight * vectors[w]
					count += 1

			if count > 0:
				v /= count  # Average over the words that contributed
			output.append(v)

		if not output:
			# np.vstack raises ValueError on an empty sequence; return an empty
			# matrix that still has the right number of columns instead.
			return np.zeros((0, size), dtype=REAL)
		return np.vstack(output).astype(REAL)