Skip to content

Instantly share code, notes, and snippets.

@TomLin
Created July 5, 2019 12:11
Show Gist options
  • Save TomLin/2e2728277d4ac4549da270a77bb56801 to your computer and use it in GitHub Desktop.
Class of MeanEmbedding vectorizer.
class MeanEmbeddingVectorizer(object):
    """Scikit-learn style transformer that maps tokenized documents to the
    mean of their word vectors (mean word embedding).

    Works with a trained gensim word-embedding model exposing ``.wv``
    (Word2Vec, FastText, ...).
    """

    def __init__(self, word_model):
        """
        :param word_model: trained gensim embedding model exposing ``.wv``
        """
        self.word_model = word_model
        self.vector_size = word_model.wv.vector_size

    def fit(self, X=None, y=None):  # comply with scikit-learn transformer requirement
        """No-op fit.

        Accepts the ``(X, y)`` arguments scikit-learn Pipelines pass so this
        object actually satisfies the transformer contract; calling ``fit()``
        with no arguments (the original signature) still works.
        """
        return self

    def transform(self, docs):  # comply with scikit-learn transformer requirement
        """
        :param docs: list of documents, each a list of tokens
        :return: numpy array of shape (len(docs), vector_size)
        """
        doc_word_vector = self.word_average_list(docs)
        return doc_word_vector

    def word_average(self, sent):
        """
        Compute the average word vector for a single doc/sentence.

        :param sent: list of sentence tokens
        :return: 1-D numpy array of shape (vector_size,); a zero vector when
            no token of ``sent`` is in the model's vocabulary.
        """
        wv = self.word_model.wv
        # gensim >= 4 renamed ``vocab`` to ``key_to_index``; support both.
        vocab = wv.key_to_index if hasattr(wv, "key_to_index") else wv.vocab
        vectors = [wv.get_vector(word) for word in sent if word in vocab]
        if not vectors:  # empty words
            # If a text is empty (or fully out-of-vocabulary), return zeros.
            logging.warning("cannot compute average owing to no vector for {}".format(sent))
            return np.zeros(self.vector_size)
        return np.array(vectors).mean(axis=0)

    def word_average_list(self, docs):
        """
        Compute average word vectors for multiple tokenized docs.

        :param docs: list of documents, each a list of tokens
        :return: numpy array of shape (len(docs), vector_size)
        """
        return np.vstack([self.word_average(sent) for sent in docs])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment