Created
July 5, 2019 12:11
-
-
Save TomLin/2e2728277d4ac4549da270a77bb56801 to your computer and use it in GitHub Desktop.
MeanEmbeddingVectorizer class: represents each tokenized document by the average of its word-embedding vectors.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class MeanEmbeddingVectorizer(object):
    """Represent tokenized documents by the mean of their word vectors.

    The wrapped ``word_model`` must expose a ``wv`` attribute providing
    ``vector_size``, ``get_vector(word)`` and a vocabulary mapping
    (``key_to_index`` or, on older models, ``vocab``).

    The ``fit``/``transform`` pair lets the object be used where a
    scikit-learn style transformer is expected.
    """

    def __init__(self, word_model):
        """Store the word model and cache its embedding dimensionality.

        :param word_model: trained embedding model exposing ``wv``
        """
        self.word_model = word_model
        self.vector_size = word_model.wv.vector_size

    def fit(self, X=None, y=None):
        """No-op fit; the embeddings are already trained.

        ``X`` and ``y`` are accepted (and ignored) so that callers which
        invoke ``fit(X, y)`` do not fail; calling ``fit()`` with no
        arguments keeps working.

        :return: self
        """
        return self

    def transform(self, docs):
        """Return the averaged word vectors for ``docs``.

        :param docs: iterable of documents, each a list of tokens
        :return: 2-D array of shape (number of docs, vector_size)
        """
        return self.word_average_list(docs)

    def _vocabulary(self):
        """Return the container used to test whether a word has a vector."""
        keyed_vectors = self.word_model.wv
        # Newer gensim releases expose ``key_to_index``; ``vocab`` is the
        # legacy attribute and is only touched when the new one is absent.
        vocabulary = getattr(keyed_vectors, "key_to_index", None)
        if vocabulary is None:
            vocabulary = keyed_vectors.vocab
        return vocabulary

    def word_average(self, sent):
        """Compute the average word vector for a single doc/sentence.

        Tokens that are not in the model vocabulary are skipped.

        :param sent: list of sentence tokens
        :return: 1-D array of length ``vector_size``; all zeros (with a
            logged warning) when no token of ``sent`` has a vector
        """
        keyed_vectors = self.word_model.wv
        vocabulary = self._vocabulary()
        vectors = [
            keyed_vectors.get_vector(word)
            for word in sent
            if word in vocabulary
        ]
        if not vectors:
            # Empty text or only out-of-vocabulary tokens: fall back to a
            # zero vector so every document still yields a row.
            logging.warning(
                "cannot compute average owing to no vector for %s", sent
            )
            return np.zeros(self.vector_size)
        return np.array(vectors).mean(axis=0)

    def word_average_list(self, docs):
        """Compute average word vectors for multiple tokenized docs.

        :param docs: iterable of documents, each a list of tokens
        :return: 2-D array of shape (number of docs, vector_size); an
            empty ``(0, vector_size)`` array when ``docs`` is empty
        """
        averages = [self.word_average(sent) for sent in docs]
        if not averages:
            # np.vstack raises ValueError on an empty sequence.
            return np.empty((0, self.vector_size))
        return np.vstack(averages)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment