Last active
November 30, 2018 13:23
-
-
Save MayukhSobo/d71b7a0be00103f7a0db1d8aab04cd05 to your computer and use it in GitHub Desktop.
Generic sent2vec in python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def sent2vec(sentence, model, method='tfidf', **kwargs):
    """
    Convert a sentence to a single vector by combining its word vectors.

    The sentence is tokenized, stopwords are dropped, duplicate tokens are
    collapsed, and every remaining token that ``model`` knows is looked up.
    The word vectors are then combined according to ``method``.

    Parameters
    ----------
    sentence : str
        Raw sentence text.
    model : mapping-like
        Anything supporting ``word in model`` and ``model[word]`` that
        returns a 1-D vector per word.
    method : {'tfidf', 'avg'}
        ``'avg'``   - sum of the word vectors scaled to unit L2 norm.
        ``'tfidf'`` - TF-IDF weighted mean of the word vectors.
        Matching is case-insensitive.
    **kwargs
        stopwords : container of str, optional
            Tokens to drop. Defaults to scikit-learn's English stop words.
            Passing your own is recommended.
        tokenizer : object with ``tokenize(str) -> list[str]``, optional
            Defaults to ``nltk.tokenize.RegexpTokenizer(r'\\w+')``.
            Passing your own is recommended.
        tfidf_model : fitted vectorizer, required for ``method='tfidf'``
            Must expose ``transform([sentence])`` and ``vocabulary_``.
        shape : int or tuple, optional
            Shape of the zero vector returned when no token of the sentence
            is known to ``model``. Defaults to ``model.vector_size`` when
            that attribute exists, otherwise 300.

    Returns
    -------
    numpy.ndarray
        The sentence vector. An all-zero vector is returned when no token
        is known to ``model``, when the summed vector has zero norm
        (``'avg'``), or when every known token has zero TF-IDF weight
        (``'tfidf'``).

    Raises
    ------
    ValueError
        If ``method='tfidf'`` and no ``tfidf_model`` was passed, or if
        ``method`` is not one of the supported values.
    """
    # Imported locally, like the other optional dependencies of this function.
    import numpy as np

    stopwords = kwargs.get('stopwords')
    if stopwords is None:
        # The public location of the list; the old private module
        # `sklearn.feature_extraction.stop_words` is gone in newer releases.
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
        stopwords = ENGLISH_STOP_WORDS

    tokenizer = kwargs.get('tokenizer')
    if tokenizer is None:
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')

    tokens = tokenizer.tokenize(sentence)
    # Each distinct word counts once (repetitions are already reflected in
    # the TF-IDF term frequency). dict.fromkeys keeps first-seen order so the
    # result does not depend on hash ordering.
    words = list(dict.fromkeys(tok for tok in tokens if tok not in stopwords))
    known = [word for word in words if word in model]

    # Blank sentence or no word known to the model: return an all-zero vector.
    if not known:
        custom_shape = kwargs.get('shape')
        if custom_shape is None:
            custom_shape = getattr(model, 'vector_size', 300)
        return np.zeros(custom_shape)

    V = np.array([model[word] for word in known])
    method = method.lower()

    if method == 'avg':
        total = V.sum(axis=0)
        norm = np.linalg.norm(total)
        # Vectors that cancel out would otherwise produce a NaN-filled result.
        if norm == 0.0:
            return np.zeros_like(total, dtype=float)
        return total / norm

    if method == 'tfidf':
        tfidf_model = kwargs.get('tfidf_model')
        if tfidf_model is None:
            raise ValueError('No tfidf model is present')

        tfidf_vec = tfidf_model.transform([sentence])
        vocabulary = tfidf_model.vocabulary_
        # The vocabulary index must be looked up per word; a word missing
        # from the TF-IDF vocabulary contributes with weight 0.
        tfidfs = np.array([
            tfidf_vec[0, vocabulary[word]] if word in vocabulary else 0.0
            for word in known
        ], dtype=float)

        denominator = tfidfs.sum()
        if denominator == 0.0:
            # No word is represented in both TF-IDF and the word-vector
            # model: the weighted sum is all zeros, so return zeros directly
            # instead of dividing by an artificial epsilon.
            return np.zeros(V.shape[1:])

        numerator = (V * tfidfs.reshape(V.shape[0], 1)).sum(axis=0)
        return numerator / denominator

    raise ValueError(
        "Unknown method %r; expected 'avg' or 'tfidf'" % (method,)
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment