# MayukhSobo/sent2vec.py

## sent2vec.py
def sent2vec(sentence, model, method='tfidf', **kwargs):
    """
    Convert a sentence to a single vector by combining its word vectors.

    The sentence is tokenized, stopwords are removed and repeated words are
    collapsed (every distinct word counts once). The words known to
    ``model`` are then combined with one of two methods:

    * ``'avg'``   -- sum of the word vectors, scaled to unit L2 norm.
    * ``'tfidf'`` -- mean of the word vectors, each one weighted by the
      TF-IDF score of its own word in ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to vectorize.
    model : object
        Word-vector lookup supporting ``word in model`` and ``model[word]``.
    method : str
        ``'avg'`` or ``'tfidf'`` (case-insensitive).
    **kwargs
        stopwords : container of words to drop. When omitted, scikit-learn's
            ``ENGLISH_STOP_WORDS`` is imported and used.
        tokenizer : object with a ``tokenize(sentence)`` method. When
            omitted, an NLTK ``RegexpTokenizer`` matching runs of word
            characters is imported and used.
        shape : shape of the zero vector returned when no word of the
            sentence is known to ``model`` (default ``300``).
        tfidf_model : object exposing ``transform``, ``vocabulary_`` and
            ``idf_``; required when ``method`` is ``'tfidf'``.

    Returns
    -------
    numpy.ndarray
        The sentence vector, or an all-zero vector of ``shape`` when the
        sentence contains no word known to ``model``.

    Raises
    ------
    ValueError
        If ``method`` is ``'tfidf'`` and no ``tfidf_model`` was passed, or
        if ``method`` is neither ``'avg'`` nor ``'tfidf'``.
    """
    # Imported locally, like the optional dependencies below, so the
    # function does not rely on a module-level ``np`` being defined.
    import numpy as np

    ##### It is recommended to pass separate stopwords #####
    stopwords = kwargs.get('stopwords')
    if stopwords is None:
        # ``sklearn.feature_extraction.stop_words`` is gone from current
        # scikit-learn; the ``text`` module exports the same constant.
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
        stopwords = ENGLISH_STOP_WORDS

    ##### It is recommended to pass a separate tokenizer #####
    tokenizer = kwargs.get('tokenizer')
    if tokenizer is None:
        from nltk.tokenize import RegexpTokenizer
        tokenizer = RegexpTokenizer(r'\w+')

    # Distinct non-stopword tokens. ``dict.fromkeys`` de-duplicates while
    # keeping a deterministic (first occurrence) order.
    tokens = dict.fromkeys(
        token for token in tokenizer.tokenize(sentence)
        if token not in stopwords
    )
    # Only words the model knows contribute; this list fixes the row order
    # shared by the vector matrix and the TF-IDF weights below.
    words = [word for word in tokens if word in model]

    # If no words were present in the model or a blank sentence was
    # passed, return a vector with all 0's. The model's vector size is not
    # known here, so the caller may pass it as ``shape`` (default 300).
    if not words:
        return np.zeros(kwargs.get('shape', 300))

    vectors = np.array([model[word] for word in words])
    method = method.lower()

    if method == 'avg':
        total = vectors.sum(axis=0)
        norm = np.sqrt((total ** 2).sum())
        if norm == 0:
            # The word vectors cancel out; dividing would produce NaNs.
            return np.zeros_like(total, dtype=float)
        return total / norm

    if method == 'tfidf':
        tfidf_model = kwargs.get('tfidf_model')
        if tfidf_model is None:
            raise ValueError('No tfidf model is present')

        tfidf_vec = tfidf_model.transform([sentence])  # TF-IDF of the sentence
        vocabulary = tfidf_model.vocabulary_
        # One weight per word, looked up with that word's own column index.
        # Words missing from the TF-IDF vocabulary get weight 0.
        tfidfs = np.array([
            tfidf_vec[0, vocabulary[word]] if word in vocabulary else 0.0
            for word in words
        ])

        denominator = tfidfs.sum()
        if denominator == 0.0:
            # No word is represented in both tfidf and w2v. Use a small
            # positive denominator rather than skipping the sentence.
            denominator = tfidf_model.idf_.min() * 0.01

        numerator = (vectors * tfidfs.reshape(-1, 1)).sum(axis=0)
        return numerator / denominator

    raise ValueError(
        'Unknown method %r; expected "avg" or "tfidf"' % (method,)
    )