# TeraBytesMemory/scdv.py

## scdv.py
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture

class SCDV(object):
    """Compose document vectors from word embeddings and soft clusters (SCDV).

    Each vocabulary word's embedding is soft-assigned to the clusters of a
    Gaussian mixture. The embedding is scaled by every cluster probability
    and the scaled copies are concatenated into one word-topic vector. A
    document vector is the TF-IDF weighted sum of the word-topic vectors of
    the words it contains. No sparsification step is applied to the result.
    """

    def __init__(self, word_embedding_model, n_components=1, random_state=None):
        """Store the embedding lookup and create the unfitted estimators.

        Args:
            word_embedding_model: mapping-like object whose ``get(word)``
                returns the embedding vector of ``word``, or ``None`` when
                the word is unknown.
            n_components: number of mixture clusters (C). The default of 1
                is the ``GaussianMixture`` default.
            random_state: seed forwarded to ``GaussianMixture``.
        """
        self._word_embedding_model = word_embedding_model
        self._tfidf = TfidfVectorizer(use_idf=True)
        self._gmm = GaussianMixture(
            n_components=n_components,
            random_state=random_state,
        )

    def fit_transform(self, X):
        """Fit the TF-IDF and mixture models on ``X`` and embed its documents.

        Args:
            X: iterable of raw text documents, passed unchanged to
                ``TfidfVectorizer.fit_transform``.

        Returns:
            ``numpy.ndarray`` of shape (D, C * V) holding one row per
            document.

        Raises:
            ValueError: if a vocabulary word has no embedding in the word
                embedding model.
        """
        # D: number of documents
        # W: vocabulary size
        # V: embedding vector size
        # C: number of clusters
        tfidf = self._tfidf.fit_transform(X)  # shape: D x W

        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back for older releases.
        get_names = getattr(self._tfidf, 'get_feature_names_out', None)
        if get_names is None:
            get_names = self._tfidf.get_feature_names
        vocabs = list(get_names())  # length W, same order as tfidf columns

        word_vecs = [self._word_embedding_model.get(v) for v in vocabs]
        missing = [v for v, vec in zip(vocabs, word_vecs) if vec is None]
        if missing:
            # Fail early with the offending words rather than letting
            # np.vstack raise an opaque shape error on None entries.
            raise ValueError(
                'no embedding found for {} vocabulary word(s), e.g. {!r}'
                .format(len(missing), missing[:5])
            )
        vecs = np.vstack(word_vecs)  # shape: W x V

        self._gmm.fit(vecs)
        proba = self._gmm.predict_proba(vecs)  # shape: W x C

        # Scale every word vector by each of its cluster probabilities.
        wcv = np.einsum('ij,ik->ijk', proba, vecs)  # shape: W x C x V

        # Concatenate the C scaled copies of each word into one row.
        wtv = wcv.reshape(wcv.shape[0], -1)  # shape: W x CV

        # (D x W) @ (W x CV) = D x CV
        docvecs = np.asarray(tfidf @ wtv)
        return docvecs
	import numpy as np
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.mixture import GaussianMixture

	class SCDV(object):
	    """Compose document vectors from word embeddings and soft clusters (SCDV).

	    Each vocabulary word's embedding is soft-assigned to the clusters of a
	    Gaussian mixture. The embedding is scaled by every cluster probability
	    and the scaled copies are concatenated into one word-topic vector. A
	    document vector is the TF-IDF weighted sum of the word-topic vectors of
	    the words it contains. No sparsification step is applied to the result.
	    """

	    def __init__(self, word_embedding_model, n_components=1, random_state=None):
	        """Store the embedding lookup and create the unfitted estimators.

	        Args:
	            word_embedding_model: mapping-like object whose ``get(word)``
	                returns the embedding vector of ``word``, or ``None`` when
	                the word is unknown.
	            n_components: number of mixture clusters (C). The default of 1
	                is the ``GaussianMixture`` default.
	            random_state: seed forwarded to ``GaussianMixture``.
	        """
	        self._word_embedding_model = word_embedding_model
	        self._tfidf = TfidfVectorizer(use_idf=True)
	        self._gmm = GaussianMixture(
	            n_components=n_components,
	            random_state=random_state,
	        )

	    def fit_transform(self, X):
	        """Fit the TF-IDF and mixture models on ``X`` and embed its documents.

	        Args:
	            X: iterable of raw text documents, passed unchanged to
	                ``TfidfVectorizer.fit_transform``.

	        Returns:
	            ``numpy.ndarray`` of shape (D, C * V) holding one row per
	            document.

	        Raises:
	            ValueError: if a vocabulary word has no embedding in the word
	                embedding model.
	        """
	        # D: number of documents
	        # W: vocabulary size
	        # V: embedding vector size
	        # C: number of clusters
	        tfidf = self._tfidf.fit_transform(X)  # shape: D x W

	        # get_feature_names() was removed in scikit-learn 1.2; prefer its
	        # replacement and fall back for older releases.
	        get_names = getattr(self._tfidf, 'get_feature_names_out', None)
	        if get_names is None:
	            get_names = self._tfidf.get_feature_names
	        vocabs = list(get_names())  # length W, same order as tfidf columns

	        word_vecs = [self._word_embedding_model.get(v) for v in vocabs]
	        missing = [v for v, vec in zip(vocabs, word_vecs) if vec is None]
	        if missing:
	            # Fail early with the offending words rather than letting
	            # np.vstack raise an opaque shape error on None entries.
	            raise ValueError(
	                'no embedding found for {} vocabulary word(s), e.g. {!r}'
	                .format(len(missing), missing[:5])
	            )
	        vecs = np.vstack(word_vecs)  # shape: W x V

	        self._gmm.fit(vecs)
	        proba = self._gmm.predict_proba(vecs)  # shape: W x C

	        # Scale every word vector by each of its cluster probabilities.
	        wcv = np.einsum('ij,ik->ijk', proba, vecs)  # shape: W x C x V

	        # Concatenate the C scaled copies of each word into one row.
	        wtv = wcv.reshape(wcv.shape[0], -1)  # shape: W x CV

	        # (D x W) @ (W x CV) = D x CV
	        docvecs = np.asarray(tfidf @ wtv)
	        return docvecs