@TeraBytesMemory
Created November 3, 2019 02:09
[WIP] Sparse Composite Document Vectors
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture


class SCDV(object):

    def __init__(self, word_embedding_model, n_components=10):
        # word_embedding_model: maps a token to its word vector (dict-like .get())
        # n_components: number of GMM clusters (C)
        self._word_embedding_model = word_embedding_model
        self._tfidf = TfidfVectorizer(use_idf=True)
        self._gmm = GaussianMixture(n_components=n_components)

    def fit_transform(self, X):
        # D: number of documents
        # W: vocabulary size
        # V: word-vector size
        # C: number of clusters
        dtm = self._tfidf.fit_transform(X)  # tf-idf document-term matrix, shape: D x W
        vocab = self._tfidf.get_feature_names_out()  # W tokens (get_feature_names() on older scikit-learn)
        # assumes every vocabulary token has an entry in the embedding model
        vecs = np.vstack([self._word_embedding_model.get(v) for v in vocab])  # shape: W x V

        self._gmm.fit(vecs)
        proba = self._gmm.predict_proba(vecs)  # soft cluster assignments, shape: W x C

        # word-cluster vectors: each word vector scaled by its cluster probabilities
        wcv = np.einsum('ij,ik->ijk', proba, vecs)  # shape: W x C x V
        wcv = wcv.reshape(len(vocab), -1)           # flatten clusters: W x CV

        # (D x W) @ (W x CV) -> D x CV
        docvecs = dtm @ wcv
        return docvecs
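
A minimal usage sketch under assumptions not in the gist: the embedding lookup is a plain dict of equal-length numpy vectors (a gensim KeyedVectors could be wrapped the same way), and the toy corpus, vector size, and n_components value are illustrative only.

import numpy as np

# Toy corpus and a random dict-based embedding lookup; every token produced by
# TfidfVectorizer's default tokenizer needs an entry in the lookup.
docs = [
    'the cat sat on the mat',
    'dogs chase cats',
    'the dog sat on the rug',
]
rng = np.random.default_rng(0)
tokens = {w for d in docs for w in d.split()}
toy_embeddings = {w: rng.normal(size=8) for w in tokens}  # 8-dim toy vectors

scdv = SCDV(toy_embeddings, n_components=2)
docvecs = scdv.fit_transform(docs)
print(docvecs.shape)  # (3, 2 * 8) = (3, 16): one C*V-dimensional vector per document

Note that this sketch stops where the WIP gist does: the hard-thresholding step that makes the composite vectors sparse in the full SCDV method is not applied here.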