Created
November 3, 2019 02:09
-
-
Save TeraBytesMemory/6d0eb49c8027f8431a13859184b227de to your computer and use it in GitHub Desktop.
[WIP] Sparse Composite Document Vectors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture
class SCDV(object):
    """Sparse Composite Document Vectors (work in progress).

    Builds one dense vector per document by combining word embeddings,
    soft cluster memberships of those embeddings, and tf-idf weights.
    The final sparsification step of SCDV is not implemented here.

    Args:
        word_embedding_model: mapping-like object whose ``get(word)``
            returns that word's embedding vector (or ``None`` if unknown).
        tfidf: optional vectorizer exposing ``fit_transform`` and
            ``get_feature_names_out`` (or the older ``get_feature_names``).
            Defaults to ``TfidfVectorizer(use_idf=True)``.
        gmm: optional soft-clustering model exposing ``fit`` and
            ``predict_proba``. Defaults to ``GaussianMixture()``; pass a
            configured instance to control the number of clusters.
    """

    def __init__(self, word_embedding_model, tfidf=None, gmm=None):
        self._word_embedding_model = word_embedding_model
        self._tfidf = TfidfVectorizer(use_idf=True) if tfidf is None else tfidf
        self._gmm = GaussianMixture() if gmm is None else gmm

    def fit_transform(self, X):
        """Fit the vectorizer and mixture model on ``X`` and embed each document.

        Shape legend used in the comments below:
            D: number of documents
            W: vocabulary size
            V: embedding vector size
            C: number of clusters

        Args:
            X: iterable of raw text documents.

        Returns:
            Array of shape (D, C * V) holding one composite vector per document.

        Raises:
            KeyError: if the embedding model has no vector for a vocabulary word.
        """
        # The tf-idf document-term matrix is the (D x W) factor of the final
        # product; ``idf_`` alone is only a length-W vector.
        doc_term = self._tfidf.fit_transform(X)  # shape: D x W

        # ``get_feature_names`` was removed from newer scikit-learn releases,
        # so prefer ``get_feature_names_out`` and fall back when it is absent.
        names_getter = getattr(self._tfidf, "get_feature_names_out", None)
        if names_getter is None:
            names_getter = self._tfidf.get_feature_names
        vocabs = list(names_getter())  # length: W

        lookup = self._word_embedding_model.get
        word_vecs = [lookup(word) for word in vocabs]
        missing = [word for word, vec in zip(vocabs, word_vecs) if vec is None]
        if missing:
            # Fail with the offending words instead of an opaque stacking error.
            raise KeyError(
                "no embedding for vocabulary words: "
                + ", ".join(str(word) for word in missing[:10])
            )
        vecs = np.vstack(word_vecs)  # shape: W x V

        self._gmm.fit(vecs)
        proba = self._gmm.predict_proba(vecs)  # shape: W x C

        # Scale every word vector by each of its cluster probabilities.
        wcv = np.einsum('wc,wv->wcv', proba, vecs)  # shape: W x C x V

        # Concatenate the per-cluster vectors of each word: W x (C * V).
        wcv_flat = wcv.reshape(len(vocabs), -1)

        # (D x W) @ (W x CV) = D x CV
        docvecs = np.asarray(doc_term @ wcv_flat)
        return docvecs
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
ref: https://qiita.com/fufufukakaka/items/a7316273908a7c400868