@mmatkinson
Last active May 2, 2016 23:03
Helper class for using sklearn vectorizers with gensim LDA.
# For gensim
import itertools

import gensim
import numpy as np
class VectorizedCorpus(object):
    """
    Helper class for using sklearn vectorizers with gensim's LDA model.
    Handles transformations between the gensim corpus / bow representations
    and the sklearn document-term matrix.
    """
    def __init__(self, vec, doc_list):
        self.vec = vec
        self.doc_list = doc_list
        self.vec.fit(self.doc_list)
        # Gensim dictionary over the fitted vocabulary, for use when instantiating the LDA model
        self.dictionary = gensim.corpora.dictionary.Dictionary([self.vec.vocabulary_.keys()])
        # Reverse mapping: vectorizer column index -> word
        self.idvec2word = {v: k for k, v in self.vec.vocabulary_.items()}
    def __repr__(self):
        return "<VectorizedCorpus: \n vec:({}) \n docs:({}) \n dict:({})>".format(
            self.vec, len(self), len(self.dictionary))

    def __len__(self):
        return len(self.doc_list)

    def __iter__(self):
        return self.to_corpus(self.doc_list)

    def to_corpus(self, doc_list=None):
        """ Transforms a list of documents into a gensim corpus. """
        if doc_list is None:
            doc_list = self.doc_list
        for k in gensim.matutils.Sparse2Corpus(self.vec.transform(doc_list), documents_columns=False):
            yield k
    def to_bow_dict(self, doc):
        """
        Bow representation of a document, needed for transforming an unseen document
        into an LDA topic distribution, e.g.:
            LDAModel[self.to_bow_dict(["list of", "sentences or paragraphs"])]
        """
        return [self.dictionary.doc2bow(d) for d in self.to_bow(doc)]

    def to_bow(self, doc):
        """ Transforms a doc into the list of vocabulary words it contains,
        using the stored vectorizer (self.vec).
        """
        doc_vec = self._transform(doc).nonzero()
        doc_words = [(k, self.idvec2word[v]) for k, v in zip(*doc_vec)]
        doc_group = {k: [v[1] for v in g] for k, g in itertools.groupby(doc_words, lambda x: x[0])}
        return list(doc_group.values())
    def _transform(self, doc):
        if isinstance(doc, (list, np.ndarray)):
            return self.vec.transform(doc)
        elif isinstance(doc, str):
            return self.vec.transform([doc])
        else:
            raise TypeError("doc must be a string or a list/array of strings")

    def from_bow(self, doc):
        if isinstance(doc, (list, np.ndarray)):
            return [self._from_bow(d) for d in doc]
        elif isinstance(doc, str):
            return [self._from_bow(doc)]
        else:
            raise TypeError("doc must be a string or a list/array of bow documents")

    def _from_bow(self, single_doc):
        return [self.dictionary[term[0]] for term in single_doc]
from sklearn.feature_extraction.text import CountVectorizer
import gensim

vectorizer = CountVectorizer(stop_words='english',
                             min_df=4,
                             binary=True)

# documents: your list of raw text documents
VecCorp = VectorizedCorpus(vec=vectorizer, doc_list=documents)

VecCorp.dictionary  # gensim dictionary
VecCorp.to_bow([" this is a sentence "])       # Transform a sentence into the list of vocabulary words it contains
VecCorp.to_bow_dict([" this is a sentence "])  # Transform a sentence into gensim bow format (dictionary index and count of each word)
VecCorp.from_bow(VecCorp.to_bow_dict([" this is a sentence "]))  # Inverse transform of to_bow_dict

ntopics = 5
lda = gensim.models.ldamodel.LdaModel(corpus=VecCorp, num_topics=ntopics, id2word=VecCorp.dictionary)
lda[VecCorp.to_bow_dict(["This is a new unseen sentence"])[0]]  # Assign a topic mix to a new document