@mmatkinson
Last active May 2, 2016 23:03
Helper class for using sklearn vectorizers with gensim LDA.
# For gensim
import itertools

import gensim
import numpy as np
class VectorizedCorpus(object):
    """
    Helper class for using sklearn vectorizers with gensim's LDA model.
    Handles transformations between the gensim corpus / bow representations
    and the sklearn document-term matrix.
    """
    def __init__(self, vec, doc_list):
        self.vec = vec
        self.doc_list = doc_list
        self.vec.fit(self.doc_list)
        # Gensim dictionary over the fitted vocabulary, for use when instantiating the LDA model
        self.dictionary = gensim.corpora.dictionary.Dictionary([self.vec.vocabulary_.keys()])
        # Reverse mapping: vectorizer column index -> word
        self.idvec2word = {v: k for k, v in self.vec.vocabulary_.items()}
    def __repr__(self):
        return "<VectorizedCorpus: \n vec:({}) \n docs:({}) \n dict:({})>".format(
            self.vec, len(self), len(self.dictionary))

    def __len__(self):
        return len(self.doc_list)

    def __iter__(self):
        return self.to_corpus(self.doc_list)

    def to_corpus(self, doc_list=None):
        """ Transforms a list of documents into a gensim corpus. """
        if doc_list is None:
            doc_list = self.doc_list
        for k in gensim.matutils.Sparse2Corpus(self.vec.transform(doc_list), documents_columns=False):
            yield k
    def to_bow_dict(self, doc):
        """
        Bow representation of a document, needed for transforming an unseen document
        into an LDA topic distribution, e.g.:
            LDAModel[self.to_bow_dict(["list of", "sentences or paragraphs"])]
        """
        return [self.dictionary.doc2bow(d) for d in self.to_bow(doc)]

    def to_bow(self, doc):
        """ Transforms a doc into the list of vocabulary words it contains,
        using the stored vectorizer (self.vec).
        """
        doc_vec = self._transform(doc).nonzero()
        doc_words = [(k, self.idvec2word[v]) for k, v in zip(*doc_vec)]
        doc_group = {k: [v[1] for v in g] for k, g in itertools.groupby(doc_words, lambda x: x[0])}
        return list(doc_group.values())
    def _transform(self, doc):
        if isinstance(doc, (list, np.ndarray)):
            return self.vec.transform(doc)
        elif isinstance(doc, str):
            return self.vec.transform([doc])
        else:
            raise TypeError("doc must be a string or a list/array of strings")

    def from_bow(self, doc):
        if isinstance(doc, (list, np.ndarray)):
            return [self._from_bow(d) for d in doc]
        elif isinstance(doc, str):
            return [self._from_bow(doc)]
        else:
            raise TypeError("doc must be a string or a list/array of bow documents")

    def _from_bow(self, single_doc):
        return [self.dictionary[term[0]] for term in single_doc]
from sklearn.feature_extraction.text import CountVectorizer
import gensim

vectorizer = CountVectorizer(stop_words='english',
                             min_df=4,
                             binary=True)

# documents: your list of raw text documents
VecCorp = VectorizedCorpus(vec=vectorizer, doc_list=documents)

VecCorp.dictionary  # gensim dictionary
VecCorp.to_bow([" this is a sentence "])       # Transform a sentence into the list of vocabulary words it contains
VecCorp.to_bow_dict([" this is a sentence "])  # Transform a sentence into gensim bow format (dictionary index and count of each word)
VecCorp.from_bow(VecCorp.to_bow_dict([" this is a sentence "]))  # Inverse transform of to_bow_dict

ntopics = 5
lda = gensim.models.ldamodel.LdaModel(corpus=VecCorp, num_topics=ntopics, id2word=VecCorp.dictionary)
lda[VecCorp.to_bow_dict(["This is a new unseen sentence"])[0]]  # Assign a topic mix to a new document