Skip to content

Instantly share code, notes, and snippets.

@AadityaJ
Last active December 30, 2016 13:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save AadityaJ/c98da3d01f76f068242c17b5e1593973 to your computer and use it in GitHub Desktop.
Save AadityaJ/c98da3d01f76f068242c17b5e1593973 to your computer and use it in GitHub Desktop.
import numpy as np
from gensim import matutils
from gensim.models.ldamodel import LdaModel
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from gensim.sklearn_integration.SklearnWrapperGensimLdaModel import SklearnWrapperLdaModel
# Adapter exposing fit_predict on top of the gensim LDA wrapper; intended to
# be folded into the LDA wrapper itself later on.
class dummy(SklearnWrapperLdaModel):
    """Subclass of the LDA wrapper that can be fitted on a sparse matrix."""

    def fit_predict(self, X):
        """Fit the LDA wrapper on a sparse document-term matrix.

        Args:
            X: sparse matrix with one row per document and one column per
                vocabulary term, as produced by
                ``CountVectorizer.fit_transform``.

        Returns:
            Whatever ``SklearnWrapperLdaModel.fit`` returns for the
            converted corpus.
        """
        # scikit-learn vectorizers put documents in rows, whereas
        # Sparse2Corpus treats columns as documents by default. Without
        # this flag the term and document axes are swapped and the model
        # is trained on the transposed corpus.
        corpus = matutils.Sparse2Corpus(X, documents_columns=False)
        return SklearnWrapperLdaModel.fit(self, corpus)
# Seeded RNG. NOTE(review): it is not passed to the model below, so it
# currently has no effect on the run -- confirm whether it should be.
rand = np.random.mtrand.RandomState(8675309)

# Two-category subset of the 20 Newsgroups training split.
cats = ['rec.sport.baseball', 'sci.crypt']
data = fetch_20newsgroups(subset='train', categories=cats, shuffle=True)

# Bag-of-words counts. The vectorizer is fitted once; the original script
# repeated this step with identical arguments.
vec = CountVectorizer(min_df=10, stop_words='english')
X = vec.fit_transform(data.data)
vocab = vec.get_feature_names()

# gensim needs an id -> word mapping; the vectorizer's feature list is
# already ordered by column index.
id2word = dict(enumerate(vocab))

obj = dummy(id2word=id2word, num_topics=5, passes=20)
lda = obj.fit_predict(X)

# print_topics() returns the topics (and only logs them), so print the
# result explicitly to make the script's output visible.
for topic in lda.print_topics():
    print(topic)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment