# gensim_hdp.py
import glob
import logging

from gensim import corpora, models, similarities

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

file_name = 'mycorpus.txt'
stopword_file = 'stopwords.txt'
test_document = 'document_test.txt'
class MyCorpus(object):
    """Streams one document at a time from the 'search/' directory as a bag-of-words."""

    def __iter__(self):
        for filename in glob.glob('search/*'):
            words = []
            with open(filename, 'r') as f:
                for line in f:
                    words += line.split()
            yield dictionary.doc2bow(words)
corpus_memory_friendly = MyCorpus()  # doesn't load the corpus into memory! documents are only read when iterated, after `dictionary` is built below
print(corpus_memory_friendly)
stopwords = []
with open(stopword_file, 'r') as f:
    for line in f:
        stopwords += line.split()
# collect statistics about all tokens
files = []
for filename in glob.glob('search/*'):
    words = []
    with open(filename, 'r') as f:
        for line in f:
            words += line.split()
    files.append(words)
dictionary = corpora.Dictionary(files)
# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stopwords
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
dictionary.compactify()  # remove gaps in id sequence after words were removed
print(dictionary)
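# Hedged addition: persist the dictionary next to the serialized corpus so both can be
# reloaded later without re-reading the raw files. 'dictionary.dict' is an example
# filename, not part of the original gist.
dictionary.save('dictionary.dict')
# dictionary = corpora.Dictionary.load('dictionary.dict')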
corpora.MmCorpus.serialize('corpus.mm', corpus_memory_friendly)
corpus = corpora.MmCorpus('corpus.mm')
hdp_model = models.hdpmodel.HdpModel(corpus, id2word=dictionary)
hdp_model.print_topics(topics=20, topn=10)  # newer gensim releases use num_topics= and num_words= instead
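# Minimal sketch (an assumption, not in the original gist): score the test document
# declared above against the trained model. Assumes 'document_test.txt' is plain
# whitespace-separated text; hdp_model[bow] returns (topic_id, probability) pairs.
with open(test_document, 'r') as f:
    test_words = f.read().split()
test_bow = dictionary.doc2bow(test_words)
print(hdp_model[test_bow])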