# Source: GitHub gist by @hokuma, created Aug 4, 2017.
import sys
import string
import json
from pprint import pprint
from gensim import corpora, models
from collections import defaultdict
def stopword(token):
return string.punctuation.find(token) > -1
# Train an LDA topic model on documents read from stdin and print the
# highest-weighted words of every topic as JSON.

NUM_TOPICS = 10       # number of topics the model is trained with
WORDS_PER_TOPIC = 10  # number of top words reported for each topic

# Each non-empty stdin line becomes one document, tokenized on whitespace.
# NOTE(review): the input format is an assumption -- the earlier loop only
# stripped the newline and never stored the line, which left `docs` empty
# and made model training impossible. Confirm the tokenization against
# whatever produces the input.
docs = []
for line in sys.stdin:
    tokens = line.rstrip('\n').split()
    if tokens:
        docs.append(tokens)

# Remove punctuation tokens.
docs = [[token for token in doc if not stopword(token)] for doc in docs]

# Count occurrences over the whole corpus, then drop tokens seen only once.
frequency = defaultdict(int)
for doc in docs:
    for token in doc:
        frequency[token] += 1
docs = [[token for token in doc if frequency[token] > 1] for doc in docs]

# Map tokens to integer ids and convert every document to bag-of-words.
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = models.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary)

# Collect, per topic, a list of single-entry {word: weight} mappings.
topics = []
for topic_id in range(NUM_TOPICS):
    words = lda.show_topic(topic_id, topn=WORDS_PER_TOPIC)
    # float() guards against weights that are NumPy scalars, which the json
    # module cannot serialize (presumably the case here -- see gensim docs).
    topics.append([{word: float(weight)} for word, weight in words])

# Emit the result; previously `topics` was built but never used.
print(json.dumps(topics))
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
# You can’t perform that action at this time.