Create a gist now

Instantly share code, notes, and snippets.

What would you like to do?
lda
import sys
import string
import json
from pprint import pprint
from gensim import corpora, models
from collections import defaultdict
def stopword(token):
    """Return True if ``token`` consists solely of punctuation characters.

    A token is treated as a stopword when every character in it is found
    in ``string.punctuation``. The empty string is also reported as a
    stopword, so empty fields produced by splitting a line on commas are
    filtered out by callers.

    A plain substring test (``string.punctuation.find(token) > -1``) only
    recognises tokens that occur as one contiguous run inside
    ``string.punctuation``; tokens such as ``"..."`` or ``"?!"`` would slip
    through. Checking character by character covers those as well.
    """
    # all() over an empty token is True, which keeps '' classified as a stopword.
    return all(char in string.punctuation for char in token)
# Number of topics to fit. Shared by the model constructor and the reporting
# loop below so the two can never disagree.
NUM_TOPICS = 10
# How many of the highest-weighted words to report for each topic.
WORDS_PER_TOPIC = 10

# Read one document per stdin line; tokens within a line are comma-separated.
docs = []
for line in sys.stdin:
    # Strip '\r' as well as '\n' so CRLF input does not leave a stray '\r'
    # glued to the last token of every line.
    line = line.rstrip('\r\n')
    docs.append(line.split(','))

# Drop every token that stopword() flags.
docs = [[token for token in doc if not stopword(token)]
        for doc in docs]

# Count how often each remaining token occurs across all documents ...
frequency = defaultdict(int)
for doc in docs:
    for token in doc:
        frequency[token] += 1

# ... and keep only the tokens that occur more than once overall.
docs = [[token for token in doc if frequency[token] > 1]
        for doc in docs]

# Build the token <-> id mapping and the bag-of-words corpus, then fit LDA.
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = models.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary)

# Emit a JSON list with one entry per topic; each entry is a list of
# single-key {word: weight} objects.
topics = []
for topic_id in range(NUM_TOPICS):
    words = lda.show_topic(topic_id, topn=WORDS_PER_TOPIC)
    # NOTE(review): assumes show_topic yields (word, weight) pairs, as the
    # original word[0]/word[1] indexing suggests. The weights may be NumPy
    # scalars (e.g. float32), which json.dumps cannot serialise, so convert
    # them to built-in floats first.
    topics.append([{word: float(weight)} for word, weight in words])
print(json.dumps(topics))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment