Skip to content

Instantly share code, notes, and snippets.

@hokuma
Created August 4, 2017 16:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hokuma/fa66b6e1fb19be5da995a681fbe6689f to your computer and use it in GitHub Desktop.
Save hokuma/fa66b6e1fb19be5da995a681fbe6689f to your computer and use it in GitHub Desktop.
lda
import sys
import string
import json
from pprint import pprint
from gensim import corpora, models
from collections import defaultdict
def stopword(token):
    """Return True when *token* carries no content and should be discarded.

    A token is a stopword when every one of its characters is ASCII
    punctuation (``string.punctuation``).  The empty string also counts,
    so empty fields produced by splitting a line are dropped as well.

    The check is made per character rather than by searching for the token
    as a substring of ``string.punctuation``: a substring search only
    matches runs that happen to appear contiguously in that constant
    (``"()"`` would match, but ``"..."`` or ``"!!"`` would not).
    """
    return all(char in string.punctuation for char in token)
# Number of topics to fit and number of top words reported for each topic.
# Kept as named constants so the model size and the reporting loop below
# cannot drift apart.
NUM_TOPICS = 10
TOP_WORDS_PER_TOPIC = 10

# Read the corpus from stdin: one document per line, tokens separated by
# commas.
docs = []
for line in sys.stdin:
    line = line.rstrip('\n')
    docs.append(line.split(','))

# Remove every token that stopword() flags.
docs = [[token for token in doc if not stopword(token)]
        for doc in docs]

# Count how often each remaining token occurs across all documents, then
# keep only tokens that occur more than once in the whole corpus.
frequency = defaultdict(int)
for doc in docs:
    for token in doc:
        frequency[token] += 1
docs = [[token for token in doc if frequency[token] > 1]
        for doc in docs]

# Build the token <-> id mapping and the bag-of-words corpus, then fit LDA.
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = models.LdaModel(corpus, num_topics=NUM_TOPICS, id2word=dictionary)

# Emit, for every topic, a list of single-entry {word: weight} objects.
topics = []
for i in range(NUM_TOPICS):
    words = lda.show_topic(i, topn=TOP_WORDS_PER_TOPIC)
    # The weights may be NumPy scalars (e.g. float32) depending on the
    # gensim version/dtype; json.dumps rejects those, so convert each one
    # to a built-in float before serialising.
    topics.append([{word: float(weight)} for word, weight in words])
print(json.dumps(topics))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment