Skip to content

Instantly share code, notes, and snippets.

@clausd
Created December 20, 2016 13:26
Show Gist options
  • Save clausd/1566cb47a03a0517975d7058a17dcd85 to your computer and use it in GitHub Desktop.
Save clausd/1566cb47a03a0517975d7058a17dcd85 to your computer and use it in GitHub Desktop.
How to do LDA in Python (for Morten)
import numpy as np
import pandas as pd
import lda
import lda.datasets
from sklearn.feature_extraction.text import CountVectorizer
def load_questions(path='android_watch.xlsx', column='Question'):
    """Load the question spreadsheet and extract the question texts.

    Args:
        path: Excel workbook to read. Defaults to 'android_watch.xlsx',
            the file this script was originally written for.
        column: Name of the column holding the question text. Defaults
            to 'Question'.

    Returns:
        A ``(sheet, questions)`` tuple: the DataFrame returned by
        ``pd.read_excel`` and the selected column converted to a list.

    Raises:
        KeyError: If ``column`` is not a column of the loaded sheet.
    """
    sheet = pd.read_excel(path)
    return sheet, list(sheet[column])
def featurize(questions, stop_words=None):
    """Turn raw question strings into a bag-of-words count matrix.

    Args:
        questions: Iterable of document strings.
        stop_words: Passed straight through to ``CountVectorizer``.

    Returns:
        A ``(X, vocab)`` tuple where ``X`` is the document-term count
        matrix produced by ``CountVectorizer.fit_transform`` and ``vocab``
        is a list of terms such that ``vocab[j]`` is the term counted in
        column ``j`` of ``X``.
    """
    cv = CountVectorizer(stop_words=stop_words)
    X = cv.fit_transform(questions)
    # vocabulary_ maps term -> column index. Iterating the dict yields the
    # terms in dict order, which is NOT column order, so the terms must be
    # ordered by their index to stay aligned with the columns of X.
    vocab = sorted(cv.vocabulary_, key=cv.vocabulary_.get)
    return X, vocab
def find_topics(features, n_topics=20, n_iter=500, random_state=1):
    """Fit an LDA topic model to a document-term count matrix.

    Args:
        features: Document-term count matrix handed to ``lda.LDA.fit``.
        n_topics: Number of topics to fit. Defaults to 20.
        n_iter: Number of sampling iterations. Defaults to 500.
        random_state: Seed forwarded to ``lda.LDA``. Defaults to 1.

    Returns:
        The fitted ``lda.LDA`` model.
    """
    model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=random_state)
    model.fit(features)
    return model
def report(n_top_words=8, n_docs=10):
    """Fit topics on the question sheet and print a short summary.

    Prints the highest-weighted words for every topic, followed by the
    dominant topic for the first few questions.

    Args:
        n_top_words: How many words to print per topic. Defaults to 8.
        n_docs: How many leading questions to print with their top topic.
            Defaults to 10; fewer are printed if the sheet is shorter.
    """
    _, questions = load_questions()
    feats, vocab = featurize(questions)
    model = find_topics(feats)

    # List topics. The vocabulary array is loop-invariant, so build it once.
    vocab_arr = np.array(vocab)
    for i, topic_dist in enumerate(model.topic_word_):
        # argsort is ascending: reverse it, then keep exactly n_top_words
        # entries (the previous [:-n:-1] slice returned only n - 1 words).
        top_idx = np.argsort(topic_dist)[::-1][:n_top_words]
        print('Topic {}: {}'.format(i, ' '.join(vocab_arr[top_idx])))

    # List the primary topic for each of the first n_docs questions.
    # Slicing instead of indexing with range(10) avoids an IndexError when
    # there are fewer questions than requested.
    for question, topic_dist in zip(questions[:n_docs], model.doc_topic_):
        print("{} (top topic: {})".format(question, topic_dist.argmax()))
@clausd
Copy link
Author

clausd commented Dec 20, 2016

Koden scoret fra https://ariddell.org/lda.html (og ikke helt sikker på om alt er som det skal være)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment