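# LDA-style topic modeling on a toy corpus via collapsed Gibbs sampling:
# every word in every document gets a topic assignment, and the sampler
# repeatedly resamples each assignment conditioned on all the others.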
from collections import Counter
import random

def p_topic_given_document(topic, d, alpha=0.1):
    # fraction of words in document d assigned to `topic`,
    # smoothed by alpha so no topic ever has probability zero
    return ((document_topic_counts[d][topic] + alpha) /
            (document_lengths[d] + K * alpha))

def p_word_given_topic(word, topic, beta=0.1):
    # fraction of words assigned to `topic` that equal `word`,
    # smoothed by beta
    return ((topic_word_counts[topic][word] + beta) /
            (topic_counts[topic] + V * beta))

def topic_weight(d, word, k):
    # unnormalized weight for assigning topic k to `word` in document d
    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, k) for k in range(K)])

def sample_from(weights):
    # draw index i with probability weights[i] / sum(weights)
    total = sum(weights)
    rnd = total * random.random()   # uniform in [0, total)
    for i, w in enumerate(weights):
        rnd -= w                    # return the smallest i such that
        if rnd <= 0:                # weights[0] + ... + weights[i] >= rnd
            return i
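# Why these weights work: up to a constant that does not depend on k,
# p(topic = k | all other assignments) ∝ p(word | topic = k) * p(topic = k | document = d),
# so choose_new_topic draws each word's new topic from the Gibbs
# conditional by passing the topic_weight values to sample_from.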
# toy corpus: each "document" is a short list of interest keywords
documents = [["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
             ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
             ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
             ["R", "Python", "statistics", "regression", "probability"],
             ["machine learning", "regression", "decision trees", "libsvm"],
             ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
             ["statistics", "probability", "mathematics", "theory"],
             ["machine learning", "scikit-learn", "Mahout", "neural networks"],
             ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
             ["Hadoop", "Java", "MapReduce", "Big Data"],
             ["statistics", "R", "statsmodels"],
             ["C++", "deep learning", "artificial intelligence", "probability"],
             ["pandas", "R", "Python"],
             ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
             ["libsvm", "regression", "support vector machines"]]
random.seed(0)
K = 4   # number of topics

# start by assigning every word in every document a random topic
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]

# counts maintained by the sampler
document_topic_counts = [Counter() for _ in documents]  # topic counts per document
topic_word_counts = [Counter() for _ in range(K)]       # word counts per topic
topic_counts = [0 for _ in range(K)]                    # total word count per topic
document_lengths = [len(document) for document in documents]
distinct_words = set(word for document in documents for word in document)
V = len(distinct_words)   # vocabulary size
D = len(documents)        # number of documents

# populate the counts from the random initial assignments
for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1
# collapsed Gibbs sampling: for a fixed number of sweeps, resample the
# topic of every word conditioned on all other current assignments
for iteration in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):
            # remove this word's current assignment from the counts
            # so it doesn't influence its own resampling
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # draw a new topic from the conditional distribution
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # add the word back in with its new topic
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1
print(document_topic_counts[0])   # topic counts for document 0
print(topic_word_counts[0])      # word counts for topic 0
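# Illustrative extension (not part of the original snippet): a quick way
# to inspect the fitted model is to print each topic's most frequent words.
for k in range(K):
    print("topic", k, topic_word_counts[k].most_common(5))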