cstrelioff/README.md

## README.md

      
    Raw
  

              README.md
            
          
    A script that replicates all the examples in my blog post on using the lda Python package for Latent Dirichlet Allocation -- see
my lda post for more information.
Run all the examples

    $ python ex002_lda.py
Or, make the script executable and run it directly:
    $ chmod u+x ex002_lda.py
    $ ./ex002_lda.py

  
## ex002_lda.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# vim:fenc=utf-8
#
# Copyright © 2014 Christopher C. Strelioff <chris.strelioff@gmail.com>
#
# Distributed under terms of the MIT license.

"""
ex002_lda.py -- An example of LDA in Python.
"""
from __future__ import division, print_function

import numpy as np
import lda
import lda.datasets


# Document-term matrix from the Reuters sample data bundled with lda.
X = lda.datasets.load_reuters()
print(
    "type(X): {}".format(type(X)),
    "shape: {}\n".format(X.shape),
    sep="\n",
)

# Vocabulary; indexed below with the same word id used for X's columns.
vocab = lda.datasets.load_reuters_vocab()
print(
    "type(vocab): {}".format(type(vocab)),
    "len(vocab): {}\n".format(len(vocab)),
    sep="\n",
)

# Title of each story; indexed below with the same doc id used for X's rows.
titles = lda.datasets.load_reuters_titles()
print(
    "type(titles): {}".format(type(titles)),
    "len(titles): {}\n".format(len(titles)),
    sep="\n",
)


# Spot-check a single (document, word) cell together with its labels.
doc_id, word_id = 0, 3117

print("doc id: {} word id: {}".format(doc_id, word_id))
cell_count = X[doc_id, word_id]
print("-- count: {}".format(cell_count))
cell_word = vocab[word_id]
print("-- word : {}".format(cell_word))
cell_title = titles[doc_id]
print("-- doc  : {}".format(cell_title))


# Fit a 20-topic LDA model to the document-term matrix.  random_state is
# pinned to 1 (presumably so repeated runs give the same topics).
model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(X)


# Topic-word matrix exposed by the fitted model; iterated row by row below,
# one row per topic.
topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))


# Print the row sum of the first five topics.
for topic_idx in range(5):
    sum_pr = sum(topic_word[topic_idx, :])
    print("topic: {} sum: {}".format(topic_idx, sum_pr))


# List the highest-weighted words of every topic.
n_top_words = 5
# Build the vocabulary array once; it is the same for every topic.
vocab_array = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # argsort is ascending, so take the last n_top_words indices in reverse
    # order, then look up only those words.
    top_ids = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
    topic_words = vocab_array[top_ids]
    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))


# Document-topic matrix exposed by the fitted model; rows are indexed by
# document number below.
doc_topic = model.doc_topic_
print("type(doc_topic): {}".format(type(doc_topic)))
print("shape: {}".format(doc_topic.shape))


# Print the row sum of the first five documents.
for doc_idx in range(5):
    print("document: {} sum: {}".format(doc_idx, sum(doc_topic[doc_idx, :])))


# For the first ten documents, report the topic with the largest weight
# followed by the first 50 characters of the title.
for doc_idx in range(10):
    best_topic = doc_topic[doc_idx].argmax()
    title_start = titles[doc_idx][:50]
    print("doc: {} topic: {}\n{}...".format(doc_idx, best_topic, title_start))


import matplotlib.pyplot as plt

# Prefer the 'ggplot' style sheet, but fall back to the default look when
# it cannot be applied (the installed matplotlib might not be recent).
try:
    plt.style.use('ggplot')
except Exception:
    # Best-effort only.  Unlike a bare ``except:``, this still lets
    # KeyboardInterrupt and SystemExit propagate.
    pass


def _stem_panels(matrix, rows, linefmt, markerfmt, xlim, ylim,
                 title_fmt, xlabel):
    """Show a figure with one stem plot per selected row of *matrix*.

    Parameters
    ----------
    matrix : 2-D array indexed as ``matrix[row, :]``.
    rows : sequence of row indices; one vertically stacked panel each.
    linefmt, markerfmt : format strings forwarded to ``Axes.stem``.
    xlim, ylim : ``(low, high)`` pairs applied to every panel.
    title_fmt : format string receiving the row index for the panel title.
    xlabel : label placed on the bottom panel only.

    Returns
    -------
    The ``(figure, axes)`` pair created by ``plt.subplots``.
    """
    fig, axes = plt.subplots(len(rows), 1, figsize=(8, 6), sharex=True)
    # subplots returns a bare Axes for a single panel; normalise to 1-D.
    axes = np.atleast_1d(axes)
    for panel, row in zip(axes, rows):
        panel.stem(matrix[row, :], linefmt=linefmt,
                   markerfmt=markerfmt, basefmt='w-')
        panel.set_xlim(*xlim)
        panel.set_ylim(*ylim)
        panel.set_ylabel("Prob")
        panel.set_title(title_fmt.format(row))

    # Label the last panel, whatever the number of rows.
    axes[-1].set_xlabel(xlabel)

    plt.tight_layout()
    plt.show()
    return fig, axes


# Word weights for five selected topics.
f, ax = _stem_panels(topic_word, [0, 5, 9, 14, 19], 'b-', 'bo',
                     (-50, 4350), (0, 0.08), "topic {}", "word")


# Topic weights for five selected documents.
f, ax = _stem_panels(doc_topic, [1, 3, 4, 8, 9], 'r-', 'ro',
                     (-1, 21), (0, 1), "Document {}", "Topic")
	#! /usr/bin/env python
	# -*- coding: utf-8 -*-
	# vim:fenc=utf-8
	#
	# Copyright © 2014 Christopher C. Strelioff <chris.strelioff@gmail.com>
	#
	# Distributed under terms of the MIT license.

	"""
	ex002_lda.py -- An example of LDA in Python.
	"""
	from __future__ import division, print_function

	import numpy as np
	import lda
	import lda.datasets


	# Document-term matrix from the Reuters sample data bundled with lda.
	X = lda.datasets.load_reuters()
	print(
	    "type(X): {}".format(type(X)),
	    "shape: {}\n".format(X.shape),
	    sep="\n",
	)

	# Vocabulary; indexed below with the same word id used for X's columns.
	vocab = lda.datasets.load_reuters_vocab()
	print(
	    "type(vocab): {}".format(type(vocab)),
	    "len(vocab): {}\n".format(len(vocab)),
	    sep="\n",
	)

	# Title of each story; indexed below with the same doc id used for X's rows.
	titles = lda.datasets.load_reuters_titles()
	print(
	    "type(titles): {}".format(type(titles)),
	    "len(titles): {}\n".format(len(titles)),
	    sep="\n",
	)


	# Spot-check a single (document, word) cell together with its labels.
	doc_id, word_id = 0, 3117

	print("doc id: {} word id: {}".format(doc_id, word_id))
	cell_count = X[doc_id, word_id]
	print("-- count: {}".format(cell_count))
	cell_word = vocab[word_id]
	print("-- word : {}".format(cell_word))
	cell_title = titles[doc_id]
	print("-- doc : {}".format(cell_title))


	# Fit a 20-topic LDA model to the document-term matrix.  random_state is
	# pinned to 1 (presumably so repeated runs give the same topics).
	model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
	model.fit(X)


	# Topic-word matrix exposed by the fitted model; iterated row by row
	# below, one row per topic.
	topic_word = model.topic_word_
	print("type(topic_word): {}".format(type(topic_word)))
	print("shape: {}".format(topic_word.shape))


	# Print the row sum of the first five topics.
	for topic_idx in range(5):
	    sum_pr = sum(topic_word[topic_idx, :])
	    print("topic: {} sum: {}".format(topic_idx, sum_pr))


	# List the highest-weighted words of every topic.
	n_top_words = 5
	# Build the vocabulary array once; it is the same for every topic.
	vocab_array = np.array(vocab)
	for i, topic_dist in enumerate(topic_word):
	    # argsort is ascending, so take the last n_top_words indices in
	    # reverse order, then look up only those words.
	    top_ids = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
	    topic_words = vocab_array[top_ids]
	    print('*Topic {}\n- {}'.format(i, ' '.join(topic_words)))


	# Document-topic matrix exposed by the fitted model; rows are indexed by
	# document number below.
	doc_topic = model.doc_topic_
	print("type(doc_topic): {}".format(type(doc_topic)))
	print("shape: {}".format(doc_topic.shape))


	# Print the row sum of the first five documents.
	for n in range(5):
	    sum_pr = sum(doc_topic[n, :])
	    print("document: {} sum: {}".format(n, sum_pr))


	# For the first ten documents, report the topic with the largest weight
	# followed by the first 50 characters of the title.
	for n in range(10):
	    topic_most_pr = doc_topic[n].argmax()
	    print("doc: {} topic: {}\n{}...".format(n,
	                                            topic_most_pr,
	                                            titles[n][:50]))


	import matplotlib.pyplot as plt

	# Prefer the 'ggplot' style sheet, but fall back to the default look when
	# it cannot be applied (the installed matplotlib might not be recent).
	try:
	    plt.style.use('ggplot')
	except Exception:
	    # Best-effort only.  Unlike a bare ``except:``, this still lets
	    # KeyboardInterrupt and SystemExit propagate.
	    pass


	def _stem_panels(matrix, rows, linefmt, markerfmt, xlim, ylim,
	                 title_fmt, xlabel):
	    """Show a figure with one stem plot per selected row of *matrix*.

	    Parameters
	    ----------
	    matrix : 2-D array indexed as ``matrix[row, :]``.
	    rows : sequence of row indices; one vertically stacked panel each.
	    linefmt, markerfmt : format strings forwarded to ``Axes.stem``.
	    xlim, ylim : ``(low, high)`` pairs applied to every panel.
	    title_fmt : format string receiving the row index for the panel title.
	    xlabel : label placed on the bottom panel only.

	    Returns
	    -------
	    The ``(figure, axes)`` pair created by ``plt.subplots``.
	    """
	    fig, axes = plt.subplots(len(rows), 1, figsize=(8, 6), sharex=True)
	    # subplots returns a bare Axes for a single panel; normalise to 1-D.
	    axes = np.atleast_1d(axes)
	    for panel, row in zip(axes, rows):
	        panel.stem(matrix[row, :], linefmt=linefmt,
	                   markerfmt=markerfmt, basefmt='w-')
	        panel.set_xlim(*xlim)
	        panel.set_ylim(*ylim)
	        panel.set_ylabel("Prob")
	        panel.set_title(title_fmt.format(row))

	    # Label the last panel, whatever the number of rows.
	    axes[-1].set_xlabel(xlabel)

	    plt.tight_layout()
	    plt.show()
	    return fig, axes


	# Word weights for five selected topics.
	f, ax = _stem_panels(topic_word, [0, 5, 9, 14, 19], 'b-', 'bo',
	                     (-50, 4350), (0, 0.08), "topic {}", "word")


	# Topic weights for five selected documents.
	f, ax = _stem_panels(doc_topic, [1, 3, 4, 8, 9], 'r-', 'ro',
	                     (-1, 21), (0, 1), "Document {}", "Topic")