LDA: Latent Dirichlet Allocation via collapsed Gibbs sampling, with a 20 Newsgroups demo.

demo_20news.py
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from lda_gibbs import Model

def get_vectors(vocab_size=5000):
    newsgroups_train = fetch_20newsgroups(subset='train')
    vectorizer = CountVectorizer(max_df=.9, max_features=vocab_size)
    vecs = vectorizer.fit_transform(newsgroups_train.data)
    # vocabulary_ maps each term to its column index; invert it so that
    # inverse_vocabulary[index] gives the term.
    vocabulary = vectorizer.vocabulary_
    terms = np.array(vocabulary.keys())
    indices = np.array(vocabulary.values())
    inverse_vocabulary = terms[np.argsort(indices)]
    return vecs.tocsr(), inverse_vocabulary, newsgroups_train
 
 
def train_model(mat, topicConcentrationInDocuments, wordConcentrationInTopics, numTopics):
    model = Model(mat, numTopics=numTopics,
                  topicConcentrationInDocuments=topicConcentrationInDocuments,
                  wordConcentrationInTopics=wordConcentrationInTopics)
    logprobs = []
    logProb, topicCountsInDocs, wordCountsInTopics = model.bestSample(250, 5, logprobs)
    return dict(
        model=model,
        bestLogProb=logProb,
        topicCountsInDocs=topicCountsInDocs,
        wordCountsInTopics=wordCountsInTopics,
        allLogProbs=logprobs)
 
def summarizeByWords(model, wordCountsInTopics, terms, n=20):
    wordDistForTopic = model.wordDistribution(wordCountsInTopics)
    for topic, dist in enumerate(wordDistForTopic):
        topTerms = dist.argsort()[-n:][::-1]
        print "{}: {}".format(topic, ', '.join(terms[term] for term in topTerms))
 
def summarizeByDocs(model, topicCountsInDocs, docids, n=5):
    topicDistForDoc = model.topicDistribution(topicCountsInDocs)
    for topic, docs in enumerate(topicDistForDoc.T):
        topDocs = docs.argsort()[-n:][::-1]
        print "{}: {}".format(topic, ', '.join('{}({:.2})'.format(docids[doc], docs[doc])
                                               for doc in topDocs))
 
def summarize_model(model, terms, dataset, wordCountsInTopics, topicCountsInDocs):
    from matplotlib import pyplot as plt
    summarizeByWords(model, wordCountsInTopics, terms)
    # Plot the mean topic distribution over the documents of each newsgroup.
    topicDist = model.topicDistribution(topicCountsInDocs)
    plt.clf()
    x = np.arange(20)
    for newsgroup in range(20):
        plt.subplot(5, 4, newsgroup+1)
        plt.bar(x, topicDist[dataset.target == newsgroup].mean(axis=0))
        if newsgroup < 16: plt.xticks(visible=False)
        plt.yticks(visible=False)
        plt.title(dataset.target_names[newsgroup])
 
# Run this with something like
# vecs, terms, dataset = get_vectors()
# globals().update(train_model(vecs, 1, 1, 20))
# summarize_model(model, terms, dataset, wordCountsInTopics, topicCountsInDocs)
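# summarizeByDocs is never called above; a hypothetical invocation, using
# row indices as document ids (any sequence of ids would work):
# summarizeByDocs(model, topicCountsInDocs, np.arange(vecs.shape[0]))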
lda_gibbs.py
import numpy as np
from scipy.special import gammaln
from itertools import islice
# Latent Dirichlet Allocation using collapsed Gibbs sampling
#
# Based on the insight of Griffiths and Steyvers (2004) that you can
# analytically integrate out the distributions of words in a topic and
# of topics in a document.
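#
# Each Gibbs sweep resamples every word token's topic assignment from its
# conditional distribution given all the other assignments (this is exactly
# the update doSample computes below):
#
#   P(z_i = k | z_-i, w)  is proportional to
#       (n[k, w_i] + eta) / (n[k] + V*eta) * (n[d, k] + alpha)
#
# where n[k, w_i] counts assignments of word w_i to topic k, n[k] is the
# total word count in topic k, and n[d, k] counts words in document d
# assigned to topic k, all excluding the token being resampled.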
 
def multinomialDraw(dist):
    """Returns a single draw from the given multinomial distribution."""
    return np.random.multinomial(1, dist).argmax()
 
def iterWords(mat, row):
    """Given a row of word counts in a document, yield each word the given number of times."""
    for ind in xrange(mat.indptr[row], mat.indptr[row+1]):
        word = mat.indices[ind]
        for i in xrange(mat.data[ind]):
            yield word
 
class Model(object):
    def logProb(self):
        # Log of the collapsed joint probability: with the multinomials
        # integrated out, each document contributes a Dirichlet-multinomial
        # term over topics, and each topic one over the vocabulary.
        # Omits the constant factors D*(gammaln(K*alpha)-K*gammaln(alpha))
        # and K*(gammaln(V*eta)-V*gammaln(eta)).
        topicTerm = self.topicConcentrationInDocuments + self.topicCountsInDocs
        vocabTerm = self.wordConcentrationInTopics + self.wordCountsInTopics
        return ((gammaln(topicTerm).sum(axis=0) - gammaln(topicTerm.sum(axis=0))).sum() +
                (gammaln(vocabTerm).sum(axis=0) - gammaln(vocabTerm.sum(axis=0))).sum())
    def __init__(self, documentWordCounts, numTopics,
                 topicConcentrationInDocuments, wordConcentrationInTopics):
        self.topicConcentrationInDocuments = float(topicConcentrationInDocuments)  # alpha
        self.wordConcentrationInTopics = float(wordConcentrationInTopics)  # eta
        self.numTopics = numTopics
        numDocs, numVocab = documentWordCounts.shape
        # We initialize counts as we see documents for the first time.
        self.topicCountsInDocs = np.zeros((numTopics, numDocs))
        self.wordCountsInTopics = np.zeros((numVocab, numTopics))
        self.numWordsInTopic = np.zeros(numTopics)
        self.topicAssignments = [np.zeros(doc.sum(), dtype=np.uint16) for doc in documentWordCounts]
        self.documentWordCounts = documentWordCounts
        self.sampleNum = 0
 
    def doSample(self):
        topicCountsInDocs = self.topicCountsInDocs
        wordCountsInTopics = self.wordCountsInTopics
        topicAssignments = self.topicAssignments
        wordConcentrationInTopics = self.wordConcentrationInTopics
        topicConcentrationInDocuments = self.topicConcentrationInDocuments
        numVocab, numTopics = wordCountsInTopics.shape
        numDocs = self.documentWordCounts.shape[0]
        print self.sampleNum
        for doc in xrange(numDocs):
            # Resample the topic assignment for each word in this document.
            for i, word in enumerate(iterWords(self.documentWordCounts, doc)):
                # On the first pass, words have not yet been assigned to topics.
                if self.sampleNum > 0:
                    prevTopic = topicAssignments[doc][i]
                    #assert wordCountsInTopics[word,prevTopic] > 0
                    wordCountsInTopics[word, prevTopic] -= 1
                    #assert topicCountsInDocs[prevTopic,doc] > 0
                    topicCountsInDocs[prevTopic, doc] -= 1
                    #assert self.numWordsInTopic[prevTopic] > 0
                    self.numWordsInTopic[prevTopic] -= 1
                p_wordInTopic = wordCountsInTopics[word, :] + wordConcentrationInTopics
                p_wordInTopic /= (self.numWordsInTopic + numVocab*wordConcentrationInTopics)
                p_topicInDoc = topicCountsInDocs[:, doc] + topicConcentrationInDocuments
                dist = p_wordInTopic * p_topicInDoc
                topic = multinomialDraw(dist/dist.sum())

                wordCountsInTopics[word, topic] += 1
                topicCountsInDocs[topic, doc] += 1
                self.numWordsInTopic[topic] += 1
                topicAssignments[doc][i] = topic
        self.sampleNum += 1
 
    def iterSamples(self, burnin, lag):
        """Yields samples from the posterior distribution of topic
        counts in documents and word counts in topics.

        burnin: number of samples to compute before starting to yield them
        lag: number of samples between each pair of valid samples

        Yields: (logProb, topicCountsInDocs, wordCountsInTopics)

        Copies of the count arrays are yielded, so they remain valid
        after subsequent samples modify the model's own arrays.
        """
        assert burnin >= lag
        for i in xrange(burnin-lag):
            self.doSample()
        while True:
            for j in xrange(lag+1):
                self.doSample()
            yield self.logProb(), self.topicCountsInDocs.copy(), self.wordCountsInTopics.copy()
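    # For example, iterSamples(burnin=10, lag=2) runs 8 sweeps up front, then
    # yields a sample after every 3rd sweep from then on (so the first sample
    # arrives after 11 sweeps total).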
 
    def drawSamples(self, burnin, nSamples, lag):
        logProbs = np.zeros(nSamples)*np.nan
        samples = []
        for i, (logProb, topicCountsInDocs, wordCountsInTopics) in enumerate(
                islice(self.iterSamples(burnin, lag), nSamples)):
            samples.append((topicCountsInDocs, wordCountsInTopics))
            logProbs[i] = logProb
            print "Sample {}, logProb {}".format(i, logProbs[i])
        return samples, logProbs
 
    def bestSample(self, n, burnin=0, logprobs=None):
        """Returns the sample with the best posterior probability over n samples."""
        bestSamp = -np.inf, None, None
        for logProb, topicCountsInDocs, wordCountsInTopics in (
                islice(self.iterSamples(burnin, 0), n)):
            if logProb > bestSamp[0]:
                bestSamp = logProb, topicCountsInDocs, wordCountsInTopics
            if logprobs is None:
                print logProb
            else:
                logprobs.append(logProb)
        return bestSamp
 
    def wordDistribution(self, wordCountsInTopics_sample):
        """Return the predictive distribution of words in a topic given a sample."""
        dist = wordCountsInTopics_sample.T + self.wordConcentrationInTopics
        dist /= dist.sum(axis=1)[:, np.newaxis]
        return dist

    def topicDistribution(self, topicCountsInDocs_sample):
        """Return the predictive distribution of topics in a document given a sample."""
        dist = topicCountsInDocs_sample.T + self.topicConcentrationInDocuments
        dist /= dist.sum(axis=1)[:, np.newaxis]
        return dist
 
def dirichletDraw(alpha, N):
    return np.random.dirichlet(np.ones(N)*alpha)
 
def exampleDataset(n, topicConcentrationInDocuments, docLength):
    """Generate n synthetic documents from 10 'bar' topics on a 5x5
    vocabulary grid (5 horizontal and 5 vertical bars)."""
    from scipy.sparse import vstack, csr_matrix
    topics = np.zeros((10, 25))
    for i in range(10):
        topic = np.zeros((5, 5))
        if i < 5:
            topic[i, :] = 1
        else:
            topic[:, i-5] = 1
        topic = topic.ravel()
        topic /= np.sum(topic)
        topics[i, :] = topic
    docs = []
    for i in xrange(n):
        doc = np.zeros(25, dtype=np.uint8)
        dist = dirichletDraw(topicConcentrationInDocuments, 10)
        for j in xrange(docLength):
            topic = multinomialDraw(dist)
            word = multinomialDraw(topics[topic])
            doc[word] += 1
        docs.append(csr_matrix(doc))
    # Stack the rows into a single sparse matrix, as Model expects.
    return topics, vstack(docs).tocsr()
 
def showTopics(dist):
    from matplotlib import pyplot as plt
    plt.clf()
    for i in range(10):
        plt.subplot(2, 5, i+1)
        plt.imshow(dist[i].reshape(5, 5), interpolation='nearest', cmap='gray')
 
 
def demo(n, topicConcentrationInDocuments, nSamples, lag):
    topics, docs = exampleDataset(n, topicConcentrationInDocuments, docLength=100)
    lda = Model(docs, numTopics=15,
                topicConcentrationInDocuments=topicConcentrationInDocuments,
                wordConcentrationInTopics=.1)
    samples, logProbs = lda.drawSamples(
        burnin=10, nSamples=nSamples, lag=lag)
    return locals()
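
# Run this with something like (parameter values here are illustrative):
# results = demo(500, 1, nSamples=20, lag=2)
# showTopics(results['lda'].wordDistribution(results['samples'][-1][1]))
# showTopics(results['topics'])  # compare against the true topics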
