Skip to content

Instantly share code, notes, and snippets.

@nkt1546789
Last active August 29, 2016 14:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nkt1546789/128ba21445cbbcc57e8098e8341ee431 to your computer and use it in GitHub Desktop.
Save nkt1546789/128ba21445cbbcc57e8098e8341ee431 to your computer and use it in GitHub Desktop.
Single topic unigram generator in Python.
import numpy as np
from scipy import sparse
class SingleTopicUnigramGenerator(object):
    """Sample bag-of-words documents from a single-topic unigram mixture.

    Every document is assigned exactly one topic, and all of its word
    counts are drawn from that topic's word distribution.

    Parameters
    ----------
    n_topics : int
        Number of topics.
    n_features : int
        Vocabulary size, i.e. the number of columns of the generated matrix.
    alpha : float
        Symmetric Dirichlet concentration for the topic proportions.
    beta : float
        Symmetric Dirichlet concentration for each topic's word distribution.
    """

    def __init__(self, n_topics=3, n_features=1000, alpha=1.0, beta=1.0):
        self.n_topics = n_topics
        self.n_features = n_features
        self.alpha = alpha
        self.beta = beta

    def generate(self, n_docs=200, min_length=100, max_length=100):
        """Draw a random corpus and the topic label of each document.

        Parameters
        ----------
        n_docs : int
            Number of documents (rows) to generate.
        min_length, max_length : number
            Bounds for the document length. Each length is drawn uniformly
            from the real interval between them and truncated to an integer,
            so ``max_length`` itself is only produced when it equals
            ``min_length``.

        Returns
        -------
        W : scipy.sparse.csr_matrix
            ``(n_docs, n_features)`` float64 matrix of word counts.
        z : numpy.ndarray
            ``(n_docs,)`` array with the topic index of each document.

        Notes
        -----
        Uses the global ``np.random`` state; seed it with ``np.random.seed``
        for reproducible output.
        """
        # Topic proportions come from the topic prior ``alpha``; ``beta`` is
        # reserved for the per-topic word distributions below.
        theta = np.random.dirichlet(np.repeat(self.alpha, self.n_topics), 1)[0]

        # One word distribution (row) per topic.
        Phi = np.random.dirichlet(
            np.repeat(self.beta, self.n_features), self.n_topics)

        # One topic per document: draw a one-hot vector and take its index.
        z = np.random.multinomial(1, pvals=theta, size=n_docs).argmax(axis=1)

        # Preallocate so the result shape is fixed explicitly rather than
        # inferred from a list of per-document rows.
        counts = np.zeros((n_docs, self.n_features), dtype=np.float64)
        for d in range(n_docs):
            length = int(np.random.uniform(min_length, max_length))
            counts[d] = np.random.multinomial(length, pvals=Phi[z[d]])

        W = sparse.csr_matrix(counts)
        return W, z
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment