Code to reproduce an exception raised in pyLDAvis.
import os
import codecs

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

import gensim
from gensim import corpora

import pyLDAvis
import pyLDAvis.gensim

# First, retrieve the documents
setDocs1 = []
allDocuments = []
for file_name in os.listdir("/home/vagrant/shared/Test/1"):
    with codecs.open("/home/vagrant/shared/Test/1/" + file_name, "r", "utf-8") as f:
        aux = f.read()
    setDocs1.append(aux)
    allDocuments.append(aux)
setDocs2 = []
for file_name in os.listdir("/home/vagrant/shared/Test/2"):
    with codecs.open("/home/vagrant/shared/Test/2/" + file_name, "r", "utf-8") as f:
        aux = f.read()
    setDocs2.append(aux)
    allDocuments.append(aux)
# Build dictionary and corpora
texts1 = []
texts2 = []
all_texts = []
tokenizer = RegexpTokenizer(r'\w+')
stoplist_tw = ['amp', 'get', 'got', 'hey', 'hmm', 'hoo', 'hop', 'iep', 'let', 'ooo', 'par',
               'pdt', 'pln', 'pst', 'wha', 'yep', 'yer', 'aest', 'didn', 'nzdt', 'via',
               'one', 'com', 'new', 'like', 'great', 'make', 'top', 'awesome', 'best',
               'good', 'wow', 'yes', 'say', 'yay', 'would', 'thanks', 'thank', 'going',
               'new', 'use', 'should', 'could', 'best', 'really', 'see', 'want', 'nice',
               'while', 'know']
# collect 1- and 2-character tokens so they can be treated as stopwords
unigrams = [w for doc in allDocuments for w in tokenizer.tokenize(doc.lower()) if len(w) == 1]
bigrams = [w for doc in allDocuments for w in tokenizer.tokenize(doc.lower()) if len(w) == 2]
en_stop = set(nltk.corpus.stopwords.words("english") + stoplist_tw
              + unigrams + bigrams)
p_stemmer = PorterStemmer()
# loop through the first document list
for doc in setDocs1:
    # clean and tokenize the document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [t for t in tokens if t not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
    # add tokens to the lists
    texts1.append(stemmed_tokens)
    all_texts.append(stemmed_tokens)
for doc in setDocs2:
    # clean and tokenize the document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [t for t in tokens if t not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
    # add tokens to the lists
    texts2.append(stemmed_tokens)
    all_texts.append(stemmed_tokens)
# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(all_texts)
# convert the tokenized documents into document-term matrices
corpus1 = [dictionary.doc2bow(text) for text in texts1]
corpus2 = [dictionary.doc2bow(text) for text in texts2]
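# Sanity check (an assumption about the failure, not a confirmed diagnosis):
# pyLDAvis's prepare() can fail when a corpus contains empty bag-of-words
# documents, e.g. files whose tokens were all stopwords. A quick check of
# both corpora before visualising:
for name, corp in (("corpus1", corpus1), ("corpus2", corpus2)):
    empty = [idx for idx, bow in enumerate(corp) if len(bow) == 0]
    if empty:
        print(name, "has empty documents at indices:", empty)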
# Now, I create two LDA models: one for corpus1 and another for corpus2.
lda_model_1 = gensim.models.ldamodel.LdaModel(corpus1, num_topics=3, id2word=dictionary, passes=10, alpha=0.001)
# I can see that it's working by doing this:
for i in range(3):
    print(i)
    for tup in lda_model_1.get_topic_terms(i):
        print(dictionary[tup[0]] + ' ' + str(tup[1]))
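# get_topic_terms(i) returns (term_id, probability) pairs for the top terms
# of topic i, so each line above prints a term and its weight, e.g. "stem 0.052".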
# The result in my case is what I expected.
# But now, when I execute the following, an exception arises:
data1 = pyLDAvis.gensim.prepare(lda_model_1, corpus1, dictionary)  # EXCEPTION HERE <--------
pyLDAvis.display(data1)
# However, the 2nd model works great. Does somebody know what I'm doing wrong?
# I don't know why one model works great but the other doesn't.
lda_model_2 = gensim.models.ldamodel.LdaModel(corpus2, num_topics=3, id2word=dictionary, passes=10, alpha=0.001)
data2 = pyLDAvis.gensim.prepare(lda_model_2, corpus2, dictionary)
pyLDAvis.display(data2)
# This one works well.
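# Usage note (an addition, not part of the original repro): pyLDAvis.display()
# only renders inline inside a Jupyter/IPython notebook, and typically needs
# pyLDAvis.enable_notebook() to have been called once beforehand. From a plain
# script, pyLDAvis.show(data2) serves the visualisation in a browser instead.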