Code to reproduce an exception raised inside pyLDAvis.
# Imports required to run this snippet
import os
import codecs
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim

# First, retrieve the documents
setDocs1 = []
allDocuments = []
for file_name in os.listdir("/home/vagrant/shared/Test/1"):
    with codecs.open("/home/vagrant/shared/Test/1/" + file_name, "r", "utf-8") as f:
        aux = f.read()
    setDocs1.append(aux)
    allDocuments.append(aux)

setDocs2 = []
for file_name in os.listdir("/home/vagrant/shared/Test/2"):
    with codecs.open("/home/vagrant/shared/Test/2/" + file_name, "r", "utf-8") as f:
        aux = f.read()
    setDocs2.append(aux)
    allDocuments.append(aux)
# Build dictionary and corpora
texts1 = []
texts2 = []
all_texts = []
tokenizer = RegexpTokenizer(r'\w+')
stoplist_tw = ['amp', 'get', 'got', 'hey', 'hmm', 'hoo', 'hop', 'iep', 'let',
               'ooo', 'par', 'pdt', 'pln', 'pst', 'wha', 'yep', 'yer', 'aest',
               'didn', 'nzdt', 'via', 'one', 'com', 'new', 'like', 'great',
               'make', 'top', 'awesome', 'best', 'good', 'wow', 'yes', 'say',
               'yay', 'would', 'thanks', 'thank', 'going', 'new', 'use',
               'should', 'could', 'best', 'really', 'see', 'want', 'nice',
               'while', 'know']
# Collect 1- and 2-character tokens so they can be treated as stop words.
# (allDocuments holds raw strings, so tokenize first; iterating a string
# directly would yield single characters rather than words.)
short = [w for doc in allDocuments for w in tokenizer.tokenize(doc.lower())]
unigrams = [w for w in short if len(w) == 1]
bigrams = [w for w in short if len(w) == 2]
en_stop = set(nltk.corpus.stopwords.words("english") + stoplist_tw
              + unigrams + bigrams)
p_stemmer = PorterStemmer()
# Loop through the first document list
for i in setDocs1:
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [t for t in tokens if t not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
    # add tokens to the lists
    texts1.append(stemmed_tokens)
    all_texts.append(stemmed_tokens)
for i in setDocs2:
    # clean and tokenize document string
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [t for t in tokens if t not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
    # add tokens to the lists
    texts2.append(stemmed_tokens)
    all_texts.append(stemmed_tokens)
# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(all_texts)
# convert tokenized documents into a document-term matrix
corpus1 = [dictionary.doc2bow(text) for text in texts1]
corpus2 = [dictionary.doc2bow(text) for text in texts2]
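# (Illustrative sketch, not part of the original gist.) doc2bow yields a
# sparse bag-of-words vector: a list of (token_id, count) pairs. Printing a
# few pairs from the first document makes the dictionary <-> corpus mapping
# concrete:
if corpus1:
    for token_id, count in corpus1[0][:5]:
        print dictionary[token_id], count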
# Now, I create two LDA models: one for corpus1 and one for corpus2.
lda_model_1 = gensim.models.ldamodel.LdaModel(corpus1, num_topics=3, id2word=dictionary, passes=10, alpha=0.001)
# I can see that it's working by doing this:
for i in xrange(3):
    print i
    for tup in lda_model_1.get_topic_terms(i):
        print dictionary[tup[0]] + ' ' + str(tup[1])
# The result in my case is what I expected.
# But when I execute the following, an exception is raised:
data1 = pyLDAvis.gensim.prepare(lda_model_1, corpus1, dictionary) # EXCEPTION HERE <--------------------------------------------
pyLDAvis.display(data1)
# However, the 2nd model works fine. Does somebody know what I am doing wrong?
# I don't know why one model works but the other doesn't.
lda_model_2 = gensim.models.ldamodel.LdaModel(corpus2, num_topics=3, id2word=dictionary, passes=10, alpha=0.001)
data2 = pyLDAvis.gensim.prepare(lda_model_2, corpus2, dictionary)
pyLDAvis.display(data2)
# It works well.
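# (Hedged diagnostic sketch, not part of the original gist.) One asymmetry
# worth checking: `dictionary` is built from BOTH corpora, but prepare() is
# called with only corpus1, so some dictionary terms may never occur in
# corpus1 at all. pyLDAvis derives term frequencies from the corpus it is
# given, and such zero-frequency terms are a plausible source of the error.
ids_in_corpus1 = set(term_id for doc in corpus1 for term_id, _ in doc)
missing = [dictionary[t] for t in dictionary.keys() if t not in ids_in_corpus1]
print '%d dictionary terms never appear in corpus1' % len(missing)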