Code to reproduce an exception raised in pyLDAvis.
import os
import codecs

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

import gensim
from gensim import corpora

import pyLDAvis
import pyLDAvis.gensim

# First, retrieve the documents
setDocs1 = []
allDocuments = []
for file_name in os.listdir("/home/vagrant/shared/Test/1"):
    with codecs.open("/home/vagrant/shared/Test/1/" + file_name, "r", "utf-8") as f:
        aux = f.read()
    setDocs1.append(aux)
    allDocuments.append(aux)
setDocs2 = []
for file_name in os.listdir("/home/vagrant/shared/Test/2"):
    with codecs.open("/home/vagrant/shared/Test/2/" + file_name, "r", "utf-8") as f:
        aux = f.read()
    setDocs2.append(aux)
    allDocuments.append(aux)
# Build dictionary and corpora
texts1 = []
texts2 = []
all_texts = []
tokenizer = RegexpTokenizer(r'\w+')
stoplist_tw = ['amp', 'get', 'got', 'hey', 'hmm', 'hoo', 'hop', 'iep', 'let', 'ooo', 'par',
               'pdt', 'pln', 'pst', 'wha', 'yep', 'yer', 'aest', 'didn', 'nzdt', 'via',
               'one', 'com', 'new', 'like', 'great', 'make', 'top', 'awesome', 'best',
               'good', 'wow', 'yes', 'say', 'yay', 'would', 'thanks', 'thank', 'going',
               'new', 'use', 'should', 'could', 'best', 'really', 'see', 'want', 'nice',
               'while', 'know']
# collect 1- and 2-character tokens so they can be treated as stopwords
unigrams = [w for doc in allDocuments for w in tokenizer.tokenize(doc.lower()) if len(w) == 1]
bigrams = [w for doc in allDocuments for w in tokenizer.tokenize(doc.lower()) if len(w) == 2]
en_stop = set(nltk.corpus.stopwords.words("english") + stoplist_tw
              + unigrams + bigrams)
p_stemmer = PorterStemmer()
# loop through the first document list
for doc in setDocs1:
    # clean and tokenize the document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [t for t in tokens if t not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
    # add tokens to the lists
    texts1.append(stemmed_tokens)
    all_texts.append(stemmed_tokens)
for doc in setDocs2:
    # clean and tokenize the document string
    raw = doc.lower()
    tokens = tokenizer.tokenize(raw)
    # remove stop words from tokens
    stopped_tokens = [t for t in tokens if t not in en_stop]
    # stem tokens
    stemmed_tokens = [p_stemmer.stem(t) for t in stopped_tokens]
    # add tokens to the lists
    texts2.append(stemmed_tokens)
    all_texts.append(stemmed_tokens)
# turn our tokenized documents into an id <-> term dictionary
dictionary = corpora.Dictionary(all_texts)
# convert the tokenized documents into document-term matrices
corpus1 = [dictionary.doc2bow(text) for text in texts1]
corpus2 = [dictionary.doc2bow(text) for text in texts2]
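# Sanity check (an assumption about the failure, not a confirmed diagnosis):
# pyLDAvis's prepare() can fail when a corpus contains empty bag-of-words
# documents, e.g. files whose tokens were all stopwords. A quick check of
# both corpora before visualising:
for name, corp in (("corpus1", corpus1), ("corpus2", corpus2)):
    empty = [idx for idx, bow in enumerate(corp) if len(bow) == 0]
    if empty:
        print(name, "has empty documents at indices:", empty)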
# Now, I create two LDA models: one for corpus1 and another for corpus2.
lda_model_1 = gensim.models.ldamodel.LdaModel(corpus1, num_topics=3, id2word=dictionary, passes=10, alpha=0.001)
# I can see that it's working by doing this:
for i in range(3):
    print(i)
    for tup in lda_model_1.get_topic_terms(i):
        print(dictionary[tup[0]] + ' ' + str(tup[1]))
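# get_topic_terms(i) returns (term_id, probability) pairs for the top terms
# of topic i, so each line above prints a term and its weight, e.g. "stem 0.052".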
# The result in my case is what I expected.
# But now, when I execute the following, an exception arises:
data1 = pyLDAvis.gensim.prepare(lda_model_1, corpus1, dictionary)  # EXCEPTION HERE <--------
pyLDAvis.display(data1)
# However, the 2nd model works great. Does somebody know what I'm doing wrong?
# I don't know why one model works great but the other doesn't.
lda_model_2 = gensim.models.ldamodel.LdaModel(corpus2, num_topics=3, id2word=dictionary, passes=10, alpha=0.001)
data2 = pyLDAvis.gensim.prepare(lda_model_2, corpus2, dictionary)
pyLDAvis.display(data2)
# This one works well.
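# Usage note (an addition, not part of the original repro): pyLDAvis.display()
# only renders inline inside a Jupyter/IPython notebook, and typically needs
# pyLDAvis.enable_notebook() to have been called once beforehand. From a plain
# script, pyLDAvis.show(data2) serves the visualisation in a browser instead.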