Skip to content

Instantly share code, notes, and snippets.

@Couhp
Created August 18, 2017 09:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Couhp/6722b33fd9ce212dcc5f345354915278 to your computer and use it in GitHub Desktop.
Save Couhp/6722b33fd9ce212dcc5f345354915278 to your computer and use it in GitHub Desktop.
LDA in Python
# -*- coding: utf-8 -*-
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
from os import listdir
from os.path import isfile, join
import time
import logging
# Run counter: the file LDA_Log/Logging holds the current run number as text.
# Read it, then write the incremented value back for the next run.  Each
# access uses a `with` block so the handle is closed before the file is
# reopened in write mode.
with open("LDA_Log/Logging", "r") as counter_file:
    # Strip surrounding whitespace so a trailing newline in the counter file
    # cannot end up inside the log filename built from log_number below.
    log_number = counter_file.read().strip()
with open("LDA_Log/Logging", "w") as update_log:
    update_log.write(str(int(log_number) + 1))
# Uncomment to write INFO-level log records to a per-run log file.
#logging.basicConfig(filename='LDA_Log/lda_model_' + log_number + '.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# NOTE(review): tokenizer is not used anywhere in the visible script.
tokenizer = RegexpTokenizer(r'\w+')
# Directory that holds the training documents; each regular file in it is
# read as one document.
mypath = "training"
listFile = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# Read every file into doc_set (one string per file) and time the load.
doc_set = []
start_time = time.time()
for inputFile in listFile:
    # `with` closes each handle right after reading; without it one file
    # descriptor per document stays open for the rest of the run.
    with open(join(mypath, inputFile), "r") as f:
        inputStr = f.read()
    # Documents shorter than 30 characters are doubled (joined by a space).
    # NOTE(review): the original indentation was lost; the append is assumed
    # to apply to every document, not only to the short ones.
    if len(inputStr) < 30:
        inputStr += " " + inputStr
    doc_set.append(inputStr)
end_time = time.time()
print ("Time load : ", (end_time - start_time)*1000 , " ms")
print ("The number of documents", len(doc_set))
# Tokenize every document on whitespace.  str.split() with no argument
# discards the empty strings that repeated spaces produce and does not leave
# newlines attached to tokens; split(" ") would put both kinds of junk
# token into the vocabulary.
texts = [doc.split() for doc in doc_set]
# Build the id <-> term dictionary from the tokenized documents and save it
# to disk next to the model.
dictionary = corpora.Dictionary(texts)
dictionary.save("dictionary.dict")
# Convert each tokenized document into its bag-of-words representation.
corpus = [dictionary.doc2bow(text) for text in texts]
# Train the LDA model on the bag-of-words corpus and time the training.
lda_options = dict(
    num_topics=380,
    id2word=dictionary,
    passes=10,
    eval_every=5,
    workers=5,
)
start_time = time.time()
ldamodel = models.ldamulticore.LdaMulticore(corpus, **lda_options)
end_time = time.time()
print ("Time load : ", (end_time - start_time) , " s")
# Print the topics returned by print_topics (num_topics=-1 requests all of
# them).
topics = ldamodel.print_topics(num_topics=-1)
for topic in topics:
    print ("Topic : ", topic)
# Save the trained model so it can be reloaded later without retraining.
ldamodel.save("ldamodel.model")
# Example of querying the model for the first few training documents:
# for i in range(5):
#     print (ldamodel.get_document_topics(corpus[i]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment