@sangheestyle
Last active October 19, 2017 18:39
Analyzing books in Project Gutenberg with LDA
import logging
import os
import sys
import zipfile
import multiprocessing
from subprocess import call

from gensim.corpora.textcorpus import TextCorpus
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel
from gensim import utils
def get_list_of_files(root=None, file_ext=None):
    """
    a. traverse directories
    b. make a list of file paths which have the given file extension
    c. return the list
    """
    filename_list = []
    for root, dirs, files in os.walk(root):
        for f in files:
            if f.endswith(file_ext):
                filename_list.append(os.path.join(root, f))
    return filename_list
def get_zip_file_size(file_path=None):
    file_name = os.path.basename(file_path)
    file_name, file_extension = os.path.splitext(file_name)
    return_value = None
    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            for i in zf.infolist():
                if i.filename == file_name + ".txt":
                    return_value = i.file_size
    except:
        e = sys.exc_info()[0]
        print "ERROR:", e, file_path
    return return_value
def get_filtered_zip_files(root=None,
                           file_ext=None,
                           number_of_files=None,
                           min_text_size=None,
                           max_text_size=None):
    files_list = get_list_of_files(root, file_ext)
    filtered_list = []
    for file_path in files_list:
        if len(filtered_list) >= number_of_files:
            break
        file_size = get_zip_file_size(file_path)
        if file_size > min_text_size and file_size < max_text_size:
            filtered_list.append(file_path)
    print ">>> number of files:", len(filtered_list)
    return filtered_list
def read_zip_file(file_path=None):
    """
    a. read a zip file including a text file
    b. return the text
    """
    try:
        # FIXME: work around
        call(['unzip', '-o', file_path])
    except:
        e = sys.exc_info()[0]
        print ">>> ERROR:", e
    file_name = os.path.basename(file_path)
    file_name, ext = os.path.splitext(file_name)
    unzipped_text_file_name = file_name + ".txt"
    with open(unzipped_text_file_name, 'rb') as fp:
        text = fp.read()
    os.remove(unzipped_text_file_name)
    return text
def process_text(filename):
    text = read_zip_file(filename)
    if text is not None:
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.lemmatize(text)
    else:
        text = []
    return [filename, text]
class GutenbergCorpus(TextCorpus):
    def __init__(self, input=None):
        self.processes = max(1, multiprocessing.cpu_count())
        self.iteration = 0
        self.filenames = []
        super(GutenbergCorpus, self).__init__(input)

    def get_texts(self):
        self.iteration += 1
        pool = multiprocessing.Pool(self.processes)
        file_names = get_filtered_zip_files(self.input, '.zip', 20000, 100000, 700000)
        for index, item in enumerate(pool.imap(process_text, file_names)):
            print ">> processing", index + 1, "/", len(file_names)
            if self.iteration >= 2:
                self.filenames.append(item[0])
            yield item[1]
DEFAULT_DICT_SIZE = 100000

if __name__ == '__main__':
    import sys
    import logging
    import gensim
    import bz2

    root = '../www.gutenberg.lib.md.us'
    prefix = 'gutenberg'
    gutenberg = GutenbergCorpus(root)
    """
    gutenberg.dictionary.filter_extremes(no_below=10,
                                         no_above=0.2,
                                         keep_n=DEFAULT_DICT_SIZE)
    """
    MmCorpus.serialize(prefix + '_bow.mm', gutenberg, progress_cnt=10000)
    gutenberg.dictionary.save_as_text(prefix + '_wordids.txt.bz2')
    with open('gutenberg_filename.txt', 'wb') as f:
        for filename in gutenberg.filenames:
            print >> f, filename
    dictionary = Dictionary.load_from_text(prefix + '_wordids.txt.bz2')
    del gutenberg
    mm = MmCorpus(prefix + '_bow.mm')
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize(prefix + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
    mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
    lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=20, chunksize=100)
    lda.save('gutenberg_idf.model')
import gensim

def get_top_most_topic(topic_rate_pair):
    trp = sorted(topic_rate_pair, key=lambda item: item[1], reverse=True)
    top_most_topic = trp[0]
    return top_most_topic

mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
lda = gensim.models.ldamodel.LdaModel.load('gutenberg_tfidf_lda.model')
doc2topic = lda[mm]
doc_topic_list = []
for topic_rate_pair in doc2topic:
    top_most_topic = get_top_most_topic(topic_rate_pair)
    doc_topic_list.append(top_most_topic)
    print top_most_topic[0]
import logging, gensim, bz2

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
lda = gensim.models.ldamodel.LdaModel(corpus=mm,
                                      id2word=id2word,
                                      num_topics=50,
                                      update_every=1,
                                      chunksize=100,
                                      passes=10)
lda.print_topics()
lda.save('gutenberg_tfidf_lda.model')
@sangheestyle

I have downloaded 62,869 files containing books written in English in plain-text format.

sanghee@stylepoint-linux:~/dev/gutenberg$ time wget -w 2 -m -H "http://www.gutenberg.org/robot/harvest?filetypes[]=txt&langs[]=en"

FINISHED --2014-04-17 05:18:24--
Total wall clock time: 1d 14h 18m 53s
Downloaded: 62869 files, 9.1G in 2h 12m 39s (1.17 MB/s)

real    2298m53.045s
user    0m48.768s
sys 2m47.387s

@sangheestyle

Challenge

a. The format of the Gutenberg texts makes it hard to slice out only the book contents without the Project Gutenberg boilerplate (e.g., the legal notice). That boilerplate degrades the quality of the generated topics, so I need to find a way to strip it before feeding the texts into the LDA model (see the sketch below).

b. How many topics are appropriate for LDA? (e.g., 100)
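
One possible way to handle (a), shown only as a minimal sketch rather than the method used in this gist: cut everything outside the standard "*** START OF ..." / "*** END OF ..." marker lines. The helper name is hypothetical and the marker wording varies between books, so this is just a rough heuristic.

# Sketch only: strip the Project Gutenberg header/footer using the usual
# "*** START OF ..." and "*** END OF ..." marker lines. Marker wording
# varies between books, so treat this as a heuristic, not a guarantee.
def strip_gutenberg_boilerplate(text):
    lines = text.splitlines()
    start, end = 0, len(lines)
    for i, line in enumerate(lines):
        if "*** START OF" in line.upper():
            start = i + 1
        elif "*** END OF" in line.upper():
            end = i
            break
    return "\n".join(lines[start:end])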

@sangheestyle

After generating the dictionary and tf-idf corpus, run the following code to train the LDA model.

import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
print(mm)
lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=10000, passes=1)
lda.print_topics(20)

@sangheestyle

Topics (n = 10, doc = 20,000)

2014-04-21 21:13:03,857 : INFO : topic #0 (0.100): 0.000*hazael/NN + 0.000*naaman/NN + 0.000*elisha/VB + 0.000*hamath/NN + 0.000*rimmon/NN + 0.000*rimmon/JJ + 0.000*gioconda/NN + 0.000*jordan/VB + 0.000*puccini/NN + 0.000*joash/NN
2014-04-21 21:13:03,863 : INFO : topic #1 (0.100): 0.007*lotus/NN + 0.006*nightingale/NN + 0.005*annabel/NN + 0.004*rupert/NN + 0.003*satan/NN + 0.003*jerusalem/NN + 0.002*daisy/NN + 0.002*lucy/NN + 0.002*locust/NN + 0.002*saith/VB
2014-04-21 21:13:03,870 : INFO : topic #2 (0.100): 0.007*decorative/JJ + 0.003*balloon/NN + 0.003*blanche/JJ + 0.003*madeira/NN + 0.002*beaver/NN + 0.002*japanese/JJ + 0.002*kensington/NN + 0.002*gretel/NN + 0.002*jean/NN + 0.002*amsterdam/NN
2014-04-21 21:13:03,877 : INFO : topic #3 (0.100): 0.002*marquis/NN + 0.002*luke/NN + 0.002*li/NN + 0.002*mentor/NN + 0.002*pittsburgh/NN + 0.002*ille/NN + 0.002*hermas/NN + 0.001*yahweh/NN + 0.001*fu/NN + 0.001*chr/NN
2014-04-21 21:13:03,884 : INFO : topic #4 (0.100): 0.005*woodstock/NN + 0.004*po/NN + 0.004*buffalo/NN + 0.004*vo/NN + 0.003*acid/NN + 0.003*portugal/JJ + 0.003*anne/JJ + 0.003*japan/NN + 0.003*cant/NN + 0.003*worcester/NN
2014-04-21 21:13:03,891 : INFO : topic #5 (0.100): 0.001*deadwood/NN + 0.000*egan/VB + 0.000*burk/NN + 0.000*kohl/NN + 0.000*montana/VB + 0.000*idaho/RB + 0.000*fagan/JJ + 0.000*mccall/JJ + 0.000*piedmont/VB + 0.000*whitewood/NN
2014-04-21 21:13:03,898 : INFO : topic #6 (0.100): 0.004*loue/NN + 0.003*brest/NN + 0.002*pc/NN + 0.002*lene/NN + 0.002*haue/VB + 0.002*dich/JJ + 0.002*tristibus/NN + 0.002*geh/NN + 0.002*niet/JJ + 0.002*loue/VB
2014-04-21 21:13:03,905 : INFO : topic #7 (0.100): 0.003*zen/NN + 0.002*bessie/NN + 0.002*burrough/NN + 0.002*bertha/NN + 0.002*mcmanus/NN + 0.001*illustrated/NN + 0.001*nelly/RB + 0.001*dolly/RB + 0.001*jinks/NN + 0.001*florence/NN
2014-04-21 21:13:03,912 : INFO : topic #8 (0.100): 0.013*illus/NN + 0.009*philatelic/JJ + 0.004*timbre/NN + 0.004*illus/JJ + 0.003*sm/NN + 0.003*moen/NN + 0.003*bruxelle/NN + 0.003*cassino/NN + 0.002*philatelic/NN + 0.002*baptiste/NN
2014-04-21 21:13:03,919 : INFO : topic #9 (0.100): 0.009*foliis/NN + 0.005*junr/NN + 0.005*syst/NN + 0.004*murr/NN + 0.003*vegetab/NN + 0.003*monogynia/VB + 0.003*caule/NN + 0.002*floribus/NN + 0.002*hort/NN + 0.002*erica/NN

@sangheestyle

Downsize

61,673 documents were used when running revision 14.
file_names = get_filtered_zip_files(self.input, '.zip', 70000, 100, 100000000)

For revision 15, I ran the experiment with 50,000 documents instead of 61,673, because in revision 14 some files were so large that they took too long to process.

file_names = get_filtered_zip_files(self.input, '.zip', 50000, 100000, 1200000)

@sangheestyle

Training model

import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
print(mm)
lda = gensim.models.ldamodel.LdaModel(corpus=mm,
                                      id2word=id2word,
                                      num_topics=50,
                                      update_every=1,
                                      chunksize=1,
                                      passes=10)
lda.print_topics(20)
lda.save('gutenberg_tfidf_lda.model')

@sangheestyle

How long does it take to generate tf-idf and topics with 20,000 documents?

The experiment based on revision 18 (20,000 documents) took about 4,500 minutes (roughly 3 days) to build the dictionary and tf-idf corpus, and another 704 minutes to generate the topics.

import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
print(mm)
lda = gensim.models.ldamodel.LdaModel(corpus=mm,
                                      id2word=id2word,
                                      num_topics=50,
                                      update_every=1,
                                      chunksize=100,
                                      passes=10)
lda.print_topics(20)
lda.save('gutenberg_tfidf_lda.model')

@sangheestyle

Challenge

  • remove punctuation
  • use a lemmatizer or stemmer
  • size of each text
  • use tf or tf-idf
  • processing time (20,000 texts took about 3 days on 4 cores to produce tfidf.mm)
  • number of topics (consider HDP; see the sketch below)
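
One possible direction for the preprocessing and topic-count items above (a sketch only, not what this gist currently does): use gensim's simple_preprocess for tokenization and punctuation removal, stem with NLTK's PorterStemmer instead of utils.lemmatize, and let an HDP model suggest the number of topics. The names below are illustrative and assume NLTK is installed.

# Sketch: alternative preprocessing (punctuation removal + stemming) and an
# HDP model that infers the number of topics instead of fixing it up front.
# Assumes NLTK is available; none of this is used in the gist above.
from gensim.utils import simple_preprocess
from gensim.models import HdpModel
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def tokenize(text):
    # simple_preprocess lowercases, strips punctuation, and drops very short tokens
    return [stemmer.stem(token) for token in simple_preprocess(text)]

# hdp = HdpModel(corpus=mm, id2word=id2word)  # mm and id2word as loaded above
# hdp.print_topics(20)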

@sangheestyle

Deciding the number of topics

Reference

--num-topics [NUMBER] The number of topics to use. The best number depends on what you are looking for in the model. The default (10) will provide a broad overview of the contents of the corpus. The number of topics should depend to some degree on the size of the collection, but 200 to 400 will produce reasonably fine-grained results.

Sample code

import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
print(mm)
mallet_path = '/home/sanghee/bin/mallet-2.0.7/bin/mallet'
lda = gensim.models.LdaMallet(mallet_path,
                              corpus=mm,
                              id2word=id2word,
                              num_topics=100)
lda.print_topics()
lda.save('gutenberg_tfidf_lda_mallet.model')

@sangheestyle

Exploring information via IPython

After processing the corpus and model, we used IPython to explore the topics and generate some final data sets, such as the number of documents per topic and a document-to-topic list.

Here are some code snippets.

# These snippets assume variables from the interactive session: lda is the
# trained model, topic_num is presumably a list of topic ids of interest,
# d presumably maps a topic id to the documents assigned to it, and bt
# presumably maps a document index to its book title.
with open('major_topics.txt', 'wb') as fp:
    for topic in topic_num:
        print >> fp, topic, [item[1] for item in lda.show_topic(topic)]

with open('topics_to_book_titles.txt', 'wb') as fp:
    for topic in topic_num:
        print >> fp, topic, ":", [bt[number] for number in d[topic][:10]]
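
For the per-topic document counts mentioned above, something like the following could work. It is only a sketch and assumes the doc_topic_list built in the earlier script, where each entry is a (topic_id, probability) pair for one document.

# Sketch: count how many documents have each topic as their dominant topic.
# Assumes doc_topic_list from the earlier snippet, i.e. one
# (topic_id, probability) pair per document.
from collections import Counter

topic_counts = Counter(topic_id for topic_id, prob in doc_topic_list)
for topic_id, count in topic_counts.most_common():
    print topic_id, count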
