@sangheestyle
Last active October 19, 2017 18:39
Analyzing books in Project Gutenberg with LDA
import logging
import os
import sys
import zipfile
import multiprocessing
from subprocess import call

from gensim.corpora.textcorpus import TextCorpus
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel
from gensim import utils
def get_list_of_files(root=None, file_ext=None):
    """
    a. traverse directories
    b. make a list of file paths which have the given file extension
    c. return the list
    """
    filename_list = []
    for root, dirs, files in os.walk(root):
        for f in files:
            if f.endswith(file_ext):
                filename_list.append(os.path.join(root, f))
    return filename_list
def get_zip_file_size(file_path=None):
    file_name = os.path.basename(file_path)
    file_name, file_extension = os.path.splitext(file_name)
    return_value = None
    try:
        with zipfile.ZipFile(file_path, 'r') as zf:
            for i in zf.infolist():
                if i.filename == file_name + ".txt":
                    return_value = i.file_size
    except:
        e = sys.exc_info()[0]
        print "ERROR:", e, file_path
    return return_value
def get_filtered_zip_files(root=None,
                           file_ext=None,
                           number_of_files=None,
                           min_text_size=None,
                           max_text_size=None):
    files_list = get_list_of_files(root, file_ext)
    filtered_list = []
    for file_path in files_list:
        if len(filtered_list) >= number_of_files:
            break
        file_size = get_zip_file_size(file_path)
        if file_size > min_text_size and file_size < max_text_size:
            filtered_list.append(file_path)
    print ">>> number of files:", len(filtered_list)
    return filtered_list
def read_zip_file(file_path=None):
    """
    a. read a zip file including a text file
    b. return the text
    """
    try:
        # FIXME: work around
        call(['unzip', '-o', file_path])
    except:
        e = sys.exc_info()[0]
        print ">>> ERROR:", e
    file_name = os.path.basename(file_path)
    file_name, ext = os.path.splitext(file_name)
    unzipped_text_file_name = file_name + ".txt"
    with open(unzipped_text_file_name, 'rb') as fp:
        text = fp.read()
    os.remove(unzipped_text_file_name)
    return text
def process_text(filename):
    text = read_zip_file(filename)
    if text is not None:
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.lemmatize(text)
    else:
        text = []
    return [filename, text]
class GutenbergCorpus(TextCorpus):
    def __init__(self, input=None):
        self.processes = max(1, multiprocessing.cpu_count())
        self.iteration = 0
        self.filenames = []
        super(GutenbergCorpus, self).__init__(input)

    def get_texts(self):
        self.iteration += 1
        pool = multiprocessing.Pool(self.processes)
        file_names = get_filtered_zip_files(self.input, '.zip', 20000, 100000, 700000)
        for index, item in enumerate(pool.imap(process_text, file_names)):
            print ">> processing", index + 1, "/", len(file_names)
            if self.iteration >= 2:
                self.filenames.append(item[0])
            yield item[1]
DEFAULT_DICT_SIZE = 100000

if __name__ == '__main__':
    import sys
    import logging
    import gensim
    import bz2

    root = '../www.gutenberg.lib.md.us'
    prefix = 'gutenberg'
    gutenberg = GutenbergCorpus(root)
    """
    gutenberg.dictionary.filter_extremes(no_below=10,
                                         no_above=0.2,
                                         keep_n=DEFAULT_DICT_SIZE)
    """
    MmCorpus.serialize(prefix + '_bow.mm', gutenberg, progress_cnt=10000)
    gutenberg.dictionary.save_as_text(prefix + '_wordids.txt.bz2')
    with open('gutenberg_filename.txt', 'wb') as f:
        for filename in gutenberg.filenames:
            print >> f, filename
    dictionary = Dictionary.load_from_text(prefix + '_wordids.txt.bz2')
    del gutenberg
    mm = MmCorpus(prefix + '_bow.mm')
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize(prefix + '_tfidf.mm', tfidf[mm], progress_cnt=10000)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
    mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
    lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=20, chunksize=100)
    lda.save('gutenberg_idf.model')
import gensim

def get_top_most_topic(topic_rate_pair):
    trp = sorted(topic_rate_pair, key=lambda item: item[1], reverse=True)
    top_most_topic = trp[0]
    return top_most_topic

mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
lda = gensim.models.ldamodel.LdaModel.load('gutenberg_tfidf_lda.model')
doc2topic = lda[mm]
doc_topic_list = []
for topic_rate_pair in doc2topic:
    top_most_topic = get_top_most_topic(topic_rate_pair)
    doc_topic_list.append(top_most_topic)
    print top_most_topic[0]
import logging, gensim, bz2

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
lda = gensim.models.ldamodel.LdaModel(corpus=mm,
                                      id2word=id2word,
                                      num_topics=50,
                                      update_every=1,
                                      chunksize=100,
                                      passes=10)
lda.print_topics()
lda.save('gutenberg_tfidf_lda.model')
@sangheestyle

I have downloaded 62,869 files containing books written in English in plain-text format.

sanghee@stylepoint-linux:~/dev/gutenberg$ time wget -w 2 -m -H "http://www.gutenberg.org/robot/harvest?filetypes[]=txt&langs[]=en"

FINISHED --2014-04-17 05:18:24--
Total wall clock time: 1d 14h 18m 53s
Downloaded: 62869 files, 9.1G in 2h 12m 39s (1.17 MB/s)

real    2298m53.045s
user    0m48.768s
sys 2m47.387s

@sangheestyle

Challenge

a. The format of the Gutenberg texts makes it hard to slice out only the book contents without the Project Gutenberg boilerplate (e.g., the legal notice). That boilerplate degrades the quality of the generated topics, so I need to find a way to strip it before feeding the texts into the LDA model (see the sketch below).

b. How many topics are appropriate for LDA? (e.g., 100)
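
One possible way to handle (a), shown only as a minimal sketch rather than the method used in this gist: cut everything outside the standard "*** START OF ..." / "*** END OF ..." marker lines. The helper name is hypothetical and the marker wording varies between books, so this is just a rough heuristic.

# Sketch only: strip the Project Gutenberg header/footer using the usual
# "*** START OF ..." and "*** END OF ..." marker lines. Marker wording
# varies between books, so treat this as a heuristic, not a guarantee.
def strip_gutenberg_boilerplate(text):
    lines = text.splitlines()
    start, end = 0, len(lines)
    for i, line in enumerate(lines):
        if "*** START OF" in line.upper():
            start = i + 1
        elif "*** END OF" in line.upper():
            end = i
            break
    return "\n".join(lines[start:end])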

@sangheestyle

After generating the dictionary and tf-idf corpus, run the following code to train the LDA model.

import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
print(mm)
lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=10000, passes=1)
lda.print_topics(20)

@sangheestyle

Topics (n = 10, doc = 20,000)

2014-04-21 21:13:03,857 : INFO : topic #0 (0.100): 0.000*hazael/NN + 0.000*naaman/NN + 0.000*elisha/VB + 0.000*hamath/NN + 0.000*rimmon/NN + 0.000*rimmon/JJ + 0.000*gioconda/NN + 0.000*jordan/VB + 0.000*puccini/NN + 0.000*joash/NN
2014-04-21 21:13:03,863 : INFO : topic #1 (0.100): 0.007*lotus/NN + 0.006*nightingale/NN + 0.005*annabel/NN + 0.004*rupert/NN + 0.003*satan/NN + 0.003*jerusalem/NN + 0.002*daisy/NN + 0.002*lucy/NN + 0.002*locust/NN + 0.002*saith/VB
2014-04-21 21:13:03,870 : INFO : topic #2 (0.100): 0.007*decorative/JJ + 0.003*balloon/NN + 0.003*blanche/JJ + 0.003*madeira/NN + 0.002*beaver/NN + 0.002*japanese/JJ + 0.002*kensington/NN + 0.002*gretel/NN + 0.002*jean/NN + 0.002*amsterdam/NN
2014-04-21 21:13:03,877 : INFO : topic #3 (0.100): 0.002*marquis/NN + 0.002*luke/NN + 0.002*li/NN + 0.002*mentor/NN + 0.002*pittsburgh/NN + 0.002*ille/NN + 0.002*hermas/NN + 0.001*yahweh/NN + 0.001*fu/NN + 0.001*chr/NN
2014-04-21 21:13:03,884 : INFO : topic #4 (0.100): 0.005*woodstock/NN + 0.004*po/NN + 0.004*buffalo/NN + 0.004*vo/NN + 0.003*acid/NN + 0.003*portugal/JJ + 0.003*anne/JJ + 0.003*japan/NN + 0.003*cant/NN + 0.003*worcester/NN
2014-04-21 21:13:03,891 : INFO : topic #5 (0.100): 0.001*deadwood/NN + 0.000*egan/VB + 0.000*burk/NN + 0.000*kohl/NN + 0.000*montana/VB + 0.000*idaho/RB + 0.000*fagan/JJ + 0.000*mccall/JJ + 0.000*piedmont/VB + 0.000*whitewood/NN
2014-04-21 21:13:03,898 : INFO : topic #6 (0.100): 0.004*loue/NN + 0.003*brest/NN + 0.002*pc/NN + 0.002*lene/NN + 0.002*haue/VB + 0.002*dich/JJ + 0.002*tristibus/NN + 0.002*geh/NN + 0.002*niet/JJ + 0.002*loue/VB
2014-04-21 21:13:03,905 : INFO : topic #7 (0.100): 0.003*zen/NN + 0.002*bessie/NN + 0.002*burrough/NN + 0.002*bertha/NN + 0.002*mcmanus/NN + 0.001*illustrated/NN + 0.001*nelly/RB + 0.001*dolly/RB + 0.001*jinks/NN + 0.001*florence/NN
2014-04-21 21:13:03,912 : INFO : topic #8 (0.100): 0.013*illus/NN + 0.009*philatelic/JJ + 0.004*timbre/NN + 0.004*illus/JJ + 0.003*sm/NN + 0.003*moen/NN + 0.003*bruxelle/NN + 0.003*cassino/NN + 0.002*philatelic/NN + 0.002*baptiste/NN
2014-04-21 21:13:03,919 : INFO : topic #9 (0.100): 0.009*foliis/NN + 0.005*junr/NN + 0.005*syst/NN + 0.004*murr/NN + 0.003*vegetab/NN + 0.003*monogynia/VB + 0.003*caule/NN + 0.002*floribus/NN + 0.002*hort/NN + 0.002*erica/NN

@sangheestyle

Downsize

61,673 documents were used when running revision 14.
file_names = get_filtered_zip_files(self.input, '.zip', 70000, 100, 100000000)

For revision 15, I ran the experiment with 50,000 documents instead of 61,673, because in revision 14 some files were so large that they took too long to process.

file_names = get_filtered_zip_files(self.input, '.zip', 50000, 100000, 1200000)

@sangheestyle

Training model

import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
print(mm)
lda = gensim.models.ldamodel.LdaModel(corpus=mm,
                                      id2word=id2word,
                                      num_topics=50,
                                      update_every=1,
                                      chunksize=1,
                                      passes=10)
lda.print_topics(20)
lda.save('gutenberg_tfidf_lda.model')

@sangheestyle

How long does it take to generate tf-idf and topics with 20,000 documents?

The experiment based on revision 18 (20,000 documents) took about 4,500 minutes (roughly 3 days) to build the dictionary and tf-idf corpus, and another 704 minutes to generate the topics.

import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
print(mm)
lda = gensim.models.ldamodel.LdaModel(corpus=mm,
                                      id2word=id2word,
                                      num_topics=50,
                                      update_every=1,
                                      chunksize=100,
                                      passes=10)
lda.print_topics(20)
lda.save('gutenberg_tfidf_lda.model')

@sangheestyle

Challenge

  • remove punctuation
  • use a lemmatizer or stemmer
  • size of each text
  • use tf or tf-idf
  • processing time (20,000 texts took about 3 days on 4 cores to produce tfidf.mm)
  • number of topics (consider HDP; see the sketch below)
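
One possible direction for the preprocessing and topic-count items above (a sketch only, not what this gist currently does): use gensim's simple_preprocess for tokenization and punctuation removal, stem with NLTK's PorterStemmer instead of utils.lemmatize, and let an HDP model suggest the number of topics. The names below are illustrative and assume NLTK is installed.

# Sketch: alternative preprocessing (punctuation removal + stemming) and an
# HDP model that infers the number of topics instead of fixing it up front.
# Assumes NLTK is available; none of this is used in the gist above.
from gensim.utils import simple_preprocess
from gensim.models import HdpModel
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def tokenize(text):
    # simple_preprocess lowercases, strips punctuation, and drops very short tokens
    return [stemmer.stem(token) for token in simple_preprocess(text)]

# hdp = HdpModel(corpus=mm, id2word=id2word)  # mm and id2word as loaded above
# hdp.print_topics(20)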

@sangheestyle

Deciding the number of topics

Reference

--num-topics [NUMBER] The number of topics to use. The best number depends on what you are looking for in the model. The default (10) will provide a broad overview of the contents of the corpus. The number of topics should depend to some degree on the size of the collection, but 200 to 400 will produce reasonably fine-grained results.

Sample code

import logging, gensim, bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
id2word = gensim.corpora.Dictionary.load_from_text('gutenberg_wordids.txt.bz2')
mm = gensim.corpora.MmCorpus('gutenberg_tfidf.mm')
print(mm)
mallet_path = '/home/sanghee/bin/mallet-2.0.7/bin/mallet'
lda = gensim.models.LdaMallet(mallet_path,
                              corpus=mm,
                              id2word=id2word,
                              num_topics=100)
lda.print_topics()
lda.save('gutenberg_tfidf_lda_mallet.model')

@sangheestyle

Exploring information via IPython

After processing the corpus and model, we used IPython to explore the topics and generate some final data sets, such as the number of documents per topic and a document-to-topic list.

Here are some code snippets.

# These snippets assume variables from the interactive session: lda is the
# trained model, topic_num is presumably a list of topic ids of interest,
# d presumably maps a topic id to the documents assigned to it, and bt
# presumably maps a document index to its book title.
with open('major_topics.txt', 'wb') as fp:
    for topic in topic_num:
        print >> fp, topic, [item[1] for item in lda.show_topic(topic)]

with open('topics_to_book_titles.txt', 'wb') as fp:
    for topic in topic_num:
        print >> fp, topic, ":", [bt[number] for number in d[topic][:10]]
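
For the per-topic document counts mentioned above, something like the following could work. It is only a sketch and assumes the doc_topic_list built in the earlier script, where each entry is a (topic_id, probability) pair for one document.

# Sketch: count how many documents have each topic as their dominant topic.
# Assumes doc_topic_list from the earlier snippet, i.e. one
# (topic_id, probability) pair per document.
from collections import Counter

topic_counts = Counter(topic_id for topic_id, prob in doc_topic_list)
for topic_id, count in topic_counts.most_common():
    print topic_id, count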
