Preprocessing pipeline for documents with Gensim: easily manage text data, build data frames, run classification, etc.
import codecs
import os

import gensim
from gensim import corpora, models


class preprocess_corpus(object):

    def __init__(self, top_dir, stopwords_file=None):
        self.files = []  # instance attributes, so separate corpora don't share state
        self.dirs = []
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(self.iter_documents(self.top_dir))
        print('dictionary before:', self.dictionary)

        if stopwords_file:
            try:
                # codecs.open already decodes to unicode, so no extra decode step is needed
                with codecs.open(stopwords_file, 'r', 'utf-8') as fp:
                    self.stopwords = set(fp.read().strip().split())
            except (IOError, OSError):
                raise ValueError("Stopwords file not found: %s" % stopwords_file)
            stop_ids = [self.dictionary.token2id[stopword] for stopword in self.stopwords
                        if stopword in self.dictionary.token2id]
        else:
            stop_ids = []

        # remove stop words and words that appear only once
        once_ids = [tokenid for tokenid, docfreq in self.dictionary.dfs.items() if docfreq == 1]
        self.dictionary.filter_tokens(stop_ids + once_ids)
        self.dictionary.filter_extremes(no_below=1, keep_n=30000)  # check API docs for pruning params
        self.dictionary.compactify()
        print('dictionary after:', self.dictionary)

    def __iter__(self):
        for tokens in self.iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)

    def iter_documents(self, top_directory):
        """Iterate over all documents, yielding one document (= list of unicode tokens) at a time."""
        self.files, self.dirs = [], []  # reset so repeated passes don't duplicate entries
        for root, dirs, files in os.walk(top_directory):
            for file in [f for f in files if f.endswith('.txt')]:
                self.files.append(file)
                self.dirs.append(os.path.basename(root))  # label by parent directory; adjust to your layout or comment out
                with codecs.open(os.path.join(root, file), 'r', 'utf-8') as fp:
                    document = fp.read()  # read the entire document, as one big string
                yield gensim.utils.tokenize(document, lower=True)  # or whatever tokenization suits you
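
Because __iter__ re-reads and re-tokenizes every .txt file on each pass, multi-pass models can be slow on large corpora. A minimal sketch using gensim's MmCorpus serialization, with placeholder output paths, that writes the streamed bag-of-words vectors to disk once so later sessions can skip the raw-text pass:

# Assumes `corpus` is a preprocess_corpus instance (see the quick example below).
# 'corpus.mm' and 'corpus.dict' are placeholder output paths.
corpora.MmCorpus.serialize('corpus.mm', corpus)  # streams the corpus once, writes Matrix Market format
corpus.dictionary.save('corpus.dict')

# In a later session: reload without re-tokenizing the raw documents.
bow_corpus = corpora.MmCorpus('corpus.mm')
dictionary = corpora.Dictionary.load('corpus.dict')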
Quick example:

corpus_dir = 'data/lyrics/albums'
stopwords_file = 'data/stopwords.txt'

corpus = preprocess_corpus(corpus_dir, stopwords_file=stopwords_file)
lda = models.ldamodel.LdaModel(corpus=corpus, num_topics=20, id2word=corpus.dictionary)
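
To get the data frames mentioned in the description, one option is to densify the streamed bag-of-words vectors with gensim's matutils helpers and hand the result to pandas for classification. A rough sketch; the 'label' column built from corpus.dirs is an assumption about your directory layout:

import pandas as pd
from gensim import matutils

# corpus2dense returns a terms x documents matrix, so transpose to documents x terms.
num_terms = len(corpus.dictionary)
dense = matutils.corpus2dense(corpus, num_terms=num_terms).T

df = pd.DataFrame(dense, columns=[corpus.dictionary[i] for i in range(num_terms)])
df['label'] = corpus.dirs  # assumption: one label per document, taken from its parent directory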