@dyerrington
Last active August 29, 2015 14:21
Preprocessing pipeline for documents with Gensim. Easily manage text data to build data frames, run classification, etc.
import os
import codecs

from gensim import corpora, models, utils


class preprocess_corpus(object):
    """Streaming corpus: walks a directory of .txt files, builds a gensim
    dictionary, and yields one bag-of-words vector per document."""

    def __init__(self, top_dir, stopwords_file=None):
        self.files = []
        self.dirs = []
        self.top_dir = top_dir
        self.dictionary = corpora.Dictionary(self.iter_documents(self.top_dir))
        print('dictionary before:', self.dictionary)

        if stopwords_file:
            try:
                with codecs.open(stopwords_file, 'r', 'utf-8') as fp:
                    self.stopwords = set(fp.read().strip().split())
            except IOError:
                raise ValueError("Stopwords file not found: %s" % stopwords_file)
            stop_ids = [self.dictionary.token2id[stopword]
                        for stopword in self.stopwords
                        if stopword in self.dictionary.token2id]
        else:
            stop_ids = []

        # remove stop words and words that appear only once
        once_ids = [token_id for token_id, doc_freq in self.dictionary.dfs.items()
                    if doc_freq == 1]
        self.dictionary.filter_tokens(stop_ids + once_ids)
        self.dictionary.filter_extremes(no_below=1, keep_n=30000)  # check the API docs for pruning params
        self.dictionary.compactify()
        print('dictionary after:', self.dictionary)

    def __iter__(self):
        for tokens in self.iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)

    def iter_documents(self, top_directory):
        """Iterate over all documents, yielding a list of unicode tokens at a time."""
        self.files, self.dirs = [], []  # reset so repeated passes don't duplicate bookkeeping
        for root, dirs, files in os.walk(top_directory):
            for file_name in (f for f in files if f.endswith('.txt')):
                self.files.append(file_name)
                # record the containing folder name; adjust for your directory structure or comment out
                self.dirs.append(os.path.basename(root))
                with codecs.open(os.path.join(root, file_name), 'r', 'utf-8') as doc:
                    document = doc.read()  # read the entire document, as one big string
                yield list(utils.tokenize(document, lower=True))  # or whatever tokenization suits you
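
Because each pass over the corpus re-reads and re-tokenizes every file, one common follow-up is to serialize the bag-of-words vectors to disk once and stream them back on later runs. A minimal sketch using gensim's Matrix Market serializer (the 'data/corpus.mm' path is a placeholder, and corpus is an instance of the class above):

corpora.MmCorpus.serialize('data/corpus.mm', corpus)  # one full pass; writes the vectors in Matrix Market format
mm = corpora.MmCorpus('data/corpus.mm')               # lazily streams the saved vectors without re-tokenizing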
@dyerrington (Author):

Quick example:

corpus_file = 'data/lyrics/albums'
stopwords_file = 'data/stopwords.txt'

corpus = preprocess_corpus(corpus_file, stopwords_file=stopwords_file)
lda = models.ldamodel.LdaModel(corpus=corpus, num_topics=20, id2word=corpus.dictionary)
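
After training, a natural next step is inspecting what the model learned. These are standard gensim LdaModel calls; the num_topics/num_words values and the example tokens are arbitrary:

for topic in lda.print_topics(num_topics=5, num_words=8):
    print(topic)

new_bow = corpus.dictionary.doc2bow(['some', 'example', 'tokens'])
print(lda[new_bow])  # topic distribution for an unseen document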
