Preprocessing pipeline for documents with Gensim: easily manage text data, build data frames, run classification, etc.
import codecs
import os

import gensim
from gensim import corpora, models


class preprocess_corpus(object):

    def __init__(self, top_dir, stopwords_file=None):
        self.files = []  # instance attributes, so separate corpora don't share state
        self.dirs = []
        self.top_dir = top_dir
        self.dictionary = gensim.corpora.Dictionary(self.iter_documents(self.top_dir))
        print('dictionary before:', self.dictionary)

        if stopwords_file:
            try:
                # codecs.open already decodes to unicode, so no extra decode step is needed
                with codecs.open(stopwords_file, 'r', 'utf-8') as fp:
                    self.stopwords = set(fp.read().strip().split())
            except (IOError, OSError):
                raise ValueError("Stopwords file not found: %s" % stopwords_file)
            stop_ids = [self.dictionary.token2id[stopword] for stopword in self.stopwords
                        if stopword in self.dictionary.token2id]
        else:
            stop_ids = []

        # remove stop words and words that appear only once
        once_ids = [tokenid for tokenid, docfreq in self.dictionary.dfs.items() if docfreq == 1]
        self.dictionary.filter_tokens(stop_ids + once_ids)
        self.dictionary.filter_extremes(no_below=1, keep_n=30000)  # check API docs for pruning params
        self.dictionary.compactify()
        print('dictionary after:', self.dictionary)

    def __iter__(self):
        for tokens in self.iter_documents(self.top_dir):
            yield self.dictionary.doc2bow(tokens)

    def iter_documents(self, top_directory):
        """Iterate over all documents, yielding one document (= list of unicode tokens) at a time."""
        self.files, self.dirs = [], []  # reset so repeated passes don't duplicate entries
        for root, dirs, files in os.walk(top_directory):
            for file in [f for f in files if f.endswith('.txt')]:
                self.files.append(file)
                self.dirs.append(os.path.basename(root))  # label by parent directory; adjust to your layout or comment out
                with codecs.open(os.path.join(root, file), 'r', 'utf-8') as fp:
                    document = fp.read()  # read the entire document, as one big string
                yield gensim.utils.tokenize(document, lower=True)  # or whatever tokenization suits you
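
Because __iter__ re-reads and re-tokenizes every .txt file on each pass, multi-pass models can be slow on large corpora. A minimal sketch using gensim's MmCorpus serialization, with placeholder output paths, that writes the streamed bag-of-words vectors to disk once so later sessions can skip the raw-text pass:

# Assumes `corpus` is a preprocess_corpus instance (see the quick example below).
# 'corpus.mm' and 'corpus.dict' are placeholder output paths.
corpora.MmCorpus.serialize('corpus.mm', corpus)  # streams the corpus once, writes Matrix Market format
corpus.dictionary.save('corpus.dict')

# In a later session: reload without re-tokenizing the raw documents.
bow_corpus = corpora.MmCorpus('corpus.mm')
dictionary = corpora.Dictionary.load('corpus.dict')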
Quick example:

corpus_dir = 'data/lyrics/albums'
stopwords_file = 'data/stopwords.txt'

corpus = preprocess_corpus(corpus_dir, stopwords_file=stopwords_file)
lda = models.ldamodel.LdaModel(corpus=corpus, num_topics=20, id2word=corpus.dictionary)
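
To get the data frames mentioned in the description, one option is to densify the streamed bag-of-words vectors with gensim's matutils helpers and hand the result to pandas for classification. A rough sketch; the 'label' column built from corpus.dirs is an assumption about your directory layout:

import pandas as pd
from gensim import matutils

# corpus2dense returns a terms x documents matrix, so transpose to documents x terms.
num_terms = len(corpus.dictionary)
dense = matutils.corpus2dense(corpus, num_terms=num_terms).T

df = pd.DataFrame(dense, columns=[corpus.dictionary[i] for i in range(num_terms)])
df['label'] = corpus.dirs  # assumption: one label per document, taken from its parent directory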