Skip to content

Instantly share code, notes, and snippets.

@j0hn
Created March 3, 2015 20:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save j0hn/755ba8d97c6f8bef5e1f to your computer and use it in GitHub Desktop.
Save j0hn/755ba8d97c6f8bef5e1f to your computer and use it in GitHub Desktop.
Parallel processing on iepy
"""
Corpus preprocessing script

Usage:
    preprocess.py
    preprocess.py [<modulo> <modulo_result>] [options]
    preprocess.py --increment-ner
    preprocess.py -h | --help | --version

Options:
  -h --help             Show this screen
  --increment-ner       Re-run NER and Gazetteer for every document. If a document lacked any of the previous steps, it will be preprocessed entirely.
  --version             Version number
"""
import logging
from docopt import docopt
import iepy
iepy.setup(__file__)
from iepy.data.db import DocumentManager
from iepy.preprocess.stanford_preprocess import StanfordPreprocess
from iepy.preprocess.pipeline import PreProcessPipeline
from iepy.preprocess.segmenter import SyntacticSegmenterRunner
class ParallelDocManager(DocumentManager):
    """DocumentManager that can narrow a queryset to one worker's share.

    Rows are split by ``id`` modulo the number of workers, so each
    ``(number_of_processors, my_id)`` pair selects a different slice.
    """

    def mines_of(self, qset, number_of_processors, my_id):
        """Return the rows of *qset* where ``id % number_of_processors = my_id``.

        Args:
            qset: queryset-like object exposing ``extra(where=..., params=...)``.
            number_of_processors: total number of workers, used as the modulus.
                Must be convertible to a positive integer.
            my_id: remainder that identifies this worker's slice. Must be
                convertible to an integer.

        Returns:
            Whatever ``qset.extra`` returns for the modulo filter.

        Raises:
            ValueError: if an argument is not integer-like, or if
                ``number_of_processors`` is not positive.
        """
        modulus = int(number_of_processors)
        remainder = int(my_id)
        if modulus <= 0:
            raise ValueError(
                'number_of_processors must be a positive integer, got %r'
                % (number_of_processors,)
            )
        # '%%' is the escaped SQL modulo operator; the two '%s' placeholders
        # are bound through ``params`` instead of being spliced into the SQL.
        # NOTE(review): 'id' is unqualified - presumably fine while the
        # queryset involves a single table; confirm if joins are added.
        return qset.extra(where=['id %% %s = %s'], params=[modulus, remainder])
if __name__ == '__main__':
    # Configure logging: plain messages at INFO level.
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    # Parse the command line against the usage text in the module docstring.
    opts = docopt(__doc__, version=iepy.__version__)

    doc_manager = DocumentManager()
    docs = doc_manager.get_raw_documents()

    # Optional sharding: keep only documents whose id % modulo == modulo_result.
    modulo = opts.get("<modulo>")
    if modulo:
        modulo = int(modulo)
        modulo_result = int(opts["<modulo_result>"])
        # A zero modulus is a modulo-by-zero in SQL, and a remainder outside
        # [0, modulo) would silently select no documents: fail loudly instead.
        if modulo <= 0 or not 0 <= modulo_result < modulo:
            raise SystemExit(
                'Invalid arguments: <modulo> must be a positive integer and '
                '<modulo_result> must satisfy 0 <= modulo_result < modulo '
                '(got modulo=%d, modulo_result=%d)' % (modulo, modulo_result)
            )
        docs = ParallelDocManager().mines_of(docs, modulo, modulo_result)

    # Flag forwarded to the Stanford step (see --increment-ner in the usage).
    increment_ner = opts['--increment-ner']
    pipeline = PreProcessPipeline([
        StanfordPreprocess(increment_ner),
        SyntacticSegmenterRunner(increment=True),
    ], docs)
    pipeline.process_everything()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment