Created
March 3, 2015 20:28
-
-
Save j0hn/755ba8d97c6f8bef5e1f to your computer and use it in GitHub Desktop.
Parallel processing on iepy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Corpus preprocessing script | |
Usage: | |
preprocess.py | |
preprocess.py [<modulo> <modulo_result>] [options] | |
preprocess.py --increment-ner | |
preprocess.py -h | --help | --version | |
Options: | |
-h --help Show this screen | |
--increment-ner Re run NER and Gazetter for every document. If a document lacked any of the previous steps, will be preprocessed entirely. | |
--version Version number | |
""" | |
import logging | |
from docopt import docopt | |
import iepy | |
iepy.setup(__file__) | |
from iepy.data.db import DocumentManager | |
from iepy.preprocess.stanford_preprocess import StanfordPreprocess | |
from iepy.preprocess.pipeline import PreProcessPipeline | |
from iepy.preprocess.segmenter import SyntacticSegmenterRunner | |
class ParallelDocManager(DocumentManager):
    """Document manager able to narrow a queryset to one worker's share.

    Splitting is done on the row ``id``: each worker keeps the rows whose
    ``id`` leaves a given remainder when divided by the number of workers.
    """

    def mines_of(self, qset, number_of_processors, my_id):
        """Return the rows of ``qset`` satisfying ``id % K = N``.

        Args:
            qset: queryset to filter; it must support
                ``extra(where=..., params=...)``.
            number_of_processors: the divisor ``K``.
            my_id: the remainder ``N`` selecting this worker's rows.

        Returns:
            Whatever ``qset.extra(...)`` returns for the added WHERE clause.

        Raises:
            ValueError: if either number cannot be converted with ``int()``.
            TypeError: if either number is of a type ``int()`` rejects.
        """
        # Coerce up front so numeric strings keep working and anything
        # non-numeric fails here instead of reaching the SQL layer.
        divisor = int(number_of_processors)
        remainder = int(my_id)
        # The values are handed over as query parameters rather than being
        # formatted into the SQL text. '%%' is the escaped literal percent
        # sign (the SQL modulo operator); each '%s' is a placeholder.
        return qset.extra(
            where=['id %% %s = %s'],
            params=[divisor, remainder],
        )
if __name__ == '__main__':
    # Logging: the 'preprocess' logger and the root configuration are both
    # set to INFO, and records are printed as the bare message.
    logger = logging.getLogger(u'preprocess')
    logger.setLevel(logging.INFO)
    logging.basicConfig(level=logging.INFO, format='%(message)s')

    # The module docstring is the command-line specification.
    arguments = docopt(__doc__, version=iepy.__version__)

    documents = DocumentManager().get_raw_documents()

    # When <modulo> is given, keep only the documents selected by
    # ParallelDocManager.mines_of for (<modulo>, <modulo_result>).
    shard_count = arguments.get("<modulo>")
    if shard_count:
        shard_count = int(shard_count)
        shard_index = int(arguments["<modulo_result>"])
        documents = ParallelDocManager().mines_of(
            documents, shard_count, shard_index
        )

    rerun_ner = arguments['--increment-ner']
    steps = [
        StanfordPreprocess(rerun_ner),
        SyntacticSegmenterRunner(increment=True),
    ]
    PreProcessPipeline(steps, documents).process_everything()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment