Skip to content

Instantly share code, notes, and snippets.

@nezda
Last active August 29, 2015 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nezda/d1d75f6f2ed975ce9dc3 to your computer and use it in GitHub Desktop.
Save nezda/d1d75f6f2ed975ce9dc3 to your computer and use it in GitHub Desktop.
import argparse
from first import first
from deus_lib.scripts import initialize_script
from bias.models import (
LexSession,
)
from lexdb.model import (
PatentCitation,
DocketEntry,
MultipleDocument,
Tag,
)
from lexdb.model.document import (
MultipleDocumentSource,
)
from lexdb.model.ptab import (
PTABDocument,
PTABDocumentSource,
)
# from lexdb.model.itc import (
# ITCDocument,
# ITCDocumentSource,
# )
from sqlalchemy import distinct
import time
from bias.tasks.heavy import (
ocr_pacer_document,
)
from bias.mq import broker_management_api
def main(batch_size=50, max_queued=5, poll_seconds=60, queue_name='ocr_task'):
    """Queue OCR jobs for PTAB documents, throttled by broker queue depth.

    Collects the distinct ids of every ``PTABDocument`` joined to a
    ``PTABDocumentSource`` whose ``backend`` is
    ``'tesseract_whitespace_mods'``, then hands them to
    ``ocr_pacer_document.delay`` in batches. Before each batch the broker
    is polled until the reported size of ``queue_name`` is no greater
    than ``max_queued``.

    Operational notes carried over from the original script: create a
    highcpu-16 worker tagged 'ocr', deploy to it with `ocr.yml`, and run
    with `python ocr_patent_pdfs.py --config=gce`.

    Args:
        batch_size: Number of document ids dispatched per pass.
        max_queued: Dispatching pauses while the queue reports more than
            this many messages.
        poll_seconds: Seconds to sleep between queue polls, and between
            consecutive batches.
        queue_name: Name passed to
            ``broker_management_api.messages_in_queue``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the dispatch
            loop would otherwise never make progress).
    """
    if batch_size < 1:
        raise ValueError('batch_size must be >= 1')

    # To re-OCR MultipleDocument rows instead, build the query from
    # MultipleDocument / MultipleDocumentSource with the same backend filter:
    # query = LexSession.query(
    #     distinct(MultipleDocument.id),
    # )
    # query = query.join(MultipleDocumentSource)
    # query = query.filter(
    #     MultipleDocumentSource.backend == 'tesseract_whitespace_mods')
    query = LexSession.query(
        distinct(PTABDocument.id),
    )
    query = query.join(PTABDocumentSource)
    query = query.filter(
        PTABDocumentSource.backend == 'tesseract_whitespace_mods'
    )

    # Each result row is a one-column tuple; unwrap it to the bare id.
    pending_ids = [first(row) for row in query]

    while pending_ids:
        # Single-argument print() emits the same text on Python 2 and 3.
        print('Remaining docs to ocr: %d' % len(pending_ids))

        # Back off until the workers have drained the queue far enough.
        # NOTE(review): messages_in_queue presumably returns a message
        # count -- confirm against bias.mq.
        while True:
            queued = broker_management_api.messages_in_queue(queue_name)
            print(queued)
            if not queued > max_queued:
                break
            time.sleep(poll_seconds)

        batch = pending_ids[:batch_size]
        pending_ids = pending_ids[batch_size:]
        for doc_id in batch:
            ocr_pacer_document.delay(doc_id)

        # Pause before re-checking the queue, but do not hold the script
        # open for another interval once everything has been dispatched.
        if pending_ids:
            time.sleep(poll_seconds)
if __name__ == '__main__':
    # Command line: a single optional --config name, defaulting to 'macbeth'.
    cli = argparse.ArgumentParser()
    cli.add_argument('--config', default='macbeth')
    args = cli.parse_args()

    initialize_script(args.config)

    # Kept below initialize_script(), matching the original statement order.
    # NOTE(review): presumably these modules need the initialised config at
    # import time -- confirm before hoisting them to the top of the file.
    from bias import celery as bias_app
    from bias.mq import configure_mq_management

    # Point the broker management helper at the Celery app's configuration,
    # then start dispatching.
    configure_mq_management(bias_app.conf)
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment