Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
#!/usr/bin/env python3
import sys
import json
import shutil
from datetime import datetime
from pathlib import Path
from io import StringIO
import whoosh.fields as f
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
class catch_time:
    """Context manager that measures how long its ``with`` body takes.

    While the body runs, ``t`` holds the ``datetime`` captured on entry.
    After the body finishes, ``t`` is replaced by the elapsed time as a
    ``datetime.timedelta``.

    NOTE(review): the right-hand sides of both assignments were missing in
    the source; ``datetime.now()`` is reconstructed from the file's
    ``from datetime import datetime`` import - confirm against the original.
    """

    def __enter__(self):
        # Remember the start timestamp; __exit__ turns it into a duration.
        self.t = datetime.now()
        return self

    def __exit__(self, type, value, traceback):
        # Returns None, so an exception raised in the body still propagates;
        # the elapsed time is recorded either way.
        self.t = datetime.now() - self.t
# CMD line parsing
# Two positional arguments are required: the file to index and the query.
if len(sys.argv) <= 2:
    # Stop here; otherwise execution would fall through to an IndexError
    # on the sys.argv lookups below.
    print(f'Usage: {sys.argv[0]} <FILE> <QUERY>', file=sys.stderr)
    sys.exit(1)
path = Path(sys.argv[1])
q_string = sys.argv[2]
# Index creation
# One Whoosh document is written per PDF page: the page number is stored so
# it can be read back from search hits, the page text is indexed only.
ix_path = Path('index')
# create_in() needs the target directory to exist already.
ix_path.mkdir(exist_ok=True)
ix = create_in(ix_path, f.Schema(page=f.NUMERIC(stored=True), content=f.TEXT))
writer = ix.writer()
# The file handle is named `pdf_file` rather than `f` so it does not rebind
# the module-level `whoosh.fields as f` alias.
with catch_time() as t, path.open('rb') as pdf_file, StringIO() as out_str:
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, out_str, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for p, page in enumerate(PDFPage.get_pages(pdf_file), 1):
        # The converter appends to one shared buffer; empty it first so each
        # document contains only the text of the current page.
        out_str.seek(0)
        out_str.truncate(0)
        interpreter.process_page(page)
        text = out_str.getvalue()
        writer.add_document(page=p, content=text)
    # Documents are not visible to a searcher until the writer is committed.
    writer.commit()
print(f'Index time : {t.t}', file=sys.stderr)
# Search
# Parse the user's query against the "content" field and list the pages
# that match.
query = QueryParser("content", ix.schema).parse(q_string)
with ix.searcher() as searcher:
    with catch_time() as t:
        # limit=None returns every hit; the default would cap the result at
        # the top 10 and silently drop further matching pages.
        # NOTE(review): the call was missing in the source - confirm the
        # intended limit.
        results = searcher.search(query, limit=None)
    print(f'Search time: {t.t}', file=sys.stderr)
    # Stored fields must be read while the searcher is still open.
    pages = sorted(r['page'] for r in results)
# NOTE(review): the visible source computes `pages` but never emits it;
# printing it as JSON is assumed from the otherwise-unused `json` import.
print(json.dumps(pages))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment