Skip to content

Instantly share code, notes, and snippets.

@flying-sheep
Created May 8, 2020 18:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save flying-sheep/27f99747f85abb20bab7dc732abe3f6a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import sys
import json
import shutil
from datetime import datetime
from pathlib import Path
from io import StringIO
import whoosh.fields as f
from whoosh.index import create_in
from whoosh.qparser import QueryParser
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
class catch_time:
    """Context manager that measures wall-clock time.

    On entry, ``t`` holds the start timestamp (a ``datetime``); on exit it
    is replaced by the elapsed duration (a ``timedelta``), so the caller
    reads ``t.t`` after the ``with`` block to get the measurement.
    """

    def __enter__(self):
        self.t = datetime.now()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Overwrite the start timestamp with the elapsed duration.
        self.t = datetime.now() - self.t
# --- Command-line parsing ---------------------------------------------------
# Require a PDF path and a query string; bail out with usage text otherwise.
if len(sys.argv) <= 2:
    print('Usage: ./pdf_search.py <FILE> <QUERY>', file=sys.stderr)
    sys.exit(1)
path, q_string = Path(sys.argv[1]), sys.argv[2]
# --- Index creation ---------------------------------------------------------
# Rebuild the whoosh index from scratch on every run: one document per PDF
# page, storing the page number and indexing the page text.
ix_path = Path('index')
# fix: ignore_errors so the first run (no 'index' dir yet) doesn't crash
# with FileNotFoundError.
shutil.rmtree(ix_path, ignore_errors=True)
ix_path.mkdir(exist_ok=True)
ix = create_in(ix_path, f.Schema(page=f.NUMERIC(stored=True), content=f.TEXT))
writer = ix.writer()
# fix: the file handle is named pdf_file instead of f, which shadowed the
# `whoosh.fields as f` module alias.
with catch_time() as t, path.open('rb') as pdf_file, StringIO() as out_str:
    # One resource manager / text converter pair is reused for all pages.
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, out_str, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for p, page in enumerate(PDFPage.get_pages(pdf_file), 1):
        interpreter.process_page(page)
        text = out_str.getvalue()
        # Reset the shared buffer so each document contains one page only.
        out_str.truncate(0)
        out_str.seek(0)
        writer.add_document(page=p, content=text)
    device.close()  # fix: release converter resources explicitly
    writer.commit()
print(f'Index time : {t.t}', file=sys.stderr)
# --- Search -----------------------------------------------------------------
# Parse the query against the content field, run it, and print the sorted
# list of matching page numbers as JSON on stdout (timing goes to stderr).
parser = QueryParser("content", ix.schema)
query = parser.parse(q_string)
with ix.searcher() as searcher:
    with catch_time() as t:
        results = searcher.search(query)
    print(f'Search time: {t.t}', file=sys.stderr)
    pages = sorted(hit['page'] for hit in results)
    print(json.dumps(pages))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment