Skip to content

Instantly share code, notes, and snippets.

@lotuc
Created January 18, 2019 07:07
Show Gist options
  • Save lotuc/43c5de327131f6132515e69d4dd3366b to your computer and use it in GitHub Desktop.
Save lotuc/43c5de327131f6132515e69d4dd3366b to your computer and use it in GitHub Desktop.
PDF 文本搜索
#!/usr/bin/env python3
'''
Search the text of a PDF file for a keyword.

Dependency: pip install pdfminer3k
'''
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from pdfminer.layout import LAParams, LTTextContainer
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
def search(pdf_path, key):
    '''
    Report whether ``key`` occurs in the text extracted from a PDF file.

    Pages are processed in order. For every text container found in a
    page layout, its text is stripped of leading/trailing whitespace and
    matched as if all stripped pieces were concatenated with nothing in
    between, so a key may span adjacent containers or pages. The search
    stops at the first match.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.
        key: Substring to look for (exact, case-sensitive match).

    Returns:
        True as soon as ``key`` is found, False if every page has been
        processed without a match.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    '''
    # A match that was not already present in the earlier text must end
    # inside the newest piece, so it can start at most len(key) - 1
    # characters before it. Keeping only that many trailing characters
    # gives the same answer as searching the whole accumulated text,
    # without rescanning it after every container.
    overlap = max(len(key) - 1, 0)

    # The context manager closes the file on every exit path: early
    # return, normal return, and the extraction-not-allowed exception.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        # Parser and document must be linked to each other before
        # the document is initialized.
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        tail = ''
        for page in doc.get_pages():
            interpreter.process_page(page)
            for element in device.get_result():
                if not isinstance(element, LTTextContainer):
                    continue
                window = tail + element.get_text().strip()
                if key in window:
                    return True
                # window[-0:] would be the whole string, hence the guard.
                tail = window[-overlap:] if overlap else ''
    return False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment