Created
January 18, 2019 07:07
-
-
Save lotuc/43c5de327131f6132515e69d4dd3366b to your computer and use it in GitHub Desktop.
PDF 文本搜索
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
''' | |
pip install pdfminer3k | |
''' | |
from pdfminer.pdfparser import PDFParser, PDFDocument | |
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed | |
from pdfminer.layout import LAParams, LTTextContainer | |
from pdfminer.converter import PDFPageAggregator | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
def search(pdf_path, key):
    """Return True if ``key`` occurs in the text extracted from a PDF.

    The stripped text of every ``LTTextContainer`` on every page is treated
    as one continuous string (in iteration order), so ``key`` may match
    across container and page boundaries. Scanning stops at the first match.

    Args:
        pdf_path: Path of the PDF file to read.
        key: Substring to look for in the extracted text.

    Returns:
        True if ``key`` is found, False otherwise.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
        OSError: If ``pdf_path`` cannot be opened.
    """
    # The context manager closes the file on every exit path (early return,
    # normal return, or exception).
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # A first occurrence of ``key`` must end inside the newest chunk, so
        # it can reach back at most len(key) - 1 characters into earlier
        # text. Keeping only that tail gives the same answer as searching
        # the whole accumulated text, without re-scanning it every time.
        keep = len(key) - 1
        tail = ''
        for page in doc.get_pages():
            interpreter.process_page(page)
            for item in device.get_result():
                if not isinstance(item, LTTextContainer):
                    continue
                window = tail + item.get_text().strip()
                if key in window:
                    return True
                # Guard keep <= 0: window[-0:] would keep the whole string.
                tail = window[-keep:] if keep > 0 else ''
    return False
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment