Skip to content

Instantly share code, notes, and snippets.

@joelhsmith
Last active April 5, 2024 19:57
Show Gist options
  • Save joelhsmith/5e6ec7ee70ab4b89d7bc5700e9e07fde to your computer and use it in GitHub Desktop.
Save joelhsmith/5e6ec7ee70ab4b89d7bc5700e9e07fde to your computer and use it in GitHub Desktop.
Python script to check whether a PDF has tags. It exports the tagged content to the console and searches that output for traditional Acrobat tags.
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.pdfdevice import TagExtractor
from pdfminer3.pdfpage import PDFPage
from io import BytesIO
def convert_pdf(path, password=''):
    """Print the tag stream of a PDF and report whether it looks tagged.

    The PDF at ``path`` is fed through ``TagExtractor`` (``maxpages=1`` is
    passed to ``PDFPage.get_pages``). Whatever the extractor wrote is
    printed, followed by the line ``tagged`` when any of the known Acrobat
    tag markers occurs in that output.

    Args:
        path: Filesystem path of the PDF to inspect.
        password: Password forwarded to ``PDFPage.get_pages``; empty by
            default.

    Returns:
        None. All results are reported on standard output.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = None
    try:
        # Prefer a UTF-8 extractor and fall back to ASCII only when the
        # UTF-8 one cannot be built. Previously the ASCII attempt always ran
        # and silently replaced the UTF-8 device.
        try:
            device = TagExtractor(rsrcmgr, retstr, codec='utf-8')
        except Exception:
            print('Not utf-8.')
            try:
                device = TagExtractor(rsrcmgr, retstr, codec='ascii')
            except Exception as ex:
                print('Not ascii.')
                print(ex)
                # No usable device: stop here rather than hitting a
                # NameError further down.
                return

        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the file even if page processing fails.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(
                fp,
                set(),
                maxpages=1,
                password=password,
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)

        # Read the buffer before the device and buffer are closed below.
        contents = retstr.getvalue().decode()
    finally:
        if device is not None:
            device.close()
        retstr.close()

    print(contents)

    # check if common proprietary Acrobat tags are in the response
    # NOTE(review): the markers embed a bytes repr (b'...'); presumably this
    # is how the extractor renders tag names -- confirm against real output.
    tags = ["<b\'Part\'", "</b\'Sect\'", "</b\'Art\'", "<b'Content'", "<b\'Artifact\'"]
    if any(tag in contents for tag in tags):
        print('tagged')
if __name__ == '__main__':
    import sys

    # Give a usage hint instead of an IndexError traceback when the PDF path
    # is missing; sys.exit with a string prints it to stderr and exits
    # with a non-zero status.
    if len(sys.argv) < 2:
        sys.exit('Usage: python {} <file.pdf>'.format(sys.argv[0]))
    convert_pdf(sys.argv[1])
# Example usage:
# python detect_pdf_tags.py junk-file.pdf
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment