Skip to content

Instantly share code, notes, and snippets.

@andjc
Forked from joelhsmith/detect_tagged_pdf.py
Created May 21, 2023 06:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save andjc/e2338c7f2b4a9656a7cae2fa9e8484df to your computer and use it in GitHub Desktop.
Save andjc/e2338c7f2b4a9656a7cae2fa9e8484df to your computer and use it in GitHub Desktop.
Python script to check whether a PDF has tags. It exports the tagged content to the console and searches it for traditional Acrobat tags.
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.pdfdevice import TagExtractor
from pdfminer3.pdfpage import PDFPage
from io import BytesIO
def convert_pdf(path, password='', maxpages=1):
    """Dump the tag stream of a PDF and report whether it looks tagged.

    The PDF at *path* is run through pdfminer's ``TagExtractor``; the
    extracted tag text is printed to stdout, then searched for a handful of
    marker strings for common Acrobat structure tags.  ``'tagged'`` is
    printed when at least one marker is present.

    Args:
        path: Filesystem path of the PDF to inspect.
        password: Password forwarded to ``PDFPage.get_pages``.
        maxpages: Page limit forwarded to ``PDFPage.get_pages``.  The
            default of 1 keeps the original behaviour of only looking at
            the first page.

    Returns:
        True if any marker was found in the extracted text, otherwise False.

    Raises:
        OSError: If *path* cannot be opened.
        Exception: Whatever ``TagExtractor`` raised if it could not be
            constructed with either codec.
    """
    rsrcmgr = PDFResourceManager()
    with BytesIO() as retstr:
        device = _make_tag_extractor(rsrcmgr, retstr)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The context manager guarantees the file is closed even when
            # page processing raises part-way through.
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(
                    fp,
                    set(),  # empty set: no explicit page-number filter
                    maxpages=maxpages,
                    password=password,
                    caching=True,
                    check_extractable=True,
                )
                for page in pages:
                    interpreter.process_page(page)
            # Read the buffer before the device and the buffer are closed.
            contents = retstr.getvalue().decode()
        finally:
            device.close()

    print(contents)

    # Marker strings for common proprietary Acrobat tags, spelled exactly as
    # they are searched for in the extractor output.
    markers = (
        "<b'Part'",
        "</b'Sect'",
        "</b'Art'",
        "<b'Content'",
        "<b'Artifact'",
    )
    tagged = any(marker in contents for marker in markers)
    if tagged:
        print('tagged')
    return tagged


def _make_tag_extractor(rsrcmgr, outfp):
    """Build a TagExtractor writing to *outfp*, trying UTF-8 then ASCII."""
    try:
        return TagExtractor(rsrcmgr, outfp, codec='utf-8')
    except Exception:
        print('Not utf-8.')
    try:
        return TagExtractor(rsrcmgr, outfp, codec='ascii')
    except Exception:
        print('Not ascii.')
        # Without a device nothing further can work; surface the real error
        # instead of failing later on an unbound variable.
        raise
if __name__ == '__main__':
    import sys

    # Script entry point: the first command-line argument is the PDF path.
    # Fail with a usage message rather than an IndexError traceback when it
    # is missing; any extra arguments are ignored.
    if len(sys.argv) < 2:
        sys.exit('usage: python {} FILE.pdf'.format(sys.argv[0]))
    convert_pdf(sys.argv[1])
# Example usage:
# python detect_tagged_pdf.py junk-file.pdf
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment