Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save pasenor/a0d6307976d4f5e969972d15e25682a3 to your computer and use it in GitHub Desktop.
Save pasenor/a0d6307976d4f5e969972d15e25682a3 to your computer and use it in GitHub Desktop.
find PDF font info with PyPDF2, example code
from PyPDF2 import PdfFileReader
from pprint import pprint
def walk(obj, fnt, emb):
    """Collect used and embedded font names from a PDF object tree.

    A dictionary containing the key '/BaseFont' names a font that is used in
    the document.  A dictionary containing '/FontName' together with one of
    '/FontFile', '/FontFile2' or '/FontFile3' names a font that is embedded.

    The traversal visits nested dictionaries and also the elements of
    lists/tuples, so descriptors that are only reachable through an array
    (for example '/DescendantFonts') are found as well.

    Args:
        obj: Root of the tree to scan; anything that is neither a mapping
            nor a list/tuple is ignored.
        fnt: Set that receives the names of the fonts used (mutated in place).
        emb: Set that receives the names of the embedded fonts (mutated in
            place).

    Returns:
        The tuple ``(fnt, emb)`` -- always the same two sets that were passed
        in, even when ``obj`` is not a container.
    """
    fontkeys = ('/FontFile', '/FontFile2', '/FontFile3')

    # Maps id(container) -> container.  Holding the reference keeps the object
    # alive so its id cannot be reused by a different object during the walk.
    visited = {}

    # Explicit stack instead of recursion: deep or cyclic object graphs
    # (e.g. '/Parent' back-pointers) must not overflow the interpreter stack.
    pending = [obj]
    while pending:
        node = pending.pop()

        # Array elements may be unresolved references; anything exposing a
        # getObject() method is resolved first.
        # NOTE(review): assumes the PyPDF2 1.x spelling `getObject` -- confirm
        # against the installed library version.
        resolve = getattr(node, 'getObject', None)
        if callable(resolve):
            node = resolve()

        if id(node) in visited:
            continue

        if hasattr(node, 'keys'):
            visited[id(node)] = node
            if '/BaseFont' in node:
                fnt.add(node['/BaseFont'])
            # A font name counts as embedded only when a font file entry
            # sits in the same dictionary.
            if '/FontName' in node and any(key in node for key in fontkeys):
                emb.add(node['/FontName'])
            pending.extend(node[key] for key in node.keys())
        elif isinstance(node, (list, tuple)):
            visited[id(node)] = node
            pending.extend(node)

    return fnt, emb
if __name__ == '__main__':
    # Command-line entry point: list every font referenced by the PDF given
    # as the first argument, then list the fonts that are not embedded.
    import sys

    try:
        fname = sys.argv[1]
    except IndexError:
        # Catch only the missing-argument case.  Passing a string to
        # sys.exit() prints it to stderr and exits with a non-zero status,
        # so callers can detect the usage error.
        sys.exit("Usage:\n{0} filename.pdf".format(sys.argv[0]))

    pdf = PdfFileReader(fname)
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.getObject()
        # A page without a resource dictionary has no fonts to report.
        if '/Resources' not in obj:
            continue
        # walk() adds to both sets in place, so its return value is not
        # needed here.
        walk(obj['/Resources'], fonts, embedded)

    unembedded = fonts - embedded
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(sorted(unembedded))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment