Skip to content

Instantly share code, notes, and snippets.

@zenssh
Forked from tiarno/gist:8a2995e70cee42f01e79
Created January 19, 2016 23:01
Show Gist options
  • Save zenssh/fe8cc625563b53529b4e to your computer and use it in GitHub Desktop.
Save zenssh/fe8cc625563b53529b4e to your computer and use it in GitHub Desktop.
find PDF font info with PyPDF2, example code
from PyPDF2 import PdfFileReader
from pprint import pprint
def walk(obj, fnt, emb):
    '''
    Collect the fonts used and the fonts embedded beneath a PDF object tree.

    A dictionary holding a '/BaseFont' key names a font that is used in the
    document. A dictionary holding a '/FontName' key together with one of
    '/FontFile', '/FontFile2' or '/FontFile3' names a font that is embedded.

    obj -- root of the tree to search. Dictionaries (anything with a ``keys``
           attribute) and lists/tuples are descended into; every other value
           is ignored.
    fnt -- set, updated in place with each '/BaseFont' value found.
    emb -- set, updated in place with each embedded '/FontName' value found.

    Returns the tuple (fnt, emb), i.e. the same two set objects that were
    passed in. This holds even when obj is not a container, so callers can
    always unpack and combine the result.
    '''
    fontkeys = ('/FontFile', '/FontFile2', '/FontFile3')
    # Containers already visited, keyed by id(). The container itself is kept
    # as the value so its id cannot be recycled while the traversal runs.
    seen = {}
    # Explicit work list instead of recursion: deep or cyclic object graphs
    # (for example '/Parent' back-references) must not exhaust the stack.
    pending = [obj]
    while pending:
        node = pending.pop()
        # Elements pulled out of arrays may still be indirect references
        # (assumption about PyPDF2 -- verify against the installed version);
        # resolve them when the object offers a resolver method.
        resolve = (getattr(node, 'get_object', None)
                   or getattr(node, 'getObject', None))
        if callable(resolve):
            node = resolve()
        if id(node) in seen:
            continue
        if hasattr(node, 'keys'):
            seen[id(node)] = node
            if '/BaseFont' in node:
                fnt.add(node['/BaseFont'])
            # Embedded only when a font file entry sits in the same
            # dictionary as the font name.
            if '/FontName' in node and any(key in node for key in fontkeys):
                emb.add(node['/FontName'])
            pending.extend(node[key] for key in node.keys())
        elif isinstance(node, (list, tuple)):
            # Arrays (such as a '/DescendantFonts' entry) can hold further
            # font dictionaries, so their elements are searched as well.
            seen[id(node)] = node
            pending.extend(node)
    return fnt, emb
if __name__ == '__main__':
    # Script entry point: list every font used in a PDF and, separately,
    # the fonts that are used but not embedded.
    import sys

    # Optional first command-line argument names the PDF to inspect;
    # without it the original default file name is used.
    fname = sys.argv[1] if len(sys.argv) > 1 else 'myfile.pdf'
    pdf = PdfFileReader(fname)
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.getObject()
        # A page without a resource dictionary has no fonts to report.
        if '/Resources' not in obj:
            continue
        # walk() adds to both sets in place, so its return value is not
        # needed here.
        walk(obj['/Resources'], fonts, embedded)
    unembedded = fonts - embedded
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        # Sorted so the report order is stable between runs.
        pprint(sorted(unembedded))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment