Skip to content

Instantly share code, notes, and snippets.

@tiarno
Last active February 2, 2024 22:15
Show Gist options
  • Save tiarno/8a2995e70cee42f01e79 to your computer and use it in GitHub Desktop.
Save tiarno/8a2995e70cee42f01e79 to your computer and use it in GitHub Desktop.
find PDF font info with PyPDF2, example code
from PyPDF2 import PdfFileReader
from pprint import pprint
def walk(obj, fnt, emb, _seen=None):
    '''
    Recursively collect the fonts used and the fonts embedded in an object tree.

    If there is a key called 'BaseFont', that is a font that is used in the document.
    If there is a key called 'FontName' and another key in the same dictionary object
    that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
    embedded.

    Lists and tuples are descended into as well as dictionaries, so font
    dictionaries that are only reachable through an array (for example the
    '/DescendantFonts' array of a composite font) are found too.

    Arguments:
      obj   -- the object to inspect; anything that has a 'keys' method is
               treated as a dictionary, lists/tuples are treated as arrays,
               everything else is ignored.
      fnt   -- set of fonts used; updated in place.
      emb   -- set of fonts embedded; updated in place.
      _seen -- internal bookkeeping of containers already visited; callers
               should leave it at its default.

    Returns the tuple (fnt, emb), i.e. the same two set objects that were
    passed in, whatever the type of obj.
    '''
    if _seen is None:
        _seen = {}

    is_mapping = hasattr(obj, 'keys')
    if not is_mapping and not isinstance(obj, (list, tuple)):
        # Scalars (names, numbers, strings, ...) cannot contain font info.
        return fnt, emb

    # Visit every container only once so that cyclic references cannot cause
    # unbounded recursion.  The object itself is stored next to its id so the
    # id cannot be recycled for a different object during the walk.
    if id(obj) in _seen:
        return fnt, emb
    _seen[id(obj)] = obj

    if is_mapping:
        font_file_keys = ('/FontFile', '/FontFile2', '/FontFile3')
        if '/BaseFont' in obj:
            fnt.add(obj['/BaseFont'])
        if '/FontName' in obj and any(key in obj for key in font_file_keys):
            emb.add(obj['/FontName'])
        children = [obj[key] for key in obj.keys()]
    else:
        children = obj

    for child in children:
        # Array elements may be unresolved references; if the element offers a
        # get_object()/getObject() method, use it to obtain the real object.
        # NOTE(review): assumes that method takes no arguments and returns the
        # referenced object, as PyPDF2's indirect objects do -- confirm.
        resolver = getattr(child, 'get_object', None) or getattr(child, 'getObject', None)
        if callable(resolver):
            child = resolver()
        walk(child, fnt, emb, _seen)

    return fnt, emb  # the same sets that were passed in
if __name__ == '__main__':
    # Print every font referenced by the PDF and, separately, the fonts that
    # are referenced but not embedded.
    fname = 'myfile.pdf'
    pdf = PdfFileReader(fname)
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.getObject()
        # updated via this answer:
        # https://stackoverflow.com/questions/60876103/use-pypdf2-to-detect-non-embedded-fonts-in-pdf-file-generated-by-google-docs/60895334#60895334
        # in order to handle lists inside objects. Thanks misingnoglic !
        # untested code since I don't have such a PDF to play with.
        if isinstance(obj, list):
            # Duck-typed array check; only the PdfFileReader name is imported
            # from PyPDF2 here, so the array class cannot be named directly.
            # NOTE(review): assumes PyPDF2's ArrayObject subclasses list -- confirm.
            roots = [item for item in obj if hasattr(item, 'keys')]
        elif hasattr(obj, 'keys') and '/Resources' in obj:
            roots = [obj['/Resources']]
        else:
            # No resources stored directly on this object: nothing to walk.
            roots = []
        for root in roots:
            # walk() adds to both sets in place, so its return value is not needed.
            walk(root, fonts, embedded)
    unembedded = fonts - embedded
    # Single-argument print(...) gives the same output under Python 2 and 3.
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(unembedded)
@benthorner
Copy link

We've been using this code for quite a while to detect unembedded fonts.

I made a change recently to support composite (Type 0) fonts: alphagov/notifications-template-preview@83c137b

@tiarno
Copy link
Author

tiarno commented Jun 25, 2021

We've been using this code for quite a while to detect unembedded fonts.

I made a change recently to support composite (Type 0) fonts: alphagov/notifications-template-preview@83c137b

Very nice, I didn't even know about Type0 fonts. Thanks for the comment.

@medhadeeptimahanti
Copy link

Thank you for this! It helped me out. I've gotten outputs for PDFs I've tested as follows:

{'SymbolMT', 'ArialMT', 'BCDEEE', 'BCDGEE', 'BCDFEE'}

Is there a reason why it shows 'ArialMT' instead of just 'Arial' and what exactly is the 'BCDGEE'? Is there a way to get rid of the 'MT' in 'ArialMT' and the 'BCDGEE'?

@mteam88
Copy link

mteam88 commented Jan 10, 2022

I would love this in python 3.

@roj1512
Copy link

roj1512 commented May 7, 2023

@mteam88 here:

from PyPDF2 import PdfReader
from pprint import pprint
import PyPDF2

def walk(obj, fnt, emb, _seen=None):
    '''
    Recursively collect the fonts used and the fonts embedded in an object tree.

    If there is a key called 'BaseFont', that is a font that is used in the document.
    If there is a key called 'FontName' and another key in the same dictionary object
    that is called 'FontFilex' (where x is null, 2, or 3), then that fontname is
    embedded.

    Lists and tuples are descended into as well as dictionaries, so font
    dictionaries that are only reachable through an array (for example the
    '/DescendantFonts' array of a composite font) are found too.

    Arguments:
      obj   -- the object to inspect; anything that has a 'keys' method is
               treated as a dictionary, lists/tuples are treated as arrays,
               everything else is ignored.
      fnt   -- set of fonts used; updated in place.
      emb   -- set of fonts embedded; updated in place.
      _seen -- internal bookkeeping of containers already visited; callers
               should leave it at its default.

    Returns the tuple (fnt, emb), i.e. the same two set objects that were
    passed in, whatever the type of obj.
    '''
    if _seen is None:
        _seen = {}

    is_mapping = hasattr(obj, 'keys')
    if not is_mapping and not isinstance(obj, (list, tuple)):
        # Scalars (names, numbers, strings, ...) cannot contain font info.
        return fnt, emb

    # Visit every container only once so that cyclic references cannot cause
    # unbounded recursion.  The object itself is stored next to its id so the
    # id cannot be recycled for a different object during the walk.
    if id(obj) in _seen:
        return fnt, emb
    _seen[id(obj)] = obj

    if is_mapping:
        font_file_keys = ('/FontFile', '/FontFile2', '/FontFile3')
        if '/BaseFont' in obj:
            fnt.add(obj['/BaseFont'])
        if '/FontName' in obj and any(key in obj for key in font_file_keys):
            emb.add(obj['/FontName'])
        children = [obj[key] for key in obj.keys()]
    else:
        children = obj

    for child in children:
        # Array elements may be unresolved references; if the element offers a
        # get_object()/getObject() method, use it to obtain the real object.
        # NOTE(review): assumes that method takes no arguments and returns the
        # referenced object, as PyPDF2's indirect objects do -- confirm.
        resolver = getattr(child, 'get_object', None) or getattr(child, 'getObject', None)
        if callable(resolver):
            child = resolver()
        walk(child, fnt, emb, _seen)

    return fnt, emb  # the same sets that were passed in

if __name__ == '__main__':
    # Print every font referenced by the PDF and, separately, the fonts that
    # are referenced but not embedded.
    fname = 'myfile.pdf'
    pdf = PdfReader(fname)
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.get_object()
        # updated via this answer:
        # https://stackoverflow.com/questions/60876103/use-pypdf2-to-detect-non-embedded-fonts-in-pdf-file-generated-by-google-docs/60895334#60895334 
        # in order to handle lists inside objects. Thanks misingnoglic !
        # untested code since I don't have such a PDF to play with.
        if isinstance(obj, PyPDF2.generic.ArrayObject):  # You can also do ducktyping here
            roots = [item for item in obj if hasattr(item, 'keys')]
        elif hasattr(obj, 'keys') and '/Resources' in obj:
            roots = [obj['/Resources']]
        else:
            # No resources stored directly on this object: nothing to walk.
            roots = []
        for root in roots:
            # walk() adds to both sets in place, so its return value is not needed.
            walk(root, fonts, embedded)

    unembedded = fonts - embedded
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(unembedded)

@J0hn3ch
Copy link

J0hn3ch commented Aug 1, 2023

Hi! Is there a way to get the bold words or bold phrases inside a page, together with information about the font used?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment