-
-
Save tiarno/8a2995e70cee42f01e79 to your computer and use it in GitHub Desktop.
from PyPDF2 import PdfFileReader | |
from pprint import pprint | |
def walk(obj, fnt, emb):
    '''
    Collect the fonts used by, and embedded in, a PDF object tree.

    If there is a key called '/BaseFont', that is a font that is used in the
    document. If there is a key called '/FontName' and another key in the same
    dictionary object that is called '/FontFilex' (where x is null, 2, or 3),
    then that fontname is embedded.

    The whole tree below ``obj`` is visited: dictionary-like objects (anything
    with a ``keys`` method) and lists/tuples nested inside them. Visiting
    lists matters because font descriptors can sit inside arrays (for
    example a '/DescendantFonts' entry). Each container is visited at most
    once, so trees that reference themselves do not loop forever.

    obj -- root of the tree to inspect; values that are neither dictionary-like
           nor list-like are ignored.
    fnt -- set that receives every '/BaseFont' value found (mutated in place).
    emb -- set that receives every embedded '/FontName' value found (mutated
           in place).

    Returns the tuple (fnt, emb), i.e. the same two set objects passed in.
    '''
    fontkeys = ('/FontFile', '/FontFile2', '/FontFile3')
    # id -> object. Storing the object itself keeps it alive, so its id
    # cannot be recycled for a different object while we are walking.
    seen = {}
    pending = [obj]
    while pending:
        node = pending.pop()
        # Array items may be unresolved references; resolve them by duck
        # typing so plain dicts/lists work too. NOTE(review): assumes PyPDF2
        # objects expose get_object()/getObject() -- confirm for the
        # installed version.
        resolve = (getattr(node, 'get_object', None)
                   or getattr(node, 'getObject', None))
        if callable(resolve):
            node = resolve()
        if id(node) in seen:
            continue
        if hasattr(node, 'keys'):
            seen[id(node)] = node
            if '/BaseFont' in node:
                fnt.add(node['/BaseFont'])
            # A font name only counts as embedded when the same dictionary
            # also carries one of the FontFile keys.
            if '/FontName' in node and any(key in node for key in fontkeys):
                emb.add(node['/FontName'])
            pending.extend(node[key] for key in list(node.keys()))
        elif isinstance(node, (list, tuple)):
            seen[id(node)] = node
            pending.extend(node)
    return fnt, emb  # return the sets for each page
if __name__ == '__main__':
    # Report every font used by 'myfile.pdf' and which of them are not
    # embedded. walk() adds to the two sets in place, so its return value
    # is not needed here.
    fname = 'myfile.pdf'
    pdf = PdfFileReader(fname)
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.getObject()
        # updated via this answer:
        # https://stackoverflow.com/questions/60876103/use-pypdf2-to-detect-non-embedded-fonts-in-pdf-file-generated-by-google-docs/60895334#60895334
        # in order to handle lists inside objects. Thanks misingnoglic !
        # Duck-typed on list so no extra PyPDF2 import is required.
        # NOTE(review): assumes PyPDF2's ArrayObject is a list subclass -- confirm.
        if isinstance(obj, list):
            for i in obj:
                if hasattr(i, 'keys'):
                    walk(i, fonts, embedded)
        elif '/Resources' in obj:
            # A page without a /Resources entry is skipped.
            walk(obj['/Resources'], fonts, embedded)
    unembedded = fonts - embedded
    # print() with a single string argument behaves the same on Python 2 and 3.
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(unembedded)
Hi, can someone help a rookie out? This is a function that will return the names of the various fonts within a PDF, correct?
So I should feed it a PDF (I assume this is the object param of the function), but what are the other two params? It seems like it's asking me for two fonts.
A quick explanation would help. Thanks!
That's very handy! Is it also possible to get the text which is written in bold?
@tiarno any way we can do this as well
Indeed this would be really useful!
We've been using this code for quite a while to detect unembedded fonts.
I made a change recently to support composite (Type 0) fonts: alphagov/notifications-template-preview@83c137b
We've been using this code for quite a while to detect unembedded fonts.
I made a change recently to support composite (Type 0) fonts: alphagov/notifications-template-preview@83c137b
Very nice, I didn't even know about Type0 fonts. Thanks for the comment.
Thank you for this! It helped me out. I've gotten outputs for PDFs I've tested as follows:
{'SymbolMT', 'ArialMT', 'BCDEEE', 'BCDGEE', 'BCDFEE'}
Is there a reason why it shows 'ArialMT' instead of just 'Arial' and what exactly is the 'BCDGEE'? Is there a way to get rid of the 'MT' in 'ArialMT' and the 'BCDGEE'?
I would love this in python 3.
@mteam88 here:
from PyPDF2 import PdfReader
from pprint import pprint
import PyPDF2
def walk(obj, fnt, emb):
    '''
    Collect the fonts used by, and embedded in, a PDF object tree.

    If there is a key called '/BaseFont', that is a font that is used in the
    document. If there is a key called '/FontName' and another key in the same
    dictionary object that is called '/FontFilex' (where x is null, 2, or 3),
    then that fontname is embedded.

    The whole tree below ``obj`` is visited: dictionary-like objects (anything
    with a ``keys`` method) and lists/tuples nested inside them. Visiting
    lists matters because font descriptors can sit inside arrays (for
    example a '/DescendantFonts' entry). Each container is visited at most
    once, so trees that reference themselves do not loop forever.

    obj -- root of the tree to inspect; values that are neither dictionary-like
           nor list-like are ignored.
    fnt -- set that receives every '/BaseFont' value found (mutated in place).
    emb -- set that receives every embedded '/FontName' value found (mutated
           in place).

    Returns the tuple (fnt, emb), i.e. the same two set objects passed in.
    '''
    fontkeys = ('/FontFile', '/FontFile2', '/FontFile3')
    # id -> object. Storing the object itself keeps it alive, so its id
    # cannot be recycled for a different object while we are walking.
    visited = {}
    todo = [obj]
    while todo:
        current = todo.pop()
        # Array items may be unresolved references; resolve them by duck
        # typing so plain dicts/lists work too. NOTE(review): assumes PyPDF2
        # objects expose get_object()/getObject() -- confirm for the
        # installed version.
        resolver = (getattr(current, 'get_object', None)
                    or getattr(current, 'getObject', None))
        if callable(resolver):
            current = resolver()
        if id(current) in visited:
            continue
        if hasattr(current, 'keys'):
            visited[id(current)] = current
            if '/BaseFont' in current:
                fnt.add(current['/BaseFont'])
            # A font name only counts as embedded when the same dictionary
            # also carries one of the FontFile keys.
            if '/FontName' in current and any(k in current for k in fontkeys):
                emb.add(current['/FontName'])
            todo.extend(current[k] for k in list(current.keys()))
        elif isinstance(current, (list, tuple)):
            visited[id(current)] = current
            todo.extend(current)
    return fnt, emb  # return the sets for each page
if __name__ == '__main__':
    # Report every font used by 'myfile.pdf' and which of them are not
    # embedded. walk() adds to the two sets in place, so its return value
    # is not needed here.
    fname = 'myfile.pdf'
    pdf = PdfReader(fname)
    fonts = set()
    embedded = set()
    for page in pdf.pages:
        obj = page.get_object()
        # updated via this answer:
        # https://stackoverflow.com/questions/60876103/use-pypdf2-to-detect-non-embedded-fonts-in-pdf-file-generated-by-google-docs/60895334#60895334
        # in order to handle lists inside objects. Thanks misingnoglic !
        # Duck-typed on list rather than comparing exact types.
        # NOTE(review): assumes PyPDF2's ArrayObject is a list subclass -- confirm.
        if isinstance(obj, list):
            for item in obj:
                if hasattr(item, 'keys'):
                    walk(item, fonts, embedded)
        elif '/Resources' in obj:
            # A page without a /Resources entry is skipped.
            walk(obj['/Resources'], fonts, embedded)
    unembedded = fonts - embedded
    print('Font List')
    pprint(sorted(fonts))
    if unembedded:
        print('\nUnembedded Fonts')
        pprint(unembedded)
Hi! Is there a way to get bold words or bold phrases inside a page containing information about font used?
@tiarno any way we can do this as well