Skip to content

Instantly share code, notes, and snippets.

@elnazsn1988
Forked from LouisdeBruijn/fonts.py
Created September 23, 2020 00:10
Show Gist options
  • Save elnazsn1988/43e9c499b19249f573ed38da9b4364a4 to your computer and use it in GitHub Desktop.
Save elnazsn1988/43e9c499b19249f573ed38da9b4364a4 to your computer and use it in GitHub Desktop.
def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.

    Walks every text span of every page, derives a style identifier for the
    span and counts how many spans share that identifier.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: style identifiers sorted by span count (most used first, ties keep
        first-seen order), and a dict mapping each identifier to its style
        information
    :raises ValueError: if no text span was found in the document
    """
    # Imported locally so the function does not rely on a module-level import.
    from collections import Counter

    styles = {}
    font_counts = Counter()

    for page in doc:
        # PyMuPDF renamed Page.getText to Page.get_text; prefer the new name
        # and fall back to the legacy one so both API generations work.
        get_text = getattr(page, "get_text", None) or page.getText

        for block in get_text("dict")["blocks"]:  # iterate through the blocks
            if block["type"] != 0:  # only type 0 blocks contain text
                continue
            for line in block["lines"]:  # iterate through the text lines
                for span in line["spans"]:  # iterate through the text spans
                    if granularity:
                        identifier = (
                            f"{span['size']}_{span['flags']}_"
                            f"{span['font']}_{span['color']}"
                        )
                        style = {
                            'size': span['size'],
                            'flags': span['flags'],
                            'font': span['font'],
                            'color': span['color'],
                        }
                    else:
                        identifier = f"{span['size']}"
                        style = {'size': span['size'], 'font': span['font']}

                    # A later span with the same identifier overwrites the
                    # stored style, so the last one seen wins.
                    styles[identifier] = style
                    font_counts[identifier] += 1  # count the fonts usage

    if not font_counts:
        raise ValueError("Zero discriminating fonts found!")

    # most_common() sorts by count, descending; equal counts keep the order
    # in which the identifiers were first encountered.
    return font_counts.most_common(), styles
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment