Skip to content

Instantly share code, notes, and snippets.

@imbdb
Created May 11, 2024 06:06
Show Gist options
  • Save imbdb/7a44a997208daad994d291e9915fc3cd to your computer and use it in GitHub Desktop.
Save imbdb/7a44a997208daad994d291e9915fc3cd to your computer and use it in GitHub Desktop.
Python script to extract text that can serve as the title of a page (font size greater than 27)
import fitz
def scrape(filePath, *, min_size=27, width=141):
    """Print a title listing for every page of a PDF and return the title spans.

    Each page is scanned for text spans whose font size is strictly greater
    than ``min_size``. The stripped texts of those spans are joined with a
    single space to form the page title; a page with no such span gets the
    placeholder ``'ADD_TITLE_HERE'``. One line per page is printed in the
    form ``<title> <dashes> <page.number>``.

    Args:
        filePath: Path to the PDF file, passed straight to ``fitz.open``.
        min_size: Font-size threshold; only spans larger than this count as
            title text. Defaults to 27.
        width: Budget used to size the dash filler: the filler has
            ``width - len(title) - len(str(page.number))`` dashes (none when
            that value is zero or negative). Defaults to 141.

    Returns:
        A list of ``(text, font size, font name)`` tuples, one per span that
        passed the size threshold, in document order.
    """
    results = []  # (text, font size, font name) for every title span
    titles = {}   # page.number -> title text, in page order

    # The context manager closes the document even if extraction raises.
    with fitz.open(filePath) as pdf:
        for page in pdf:
            page_dict = page.get_text("dict")
            parts = []
            for block in page_dict["blocks"]:
                # Blocks without a "lines" key carry no text spans; skip them.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        if span["size"] > min_size:
                            parts.append(span["text"].strip())
                            results.append(
                                (span["text"], span["size"], span["font"])
                            )
            # Test the list, not the joined string: a page whose only large
            # span is whitespace keeps an empty title rather than the
            # placeholder.
            titles[page.number] = " ".join(parts) if parts else "ADD_TITLE_HERE"

    for page_number, title in titles.items():
        dots = "-" * (width - len(str(page_number)) - len(title))
        print(f"{title} {dots} {page_number}")
    return results
# Raw string: keeps the backslash literal (same path as before) without
# relying on the invalid "\m" escape sequence, which newer Python versions
# warn about.
print(scrape(r".\merged.pdf"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment