# imbdb/ppt-pdf-title-extractor.py

## ppt-pdf-title-extractor.py
import fitz

def scrape(filePath):
    """Print a title-per-page listing for a PDF and return the title spans.

    Every text span whose font size is greater than 27 is treated as part of
    its page's title. The stripped texts of those spans are joined with single
    spaces; a page with no such span gets the placeholder ``'ADD_TITLE_HERE'``.
    After all pages are scanned, one line per page is printed in the form
    ``<title> <dashes> <page.number>``, where the number of dashes is
    ``141 - len(str(page.number)) - len(title)`` (no dashes when that is not
    positive).

    Args:
        filePath: Path of the PDF to open, as a string.

    Returns:
        A list of ``(text, font size, font name)`` tuples, one for each span
        that was counted as title text, in the order the spans were visited.
    """
    results = []  # (text, font size, font name) of every title span
    titles = {}  # page.number -> title text, filled in page order
    pdf = fitz.open(filePath)
    try:
        for page in pdf:
            parts = []
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" key are skipped.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        if span["size"] > 27:
                            parts.append(span["text"].strip())
                            results.append(
                                (span["text"], span["size"], span["font"])
                            )
            titles[page.number] = " ".join(parts) if parts else "ADD_TITLE_HERE"
    finally:
        # Release the document even when reading a page fails.
        pdf.close()
    for number, title in titles.items():
        page_label = str(number)
        dots = "-" * (141 - len(page_label) - len(title))
        print(f"{title} {dots} {page_label}")
    return results

# Run the extractor on .\merged.pdf and print the returned list. The path is a
# raw string so the backslash is kept literally instead of forming the invalid
# escape sequence "\m"; the resulting string value is unchanged.
print(scrape(r".\merged.pdf"))
	import fitz

	def scrape(filePath):
		"""Print a title-per-page listing for a PDF and return the title spans.

		Every text span whose font size is greater than 27 is treated as part
		of its page's title. The stripped texts of those spans are joined with
		single spaces; a page with no such span gets the placeholder
		``'ADD_TITLE_HERE'``. After all pages are scanned, one line per page is
		printed in the form ``<title> <dashes> <page.number>``, where the
		number of dashes is ``141 - len(str(page.number)) - len(title)`` (no
		dashes when that is not positive).

		Args:
			filePath: Path of the PDF to open, as a string.

		Returns:
			A list of ``(text, font size, font name)`` tuples, one for each
			span that was counted as title text, in the order the spans were
			visited.
		"""
		results = []  # (text, font size, font name) of every title span
		titles = {}  # page.number -> title text, filled in page order
		pdf = fitz.open(filePath)
		try:
			for page in pdf:
				parts = []
				for block in page.get_text("dict")["blocks"]:
					# Blocks without a "lines" key are skipped.
					for line in block.get("lines", ()):
						for span in line["spans"]:
							if span["size"] > 27:
								parts.append(span["text"].strip())
								results.append(
									(span["text"], span["size"], span["font"])
								)
				titles[page.number] = " ".join(parts) if parts else "ADD_TITLE_HERE"
		finally:
			# Release the document even when reading a page fails.
			pdf.close()
		for number, title in titles.items():
			page_label = str(number)
			dots = "-" * (141 - len(page_label) - len(title))
			print(f"{title} {dots} {page_label}")
		return results

	# Run the extractor on .\merged.pdf and print the returned list. The path is
	# a raw string so the backslash is kept literally instead of forming the
	# invalid escape sequence "\m"; the resulting string value is unchanged.
	print(scrape(r".\merged.pdf"))