# cavedave/parseCroidhePdf.py

## parseCroidhePdf.py
import pdfplumber

def is_bold_font(char):
    """Report whether a character record uses a bold font.

    Args:
        char: Mapping describing one character; only its ``'fontname'``
            entry is read.

    Returns:
        True when the lower-cased font name contains ``"bold"``,
        otherwise False.
    """
    font_name = char['fontname'].lower()
    return font_name.find("bold") >= 0

def _record_term(term_definition_dict, term, definition):
    """Store ``definition`` under ``term`` when ``term`` is a headword.

    A bold run counts as a dictionary headword only if it contains a ":";
    any other bold run is ignored.
    """
    if ":" in term:
        term = term.strip()
        print("term is "+term)
        term_definition_dict[term] = definition


def parse_pdf_to_dictionary(pdf_path):
    """Build a ``{term: definition}`` dictionary from a glossary-style PDF.

    The PDF is read character by character. Consecutive bold characters are
    collected into a candidate term, and the non-bold characters that follow
    are collected into its definition. When the next bold run begins, the
    previous candidate is stored if it contains a ":".

    Known limitations (kept from the original behaviour):
      * Any bold run starts a new candidate, not only bold text at the
        start of a line. A bold word inside a definition therefore
        truncates that definition, and the text after the bold word is
        discarded along with the bold word itself.
      * Character state carries over page boundaries.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        dict mapping each stripped term (including its ":") to the raw
        definition text that followed it.
    """
    term_definition_dict = {}
    # Characters are gathered in lists and joined once per entry rather
    # than growing strings one character at a time.
    term_chars = []
    definition_chars = []
    # True once non-bold text has been seen after the current bold run,
    # meaning the next bold character begins a new candidate term.
    # Initialised here so a PDF whose first character is bold does not
    # fail on an unbound local.
    starts_new_run = False

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for element in page.chars:
                text = element['text']
                if is_bold_font(element):
                    if starts_new_run:
                        # A new bold run closes the previous entry.
                        _record_term(
                            term_definition_dict,
                            "".join(term_chars),
                            "".join(definition_chars),
                        )
                        term_chars = []
                        definition_chars = []
                        starts_new_run = False
                    term_chars.append(text)
                else:
                    starts_new_run = True
                    definition_chars.append(text)

    # The last entry has no following bold run to trigger its storage,
    # so flush it explicitly.
    _record_term(
        term_definition_dict,
        "".join(term_chars),
        "".join(definition_chars),
    )
    return term_definition_dict

# Script driver: parse the hard-coded PDF and print every extracted entry.
pdf_path = "/content/8fddae92ae307b022d964ebe73d45df6-pages-40-44.pdf"
term_definition_dict = parse_pdf_to_dictionary(pdf_path)
for term in term_definition_dict:
    definition = term_definition_dict[term]
    print(f"Term: {term}\n Definition: {definition}\n")
	import pdfplumber

	def is_bold_font(char):
		"""Return True when the character's font name contains "bold".

		Args:
			char: Mapping describing one character; only its ``'fontname'``
				entry is read. The comparison is case-insensitive.
		"""
		# The body's indentation was lost in this copy; restored here.
		return "bold" in char['fontname'].lower()

	def parse_pdf_to_dictionary(pdf_path):
		"""Build a ``{term: definition}`` dictionary from a glossary-style PDF.

		Bold character runs are collected as candidate terms and the non-bold
		characters that follow as their definitions. A candidate is stored
		only if it contains a ":". Any bold run starts a new candidate, so a
		bold word inside a definition truncates that definition and the text
		after it is discarded.

		Args:
			pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

		Returns:
			dict mapping each stripped term to the raw definition text that
			followed it.
		"""
		term_definition_dict = {}
		term_chars = []
		definition_chars = []
		# True once non-bold text has followed the current bold run, so the
		# next bold character begins a new candidate. Initialised up front
		# so a leading bold character does not hit an unbound local.
		starts_new_run = False

		def record(term, definition):
			# Only bold runs containing ":" are dictionary headwords.
			if ":" in term:
				term = term.strip()
				print("term is "+term)
				term_definition_dict[term] = definition

		with pdfplumber.open(pdf_path) as pdf:
			for page in pdf.pages:
				for element in page.chars:
					text = element['text']
					if is_bold_font(element):
						if starts_new_run:
							# A new bold run closes the previous entry.
							record("".join(term_chars), "".join(definition_chars))
							term_chars = []
							definition_chars = []
							starts_new_run = False
						term_chars.append(text)
					else:
						starts_new_run = True
						definition_chars.append(text)

		# The last entry has no following bold run, so flush it explicitly.
		record("".join(term_chars), "".join(definition_chars))
		return term_definition_dict

	# Script driver: parse the hard-coded PDF and print every extracted entry.
	pdf_path = "/content/8fddae92ae307b022d964ebe73d45df6-pages-40-44.pdf"
	term_definition_dict = parse_pdf_to_dictionary(pdf_path)
	for term, definition in term_definition_dict.items():
		# The loop body's indentation was lost in this copy; restored here.
		print(f"Term: {term}\n Definition: {definition}\n")