Created
August 12, 2023 16:39
-
-
Save cavedave/41576ac0813efc5140fc5b30fcbc67ff to your computer and use it in GitHub Desktop.
Code to parse a pdf of the only Irish-Irish dictionary of the 20th century. Croidhe Cainnte Chiarraighe. Foclóir Gaeilge-Gaeilge (pdf) 1942. PDF is at https://www.forasnagaeilge.ie/wp-content/uploads/2016/06/8fddae92ae307b022d964ebe73d45df6.pdf . I took a few pages using https://smallpdf.com/split-pdf to speed up experiments but that can be done…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pdfplumber | |
def is_bold_font(char):
    """Return True when a character's font name marks it as bold.

    Args:
        char: A mapping describing one character; its ``'fontname'`` entry
            is inspected (the caller passes items of a pdfplumber page's
            ``chars`` list).

    Returns:
        True if the font name contains "bold" in any letter case, else
        False. A missing or ``None`` font name counts as not bold.
    """
    # Fall back to an empty name so a character without font information
    # is treated as regular text instead of raising.
    font_name = char.get('fontname') or ""
    return "bold" in font_name.lower()
def parse_pdf_to_dictionary(pdf_path):
    """Extract term -> definition pairs from a dictionary PDF.

    Every character of every page is visited in the order pdfplumber
    reports it. A run of bold characters is collected as a candidate term
    and the non-bold characters that follow it as its definition. When the
    next bold run begins (and at the end of the document) the candidate is
    stored only if it contains a colon, since dictionary terms carry a
    ':'. A bold run without a colon is currently discarded together with
    the text that followed it.

    Args:
        pdf_path: Location of the PDF, passed straight to
            ``pdfplumber.open``.

    Returns:
        A dict mapping each whitespace-stripped term (colon included) to
        the raw definition text collected after it. If a term occurs more
        than once, the last definition wins.
    """
    term_definition_dict = {}
    term_chars = []
    definition_chars = []
    # True once non-bold text has followed the current bold run, meaning
    # the next bold character starts a new candidate term. It has to be
    # initialised up front: a document whose first character is bold
    # would otherwise read the flag before it was ever assigned.
    seen_definition = False

    def store_entry():
        """Record the collected term if it looks like a dictionary term."""
        term = "".join(term_chars)
        if ":" in term:  # dictionary terms have a ':'
            term = term.strip()
            print("term is " + term)
            term_definition_dict[term] = "".join(definition_chars)
        # NOTE: bold text without ':' says something about the word and
        # should arguably be kept, but for the moment it is dropped.

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for element in page.chars:
                text = element['text']
                if is_bold_font(element):
                    # element['x0'] could be used to require the bold run
                    # to sit at the start of a line, but bold text can
                    # appear there without being a term anyway.
                    if seen_definition:
                        store_entry()
                        term_chars.clear()
                        definition_chars.clear()
                        seen_definition = False
                    term_chars.append(text)
                else:
                    seen_definition = True
                    definition_chars.append(text)

    # The loop only stores an entry when a later bold run starts, so the
    # final term of the document must be flushed explicitly.
    store_entry()
    return term_definition_dict
# Input file: judging by its name, an excerpt (pages 40-44) of the full
# dictionary PDF, kept small to speed up experiments.
pdf_path = "/content/8fddae92ae307b022d964ebe73d45df6-pages-40-44.pdf"
term_definition_dict = parse_pdf_to_dictionary(pdf_path)

# Print every extracted entry for manual inspection.
for term, definition in term_definition_dict.items():
    print(f"Term: {term}\n Definition: {definition}\n")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment