Skip to content

Instantly share code, notes, and snippets.

@MeAmarP
Created February 28, 2024 18:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MeAmarP/41b1601364522aaef2e095b5cb063686 to your computer and use it in GitHub Desktop.
Save MeAmarP/41b1601364522aaef2e095b5cb063686 to your computer and use it in GitHub Desktop.
Extract PDF content page-wise into JSON
import fitz
import json
def open_pdf_document(path):
    """Open a PDF document and return the document object.

    Args:
        path: Location of the file, passed unchanged to ``fitz.open``.

    Returns:
        The object returned by ``fitz.open(path)``, or ``None`` when opening
        raised an exception.
    """
    # Deliberately broad: this is a best-effort boundary that reports the
    # failure on stdout and lets the caller decide what to do with ``None``.
    try:
        return fitz.open(path)
    except Exception as e:
        print(f"Failed to open document: {e}")
        return None
def extract_toc(doc):
    """Return the table of contents reported by ``doc.get_toc()``.

    Args:
        doc: An opened document object exposing ``get_toc()``.

    Returns:
        Whatever ``doc.get_toc()`` returns, unmodified.
        NOTE(review): PyMuPDF documents this as a list of
        ``[level, title, page]`` entries -- confirm for the installed version.
    """
    return doc.get_toc()
def extract_page_topics(doc):
    """Collect the plain text of every page, keyed by 1-based page number.

    Despite the name, no topic detection happens here: each value is the
    full string returned by ``page.get_text("text")`` for that page.

    Args:
        doc: An opened document that yields its pages when iterated.

    Returns:
        A dict mapping page number (starting at 1) to that page's text.
        An empty document produces an empty dict.
    """
    # enumerate(..., start=1) gives human-style page numbers directly,
    # without indexing by range(len(doc)) and adding 1 by hand.
    return {
        page_number: page.get_text("text")
        for page_number, page in enumerate(doc, start=1)
    }
def extract_text_per_page(doc, opt: str = "text") -> None:
    """Print the content of each page to stdout, one page after another.

    Args:
        doc: An opened document that yields its pages when iterated.
        opt: Output format forwarded to ``page.get_text``. Defaults to
            ``"text"`` so the function can be called with just a document.
    """
    for page in doc:
        print(page.get_text(opt))
def main():
    """Dump the per-page text of ``sample.pdf`` into ``data.json``.

    Opens the PDF, prints its table of contents, extracts the text of every
    page and writes the resulting page-number -> text mapping as JSON.
    Prints a message and returns early if the document cannot be opened.
    """
    path = "sample.pdf"
    doc = open_pdf_document(path)
    # Compare against None explicitly: a document object that defines
    # __len__ is falsy when it has zero pages, which is not an open failure.
    if doc is None:
        print("Document could not be opened.")
        return

    try:
        toc = extract_toc(doc)
        print("Table of Contents:", toc)
        page_content = extract_page_topics(doc)
    finally:
        # Release the underlying file handle even if extraction fails.
        doc.close()

    with open("data.json", "w", encoding="utf-8") as f:
        # Dump the dictionary to the file
        json.dump(page_content, f)
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment