Created
February 28, 2024 18:12
-
-
Save MeAmarP/41b1601364522aaef2e095b5cb063686 to your computer and use it in GitHub Desktop.
Extract PDF content page by page into JSON.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json

import fitz
def open_pdf_document(path):
    """Open a PDF document and return the document object.

    Args:
        path: Location of the file; passed unchanged to ``fitz.open``.

    Returns:
        Whatever ``fitz.open`` returns, or ``None`` if opening raised an
        exception. The error is printed rather than re-raised so that the
        caller can decide how to proceed.
    """
    try:
        return fitz.open(path)
    except Exception as e:
        # Intentionally broad: this is the script's best-effort boundary,
        # and any failure to open is reported the same way.
        print(f"Failed to open document: {e}")
        return None
def extract_toc(doc):
    """Extract the table of contents from the document.

    Args:
        doc: An opened document exposing ``get_toc()``.

    Returns:
        The value of ``doc.get_toc()``, unmodified.
    """
    return doc.get_toc()
def extract_page_topics(doc):
    """Collect the plain text of every page, keyed by page number.

    Despite the function name, no topic detection is performed: each
    page's complete ``get_text("text")`` output is stored as-is.

    Args:
        doc: An iterable of page objects, each exposing ``get_text``.

    Returns:
        A dict mapping 1-based page numbers (``int``) to the text of the
        corresponding page. An empty document yields an empty dict.
    """
    # Iterate the document directly instead of indexing with
    # range(len(doc)); numbering starts at 1 to match human page numbers.
    return {
        page_num: page.get_text("text")
        for page_num, page in enumerate(doc, start=1)
    }
def extract_text_per_page(doc, opt: str = "text"):
    """Print the content of each page to standard output.

    Args:
        doc: An iterable of page objects, each exposing ``get_text``.
        opt: Output format forwarded to ``page.get_text``. Defaults to
            ``"text"`` so the function can be called with only a document.

    Returns:
        None. The extracted content is printed, one ``print`` call per page.
    """
    for page in doc:
        print(page.get_text(opt))
def main(path="sample.pdf", output_path="data.json"):
    """Dump a PDF's per-page text to a JSON file and print its TOC.

    Args:
        path: PDF file to read. Defaults to ``"sample.pdf"``.
        output_path: JSON file to write. Defaults to ``"data.json"``.

    Returns:
        None. Prints the table of contents, or a failure message when the
        document could not be opened (in which case nothing is written).
    """
    doc = open_pdf_document(path)
    # Compare against None explicitly: a truthiness test would also reject
    # a successfully opened document whose length is zero.
    if doc is None:
        print("Document could not be opened.")
        return

    try:
        toc = extract_toc(doc)
        print("Table of Contents:", toc)
        page_content = extract_page_topics(doc)
    finally:
        # Release the underlying file handle even if extraction fails.
        doc.close()

    # json.dump writes the integer page numbers as string keys.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(page_content, f)
# Run the extraction only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment