# MeAmarP/pdf_extract.py

## pdf_extract.py
import fitz
import json

def open_pdf_document(path):
    """Return the object produced by ``fitz.open(path)``, or ``None`` on failure.

    Any exception raised while opening is reported on stdout as
    ``Failed to open document: <error>`` and is not propagated.
    """
    try:
        return fitz.open(path)
    except Exception as exc:
        print(f"Failed to open document: {exc}")
    return None

def extract_toc(doc):
    """Return whatever ``doc.get_toc()`` yields for the given document."""
    return doc.get_toc()

def extract_page_topics(doc):
    """Collect the text of every page, keyed by 1-based page number.

    Each page is fetched with ``doc.load_page`` and read through
    ``get_text("text")``; the returned dict maps ``1..len(doc)`` to that
    text. No topic detection is performed on the text.
    """
    return {
        index + 1: doc.load_page(index).get_text("text")
        for index in range(len(doc))
    }

def extract_text_per_page(doc, opt: str = "text"):
    """Print the extracted content of every page in ``doc``.

    Args:
        doc: Iterable of page objects exposing ``get_text``.
        opt: Extraction option forwarded unchanged to ``page.get_text``.
            Defaults to ``"text"``, the option ``extract_page_topics`` uses.

    Returns:
        None. The content is written to stdout, one ``print`` per page.
    """
    # The docstring and the loop must share one indentation level; the
    # previous 2-space/4-space mix made the module fail to parse.
    for page in doc:
        print(page.get_text(opt))


def main():
    """Dump the text of ``sample.pdf`` to ``data.json``.

    Opens ``sample.pdf``, prints its table of contents, collects the text of
    every page via ``extract_page_topics`` and writes that mapping to
    ``data.json``. Prints ``Document could not be opened.`` and returns when
    ``open_pdf_document`` yields ``None``.
    """
    path = "sample.pdf"
    doc = open_pdf_document(path)
    # Compare against None explicitly: the document supports len(), so a
    # zero-page document is falsy and a plain truthiness test would
    # misreport it as a failed open.
    if doc is None:
        print("Document could not be opened.")
        return

    try:
        print("Table of Contents:", extract_toc(doc))
        page_content = extract_page_topics(doc)
    finally:
        # Release the document handle even if extraction raises.
        doc.close()

    with open("data.json", "w") as f:
        # json.dump writes the integer page numbers as string keys.
        json.dump(page_content, f)

# Run the extraction only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
	import fitz
	import json

	def open_pdf_document(path):
		"""Return the object produced by ``fitz.open(path)``, or ``None`` on failure.

		Any exception raised while opening is reported on stdout as
		``Failed to open document: <error>`` and is not propagated.
		"""
		# Block structure restored: the flattened original put ``try``,
		# its body and ``except`` at one level, which cannot parse.
		try:
			return fitz.open(path)
		except Exception as e:
			print(f"Failed to open document: {e}")
			return None

	def extract_toc(doc):
		"""Return whatever ``doc.get_toc()`` yields for the given document."""
		# Body re-indented under the ``def``; the flattened original had
		# the header and body at the same level.
		return doc.get_toc()

	def extract_page_topics(doc):
		"""Collect the text of every page, keyed by 1-based page number.

		Each page is fetched with ``doc.load_page`` and read through
		``get_text("text")``; the returned dict maps ``1..len(doc)`` to
		that text. No topic detection is performed on the text.
		"""
		# Loop nesting restored: the flattened original had the ``for``
		# header, its body and the ``return`` all at one level.
		page_content = {}
		for page_num in range(len(doc)):
			page = doc.load_page(page_num)
			page_content[page_num + 1] = page.get_text("text")
		return page_content

	def extract_text_per_page(doc, opt: str = "text"):
		"""Print the extracted content of every page in ``doc``.

		Args:
			doc: Iterable of page objects exposing ``get_text``.
			opt: Extraction option forwarded unchanged to ``page.get_text``.
				Defaults to ``"text"``, the option ``extract_page_topics`` uses.

		Returns:
			None. The content is written to stdout, one ``print`` per page.
		"""
		# Loop body re-indented under the ``for``; the flattened original
		# had every statement at the same level as the ``def``.
		for page in doc:
			print(page.get_text(opt))



	def main():
		"""Dump the text of ``sample.pdf`` to ``data.json``.

		Opens ``sample.pdf``, prints its table of contents, collects the
		text of every page via ``extract_page_topics`` and writes that
		mapping to ``data.json``. Prints ``Document could not be opened.``
		and returns when ``open_pdf_document`` yields ``None``.
		"""
		path = "sample.pdf"
		doc = open_pdf_document(path)
		# Compare against None explicitly: the document supports len(), so
		# a zero-page document is falsy and a plain truthiness test would
		# misreport it as a failed open.
		if doc is None:
			print("Document could not be opened.")
			return

		try:
			print("Table of Contents:", extract_toc(doc))
			page_content = extract_page_topics(doc)
		finally:
			# Release the document handle even if extraction raises.
			doc.close()

		with open("data.json", "w") as f:
			# json.dump writes the integer page numbers as string keys.
			json.dump(page_content, f)

	# Run the extraction only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
		main()