# salmanshah1d/pdf_hacking.py

## pdf_hacking.py
from dotenv import load_dotenv; load_dotenv()
from gpt_index import GPTTreeIndex, SimpleDirectoryReader
from gpt_index.schema import Document
from PIL import Image
from pathlib import Path
import os
import pytesseract
import pdf2image
import time

def get_text_from_pdf(image_path: str, dpi: int = 500) -> str:
    """OCR every page of a PDF and return the text as one string.

    Each page is converted to an image with ``pdf2image``, passed through
    ``get_text_from_pil``, and has its newlines replaced by spaces. The
    per-page strings are then joined with a single space.

    Args:
        image_path: Path of the PDF file to read.
        dpi: Resolution handed to ``pdf2image.convert_from_path``. Defaults
            to 500, the value this function previously hard-coded.

    Returns:
        The text of all pages as a single string.
    """
    pages = pdf2image.convert_from_path(image_path, dpi=dpi)
    return ' '.join(get_text_from_pil(page).replace("\n", " ") for page in pages)

def get_text_from_pil(pil_image: Image.Image) -> str:
    """Run Tesseract OCR on a PIL image and return the recognised text.

    Args:
        pil_image: The image to read. ``Image`` is the ``PIL.Image`` module,
            so the image class itself is ``Image.Image``.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    return pytesseract.image_to_string(pil_image)

def get_text_from_path(image_path: str) -> str:
    """Extract text from a PDF or an image file.

    A path whose extension is ``.pdf`` (compared case-insensitively, so
    ``.PDF`` matches too) is handled by ``get_text_from_pdf``. Any other
    path is opened with ``Image.open`` and passed to ``get_text_from_pil``.

    Args:
        image_path: Path of the PDF or image file to read.

    Returns:
        The recognised text.
    """
    extension = os.path.splitext(image_path)[1]
    if extension.lower() == '.pdf':
        return get_text_from_pdf(image_path)
    # Close the image file deterministically once OCR has finished.
    with Image.open(image_path) as image:
        return get_text_from_pil(image)

def main(path: str = 'All the Years of Her Life.pdf') -> None:
    """Build or load a tree index for a document, then answer queries.

    The index is cached as a ``.json`` file next to the document, named
    after the document with spaces replaced by underscores. If that file
    exists it is loaded. Otherwise the document text is extracted, the
    index is built (the build time in seconds is printed) and saved.
    Queries are then read from standard input until end-of-file or Ctrl-C
    at the prompt.

    Args:
        path: Document to index. Defaults to the file name this script
            originally hard-coded.
    """
    source = Path(path)
    # Rewrite only the file name so spaces in parent directories survive.
    index_path = source.with_name(source.name.replace(" ", "_")).with_suffix('.json')

    if index_path.exists():
        index = GPTTreeIndex.load_from_disk(index_path)
    else:
        documents = [Document(text=get_text_from_path(path))]
        # perf_counter is monotonic, unlike the wall clock, so the measured
        # interval cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        index = GPTTreeIndex(documents)
        print(time.perf_counter() - start)
        index.save_to_disk(index_path)

    while True:
        try:
            query = input('Query: ')
        except (EOFError, KeyboardInterrupt):
            # End the session quietly instead of dying with a traceback.
            print()
            break
        start = time.perf_counter()
        answer = index.query(query)
        print(f"Answer: {answer} took {time.perf_counter() - start} seconds")


# Start the interactive session only when this file is run as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
	from dotenv import load_dotenv; load_dotenv()
	from gpt_index import GPTTreeIndex, SimpleDirectoryReader
	from gpt_index.schema import Document
	from PIL import Image
	from pathlib import Path
	import os
	import pytesseract
	import pdf2image
	import time

	def get_text_from_pdf(image_path: str, dpi: int = 500) -> str:
		"""OCR every page of a PDF and return the text as one string.

		Each page is converted to an image with ``pdf2image``, passed through
		``get_text_from_pil``, and has its newlines replaced by spaces. The
		per-page strings are then joined with a single space.

		Args:
			image_path: Path of the PDF file to read.
			dpi: Resolution handed to ``pdf2image.convert_from_path``.
				Defaults to 500, the value previously hard-coded here.

		Returns:
			The text of all pages as a single string.
		"""
		pages = pdf2image.convert_from_path(image_path, dpi=dpi)
		return ' '.join(get_text_from_pil(page).replace("\n", " ") for page in pages)

	def get_text_from_pil(pil_image: Image.Image) -> str:
		"""Run Tesseract OCR on a PIL image and return the recognised text.

		Args:
			pil_image: The image to read. ``Image`` is the ``PIL.Image``
				module, so the image class itself is ``Image.Image``.

		Returns:
			Whatever ``pytesseract.image_to_string`` produces for the image.
		"""
		return pytesseract.image_to_string(pil_image)

	def get_text_from_path(image_path: str) -> str:
		"""Extract text from a PDF or an image file.

		A path whose extension is ``.pdf`` (compared case-insensitively, so
		``.PDF`` matches too) is handled by ``get_text_from_pdf``. Any other
		path is opened with ``Image.open`` and passed to
		``get_text_from_pil``.

		Args:
			image_path: Path of the PDF or image file to read.

		Returns:
			The recognised text.
		"""
		extension = os.path.splitext(image_path)[1]
		if extension.lower() == '.pdf':
			return get_text_from_pdf(image_path)
		# Close the image file deterministically once OCR has finished.
		with Image.open(image_path) as image:
			return get_text_from_pil(image)

	def main(path: str = 'All the Years of Her Life.pdf') -> None:
		"""Build or load a tree index for a document, then answer queries.

		The index is cached as a ``.json`` file next to the document, named
		after the document with spaces replaced by underscores. If that file
		exists it is loaded. Otherwise the document text is extracted, the
		index is built (the build time in seconds is printed) and saved.
		Queries are then read from standard input until end-of-file or
		Ctrl-C at the prompt.

		Args:
			path: Document to index. Defaults to the file name this script
				originally hard-coded.
		"""
		source = Path(path)
		# Rewrite only the file name so spaces in parent directories survive.
		index_path = source.with_name(source.name.replace(" ", "_")).with_suffix('.json')

		if index_path.exists():
			index = GPTTreeIndex.load_from_disk(index_path)
		else:
			documents = [Document(text=get_text_from_path(path))]
			# perf_counter is monotonic, unlike the wall clock, so the
			# interval cannot be skewed by system clock adjustments.
			start = time.perf_counter()
			index = GPTTreeIndex(documents)
			print(time.perf_counter() - start)
			index.save_to_disk(index_path)

		while True:
			try:
				query = input('Query: ')
			except (EOFError, KeyboardInterrupt):
				# End the session quietly instead of dying with a traceback.
				print()
				break
			start = time.perf_counter()
			answer = index.query(query)
			print(f"Answer: {answer} took {time.perf_counter() - start} seconds")


	# Start the interactive session only when this file is run as a script,
	# not when it is imported.
	if __name__ == "__main__":
		main()