Skip to content

Instantly share code, notes, and snippets.

@salmanshah1d
Created November 22, 2022 01:31
Show Gist options
  • Save salmanshah1d/636364b16b91c691db8f42ad4fc619d3 to your computer and use it in GitHub Desktop.
Save salmanshah1d/636364b16b91c691db8f42ad4fc619d3 to your computer and use it in GitHub Desktop.
from dotenv import load_dotenv; load_dotenv()
from gpt_index import GPTTreeIndex, SimpleDirectoryReader
from gpt_index.schema import Document
from PIL import Image
from pathlib import Path
import os
import pytesseract
import pdf2image
import time
def get_text_from_pdf(image_path: str):
    """OCR every page of a PDF and return the text as one flat string.

    The PDF at ``image_path`` is rasterised with ``pdf2image`` at 500 dpi,
    each page image is passed to ``get_text_from_pil``, newlines inside a
    page are turned into spaces, and the pages are joined with a single
    space.
    """
    rendered_pages = pdf2image.convert_from_path(image_path, dpi=500)
    flattened = []
    for page_image in rendered_pages:
        page_text = get_text_from_pil(page_image)
        # Collapse line breaks so each page becomes a single line of text.
        flattened.append(page_text.replace("\n", " "))
    return ' '.join(flattened)
def get_text_from_pil(pil_image: Image.Image) -> str:
    """Return the text that pytesseract extracts from an opened PIL image.

    Args:
        pil_image: An image object (``PIL.Image.Image``). Note that
            ``Image`` on its own is the PIL *module*, so the class is
            spelled ``Image.Image`` in the annotation.

    Returns:
        Whatever ``pytesseract.image_to_string`` produces for the image.
    """
    return pytesseract.image_to_string(pil_image)
def get_text_from_path(image_path: str) -> str:
    """Extract text from a file, treating PDFs and plain images differently.

    Args:
        image_path: Path to a PDF or to an image file that PIL can open.

    Returns:
        The OCR text: from ``get_text_from_pdf`` when the file extension is
        ``.pdf`` (in any letter case), otherwise from ``get_text_from_pil``
        applied to the opened image.
    """
    # Compare the extension case-insensitively so "scan.PDF" is still
    # routed to the PDF pipeline instead of failing in Image.open.
    extension = os.path.splitext(image_path)[1].lower()
    if extension == '.pdf':
        return get_text_from_pdf(image_path)
    # Use the image as a context manager so its file handle is released
    # as soon as OCR has finished.
    with Image.open(image_path) as pil_image:
        return get_text_from_pil(pil_image)
def main():
    """Index one PDF with GPTTreeIndex and answer queries interactively.

    The index is cached next to the script as a JSON file whose name is the
    PDF name with spaces replaced by underscores. If that file exists it is
    loaded; otherwise the PDF is OCR'd, a new index is built (the build
    time in seconds is printed) and saved to disk.

    Afterwards the function prompts for queries in a loop, printing each
    answer together with how long the query took. The loop ends cleanly on
    end-of-input (Ctrl-D) or Ctrl-C at the prompt; blank lines are ignored
    rather than being sent to the index.
    """
    path = 'All the Years of Her Life.pdf'
    index_path = Path(path.replace(" ", "_")).with_suffix('.json')

    if index_path.exists():
        index = GPTTreeIndex.load_from_disk(str(index_path))
    else:
        documents = [Document(text=get_text_from_path(path))]
        # perf_counter is monotonic, so the elapsed time cannot be skewed
        # by system clock adjustments.
        start = time.perf_counter()
        index = GPTTreeIndex(documents)
        print(time.perf_counter() - start)
        index.save_to_disk(str(index_path))

    while True:
        try:
            query = input('Query: ')
        except (EOFError, KeyboardInterrupt):
            # Finish the prompt line and leave without a traceback.
            print()
            break
        if not query.strip():
            # Nothing was typed; do not spend an index query on it.
            continue
        start = time.perf_counter()
        answer = index.query(query)
        print(f"Answer: {answer} took {time.perf_counter() - start} seconds")
# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment