Skip to content

Instantly share code, notes, and snippets.

@satish860
Created December 11, 2023 11:29
Show Gist options
  • Save satish860/db66730686739262b5b3c03ea4b5b85a to your computer and use it in GitHub Desktop.
Save satish860/db66730686739262b5b3c03ea4b5b85a to your computer and use it in GitHub Desktop.
from llmsherpa.readers import LayoutPDFReader
# Parser endpoint URL; read_pdf() passes it to the LayoutPDFReader constructor.
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
# Sample input path. read_pdf() takes its own `pdf_url` parameter, which
# shadows this module-level name inside that function.
pdf_url = "1.pdf" # Replace with your actual PDF file path
def read_pdf(pdf_url):
    """Parse the PDF at *pdf_url* with LayoutPDFReader.

    The reader is built with the module-level ``llmsherpa_api_url``.

    Returns:
        Whatever ``LayoutPDFReader.read_pdf`` returns on success, or ``None``
        if constructing the reader or reading the PDF raises; in that case
        the error is printed rather than propagated.
    """
    try:
        document = LayoutPDFReader(llmsherpa_api_url).read_pdf(pdf_url)
    except Exception as e:
        # Best-effort: report the failure and let the caller handle None.
        print(f"Error reading PDF: {e}")
        return None
    return document
def ends_with_sentence_terminator(line):
    """Return True if *line* ends with '.', '?', '!' or ':'.

    Leading and trailing whitespace is ignored; an empty or
    whitespace-only line yields False.
    """
    # Slicing (rather than indexing) gives '' for an empty string, which is
    # simply not a member of the terminator tuple.
    last_char = line.strip()[-1:]
    return last_char in ('.', '?', '!', ":")
def process_pdf_chunks(doc):
    """Merge a parsed document's chunks into sentence-complete text blocks.

    For every chunk yielded by ``doc.chunks()``, the text returned by
    ``chunk.to_context_text()`` is split on newlines. The first line is used
    as the heading: only the part after the last ``'>'`` is kept, and it is
    emitted only when it differs from the previous chunk's heading. The
    remaining lines are joined with a space when a line does not end with a
    sentence terminator (and is not the last line), otherwise with a newline.

    A block whose stripped text does not end with ``'.'`` is considered
    incomplete and is prepended to the next chunk's block instead of being
    emitted on its own.

    Args:
        doc: Object exposing ``chunks()``; each chunk must expose
            ``to_context_text()`` returning a string. ``None`` (what
            ``read_pdf`` returns on failure) is accepted.

    Returns:
        A list of text blocks in document order; empty when *doc* is None
        or has no chunks.
    """
    if doc is None:
        # read_pdf() signals failure with None; treat it as "no content"
        # rather than crashing on doc.chunks().
        return []

    previous_heading = None
    carried = ""  # incomplete text waiting to be completed by the next chunk
    processed_text = []

    for chunk in doc.chunks():
        parts = [carried]
        carried = ""

        # str.split always returns at least one element, so lines[0] is safe.
        lines = chunk.to_context_text().split("\n")
        current_heading = lines[0].split('>')[-1].strip()
        if current_heading != previous_heading:
            parts.append(current_heading + '\n')
            previous_heading = current_heading

        last_index = len(lines) - 1
        for i, line in enumerate(lines[1:], 1):
            # A non-final line that stops mid-sentence is joined to the
            # following line with a space instead of a line break.
            joins_next = i < last_index and not ends_with_sentence_terminator(line)
            parts.append(line + (' ' if joins_next else '\n'))

        fullchunk = "".join(parts)
        # NOTE(review): chunk completeness only accepts '.', while the
        # line-level helper also accepts '?', '!' and ':' — confirm this
        # asymmetry is intended.
        if fullchunk.strip().endswith('.'):
            processed_text.append(fullchunk)
        else:
            carried = fullchunk

    if carried:
        # Trailing text that never reached a full stop is still emitted.
        processed_text.append(carried)
    return processed_text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment