Skip to content

Instantly share code, notes, and snippets.

@alpaylan
Created April 25, 2024 21:59
Show Gist options
  • Save alpaylan/e3c8b052e43e9f6640a8ecaa6fc769d7 to your computer and use it in GitHub Desktop.
Save alpaylan/e3c8b052e43e9f6640a8ecaa6fc769d7 to your computer and use it in GitHub Desktop.
Heuristically splits a book pdf into chapters
# This script takes a PDF book, splits it into chapters, and saves each chapter as a separate PDF file.
import os
import re

import PyPDF2
# Input book and the directory that receives one PDF per detected chapter.
INPUT_PATH = 'book.pdf'
OUTPUT_DIR = 'book'
# A page that mentions "chapter" is treated as a chapter title page when its
# word count is below this fraction of the book's average page length.
SHORT_PAGE_RATIO = 0.6


def collect_page_stats(pdf_reader):
    """Return one ``(page_num, word_count, mentions_chapter)`` tuple per page.

    ``word_count`` is the number of space-separated tokens in the extracted
    text and ``mentions_chapter`` is a case-insensitive substring test for
    "chapter". Progress is printed for every page.
    """
    page_lengths = []
    for page_num, page in enumerate(pdf_reader.pages):
        text = page.extract_text()
        print(f'Processing page {page_num}...')
        page_lengths.append(
            (page_num, len(text.split(" ")), "chapter" in text.lower())
        )
    return page_lengths


def average_word_count(page_lengths):
    """Return the mean word count over *page_lengths* (0.0 when empty)."""
    if not page_lengths:
        # A PDF without pages used to raise ZeroDivisionError here.
        return 0.0
    return sum(page[1] for page in page_lengths) / len(page_lengths)


def select_title_pages(chapter_pages, average_length, ratio=SHORT_PAGE_RATIO):
    """Return the entries of *chapter_pages* shorter than ``ratio`` of average.

    Args:
        chapter_pages: page tuples as produced by ``collect_page_stats``.
        average_length: mean word count of the whole book.
        ratio: fraction of ``average_length`` below which a page counts as
            a (mostly empty) chapter title page.
    """
    threshold = average_length * ratio
    return [page for page in chapter_pages if page[1] < threshold]


def build_chapter_ranges(chapter_starts, total_pages):
    """Return half-open ``(start, end)`` page ranges, one per output file.

    Every page number in *chapter_starts* begins a new range; pages before
    the first one form a leading range, and the final range runs to
    *total_pages*.

    The previous inline loop had two identical branches and could leave
    ``start`` un-advanced after the last title page, so the closing range
    overlapped the one before it. Here ``start`` always advances. Empty
    ranges (e.g. a title page on page 0, or a book with no pages) are
    skipped so no empty PDF is written.
    """
    chapter_ranges = []
    start = 0
    for boundary in [*chapter_starts, total_pages]:
        if boundary > start:
            chapter_ranges.append((start, boundary))
            start = boundary
    return chapter_ranges


def write_chapters(pdf_reader, chapter_ranges, output_dir=OUTPUT_DIR):
    """Write each page range to ``<output_dir>/chapter_<n>.pdf`` (n from 1)."""
    # The output directory was previously assumed to exist.
    os.makedirs(output_dir, exist_ok=True)
    for i, (start, end) in enumerate(chapter_ranges):
        pdf_writer = PyPDF2.PdfWriter()
        for page_num in range(start, end):
            pdf_writer.add_page(pdf_reader.pages[page_num])
        output_path = os.path.join(output_dir, f'chapter_{i+1}.pdf')
        with open(output_path, 'wb') as pdf_output_file:
            pdf_writer.write(pdf_output_file)


def main():
    """Split ``book.pdf`` into per-chapter PDFs using a page-length heuristic."""
    # Keep the input open for the whole run: pages are pulled from
    # pdf_reader again while the chapters are written. The context manager
    # closes the file even if a step fails.
    with open(INPUT_PATH, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        page_lengths = collect_page_stats(pdf_reader)
        print(page_lengths)

        # Average page length (in words)
        average_length = average_word_count(page_lengths)
        # Keep only pages that mention "chapter"
        chapter_pages = [page for page in page_lengths if page[2]]
        print(chapter_pages)
        print("Average length: ", average_length)

        # Chapter pages shorter than 60% of the average length
        short_chapter_pages = select_title_pages(chapter_pages, average_length)
        print(short_chapter_pages)

        # Partition the PDF at the detected title pages
        chapter_ranges = build_chapter_ranges(
            [page[0] for page in short_chapter_pages], len(pdf_reader.pages)
        )
        print(chapter_ranges)

        # Save each chapter as a separate PDF file
        write_chapters(pdf_reader, chapter_ranges)
    print('Chapters separated successfully.')


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment