Skip to content

Instantly share code, notes, and snippets.

@jlmalone
Created April 17, 2024 15:23
Show Gist options
  • Save jlmalone/4f9d7d88a0628256e3e2aee70131a4b7 to your computer and use it in GitHub Desktop.
Save jlmalone/4f9d7d88a0628256e3e2aee70131a4b7 to your computer and use it in GitHub Desktop.
Extract the text out of an ePub
# pip install ebooklib beautifulsoup4 python-dotenv
import ebooklib
from dotenv import load_dotenv
import os
from ebooklib import epub
from bs4 import BeautifulSoup
def extract_text_from_epub(file_path):
    """Return the plain text of every document item in an EPUB file.

    Args:
        file_path: Path of the EPUB file; passed unchanged to
            ``epub.read_epub``.

    Returns:
        One string holding the text of each document item, with
        consecutive items separated by a blank line.
    """
    book = epub.read_epub(file_path)
    # Only items whose type is ITEM_DOCUMENT are parsed; every other item
    # type in the book is skipped.
    # NOTE(review): items are taken in the order get_items() yields them --
    # confirm that this matches the book's intended reading order.
    text_content = [
        BeautifulSoup(item.content, 'html.parser').get_text()
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    ]
    return "\n\n".join(text_content)
# Load variables from a .env file (if present) so EPUB_PATH can be
# configured there instead of in the shell environment.
load_dotenv()

# Path to the EPUB file, taken from the EPUB_PATH environment variable.
epub_path = os.getenv("EPUB_PATH")
if not epub_path:
    # os.getenv returns None when the variable is unset; stop here with a
    # clear message instead of failing later inside the EPUB reader.
    raise SystemExit(
        "EPUB_PATH is not set; define it in the environment or in a .env file."
    )

text = extract_text_from_epub(epub_path)
print(text)
# TODO: decide where the extracted text should be saved.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment