Skip to content

Instantly share code, notes, and snippets.

@Kraballa
Last active March 28, 2022 08:33
Show Gist options
  • Save Kraballa/1d2bf0e04d87addf103ce8a68bd88da0 to your computer and use it in GitHub Desktop.
Save Kraballa/1d2bf0e04d87addf103ce8a68bd88da0 to your computer and use it in GitHub Desktop.
Extracts plain text from epub files. Set the `dir_to_epubs` variable to the directory that contains the epub files. The extracted `.txt` files are written to `./epub_text`.
from ebooklib import epub
import ebooklib
import re
import html
import os
# Directory that is scanned for .epub files; set this to the folder holding
# the books before running the script.
dir_to_epubs = ''
def look_through(dirname, suffix=""):
    """Return the paths of the entries in ``dirname`` whose names end with ``suffix``.

    Args:
        dirname: Directory to scan. An empty string is treated as the current
            working directory.
        suffix: Required ending of the entry name, e.g. ``".epub"``. The
            default empty string matches every entry.

    Returns:
        A list of paths (``dirname`` joined with each matching entry name),
        sorted by entry name so the result is deterministic.

    Raises:
        OSError: If the directory cannot be listed (propagated from
            ``os.listdir``).
    """
    # os.listdir('') raises, so list the current directory for an empty name.
    # The path is joined with os.path.join only: rewriting "/" to "\\" by hand
    # produces paths that do not exist on non-Windows systems.
    return [
        os.path.join(dirname, name)
        for name in sorted(os.listdir(dirname or os.curdir))
        if name.endswith(suffix)
    ]
# Characters that are not allowed in Windows file names (a superset of what
# POSIX forbids), plus ASCII control characters.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


def _markup_to_text(markup):
    """Reduce the markup of one document to plain text."""
    # Tags are stripped BEFORE entities are unescaped; otherwise escaped text
    # such as "&lt;b&gt;" would turn into "<b>" and be removed as if it were
    # a real tag.
    text = re.sub('<[^<]+?>', '', markup)
    text = html.unescape(text)
    text = re.sub('(\n|\r)+', ' ', text)  # collapse runs of line breaks into one space
    return re.sub(' {2,}', '\n', text)  # 2 or more spaces mark a paragraph break


def _output_stem(book, file_path):
    """Return a file-name-safe base name (at most 24 characters) for the output file."""
    fallback = os.path.splitext(os.path.basename(file_path))[0]
    try:
        title = book.get_metadata('DC', 'title')[0][0]
    except (IndexError, KeyError):
        # No title metadata: name the output after the source file instead.
        title = None
    stem = _UNSAFE_FILENAME_CHARS.sub('_', (title or fallback)[:24]).strip(' .')
    return stem or 'untitled'


def extract_text(file_path: str):
    """Extract the plain text of an epub file into ``epub_text/<title>.txt``.

    Every item of type ``ebooklib.ITEM_DOCUMENT`` is decoded as UTF-8,
    stripped of markup and appended to the output, each preceded by a blank
    line. Non-ASCII characters are preserved and the file is written as UTF-8.

    The output name is the first 24 characters of the book's DC title with
    characters that are invalid in file names replaced by ``_``; when the book
    has no title, the name of the source file is used instead. The
    ``epub_text`` directory is created if it does not exist.

    Args:
        file_path: Path of the epub file to read.

    Returns:
        None. Progress is reported on standard output.
    """
    book = epub.read_epub(file_path)
    book_name = _output_stem(book, file_path)
    print('extracting from', book_name)

    pages = [
        _markup_to_text(item.get_content().decode('utf-8', errors='ignore'))
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    ]
    # A single join instead of repeated string concatenation in the loop.
    book_content = ''.join('\n\n' + page for page in pages)

    os.makedirs('epub_text', exist_ok=True)
    book_file_path = os.path.join('epub_text', book_name + '.txt')
    # Explicit UTF-8: the locale default encoding may be unable to represent
    # characters found in the book.
    with open(book_file_path, 'w', encoding='utf-8') as file:
        file.write(book_content)
    print('done writing', len(book_content), ' characters to', file.name)
# Script entry: convert every .epub file found in dir_to_epubs.
# An empty dir_to_epubs falls back to the current directory, because
# os.listdir('') raises FileNotFoundError.
files = look_through(dir_to_epubs or os.curdir, '.epub')
# makedirs(exist_ok=True) replaces the separate exists()/mkdir() pair, which
# could fail if the directory appeared between the check and the creation.
os.makedirs('epub_text', exist_ok=True)
for file in files:
    extract_text(file)
print('created', len(files), 'files')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment