Last active
March 28, 2022 08:33
-
-
Save Kraballa/1d2bf0e04d87addf103ce8a68bd88da0 to your computer and use it in GitHub Desktop.
Extracts plain text from EPUB files. Set the `dir_to_epubs` variable to the directory that contains the `.epub` files; the extracted `.txt` files will be written to `./epub_text`.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from ebooklib import epub | |
import ebooklib | |
import re | |
import html | |
import os | |
# Folder that contains the .epub files to convert. Set this before running
# the script; it is passed to look_through() to build the list of inputs.
dir_to_epubs: str = ''
def look_through(dirname: str, suffix: str = "") -> list:
    """Return the paths of the entries in *dirname* whose names end with *suffix*.

    Args:
        dirname: Directory to list. An empty string means the current
            working directory.
        suffix: Required file-name ending, e.g. ``'.epub'``. The default
            empty string matches every entry.

    Returns:
        A list of paths, each built by joining *dirname* and the entry name,
        in the order ``os.listdir`` reports them. Only the immediate children
        of *dirname* are considered.

    Raises:
        OSError: If *dirname* cannot be listed (propagated from
            ``os.listdir``).
    """
    # os.path.join picks the separator of the running platform. Rewriting
    # "/" to "\\" by hand produced paths that do not exist outside Windows.
    return [
        os.path.join(dirname, name)
        for name in os.listdir(dirname or os.curdir)
        if name.endswith(suffix)
    ]
# Characters that cannot appear in a file name on Windows (a superset of what
# POSIX forbids), plus the ASCII control characters.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')
_HTML_TAG = re.compile(r'<[^<]+?>')
_NEWLINE_RUN = re.compile(r'[\r\n]+')
_SPACE_RUN = re.compile(r' {2,}')


def _output_name(title, file_path: str) -> str:
    """Turn a book title into a short base name that is safe as a file name."""
    # Keep the 24-character limit; unsafe characters are replaced one-for-one
    # so the length is unchanged.
    name = _UNSAFE_FILENAME_CHARS.sub('_', (title or '')[:24])
    if not name.strip():
        # No usable title: fall back to the name of the .epub file itself.
        name = os.path.splitext(os.path.basename(file_path))[0]
    return name


def _page_to_text(raw: bytes, ascii_only: bool) -> str:
    """Reduce one UTF-8 encoded (X)HTML document to plain text."""
    markup = raw.decode('utf-8')
    if ascii_only:
        markup = markup.encode('ascii', errors='ignore').decode('ascii')
    # Strip tags BEFORE unescaping entities. Otherwise escaped text such as
    # "a &lt; b &gt; c" turns into "<...>" and is deleted as if it were markup.
    text = html.unescape(_HTML_TAG.sub('', markup))
    text = _NEWLINE_RUN.sub(' ', text)   # collapse line breaks into one space
    return _SPACE_RUN.sub('\n', text)    # a run of 2+ spaces becomes a newline


def extract_text(file_path: str, out_dir: str = 'epub_text', *,
                 ascii_only: bool = True) -> None:
    """Extract the plain text of one EPUB file and write it to a ``.txt`` file.

    Every item of type ``ebooklib.ITEM_DOCUMENT`` is decoded as UTF-8, has
    its tags removed and its HTML entities unescaped, and has its whitespace
    normalised. The pages are concatenated, each preceded by a blank line.

    The output file is named after the first 24 characters of the book's
    ``dc:title``, with characters that are illegal in file names replaced by
    ``_``. If the book has no usable title, the EPUB's own file name is used.
    Two books whose titles share the same first 24 characters write to the
    same file.

    Args:
        file_path: Path of the ``.epub`` file to read.
        out_dir: Directory that receives the ``.txt`` file. It is created if
            it does not exist.
        ascii_only: When true (the default), non-ASCII characters present in
            the raw markup are dropped. Characters produced by HTML entities
            are still kept. Pass ``False`` to keep the text unchanged.

    Raises:
        UnicodeDecodeError: If a document item is not valid UTF-8.
        OSError: If the output file cannot be written.
    """
    book = epub.read_epub(file_path)
    try:
        title = book.get_metadata('DC', 'title')[0][0]
    except (IndexError, KeyError):
        # The book carries no title metadata.
        title = None
    book_name = _output_name(title, file_path)
    print('extracting from', book_name)

    pages = [
        _page_to_text(item.get_content(), ascii_only)
        for item in book.get_items()
        if item.get_type() == ebooklib.ITEM_DOCUMENT
    ]
    # One join instead of repeated string concatenation in the loop.
    book_content = ''.join('\n\n' + page for page in pages)

    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    book_file_path = os.path.join(out_dir, book_name + '.txt')
    # Explicit UTF-8 so the result does not depend on the locale's default
    # encoding (entities may still introduce non-ASCII characters).
    with open(book_file_path, 'w', encoding='utf-8') as file:
        file.write(book_content)
    print('done writing', len(book_content), ' characters to', file.name)
def main() -> None:
    """Convert every ``.epub`` file found in ``dir_to_epubs`` to plain text."""
    files = look_through(dir_to_epubs, '.epub')
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('epub_text', exist_ok=True)
    for epub_path in files:
        extract_text(epub_path)
    print('created', len(files), 'files')


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment