# Kraballa/ebook_textractor.py

## ebook_textractor.py
from ebooklib import epub
import ebooklib
import re
import html
import os

# folder that contains the epub files to convert
dir_to_epubs = ''

#generate list of epub-files in specified directory
def look_through(dirname, suffix = ""):
    """Return the paths of the entries in *dirname* whose name ends with *suffix*.

    Args:
        dirname: Directory to scan (not recursive). An empty string means the
            current working directory.
        suffix: Required ending of the entry name, e.g. ``'.epub'``. The
            default ``""`` matches every entry.

    Returns:
        A list of paths sorted by entry name. Each path is *dirname*
        (normalised for the running platform) joined with the entry name.

    Raises:
        OSError: If *dirname* does not exist or cannot be listed.
    """
    # os.listdir('') raises FileNotFoundError, so fall back to the current directory
    names = sorted(os.listdir(dirname or os.curdir))
    # normpath turns '/' into '\\' on Windows only; forcing backslashes
    # unconditionally produced unusable paths on every other platform
    base = os.path.normpath(dirname) if dirname else ""
    return [os.path.join(base, name) for name in names if name.endswith(suffix)]

#extract text from specified file
def extract_text(file_path: str):
    """Extract the text of one epub file and write it to ``epub_text/<title>.txt``.

    The output file is named after the first 24 characters of the book's
    ``DC:title`` metadata (characters that are not allowed in file names are
    replaced by ``_``). If the book declares no title, the name of the epub
    file is used instead. Every document item of the book is reduced to
    plain text and the pages are written one after another, each preceded by
    a blank line.

    Args:
        file_path: Path of the epub file to read.

    Returns:
        None. The text is written to disk and progress is printed.

    Raises:
        OSError: If the output file cannot be written. Errors raised by
            ``epub.read_epub`` for unreadable input are passed through.
    """
    book = epub.read_epub(file_path)

    # not every epub declares a title; fall back to the epub's own file name
    titles = book.get_metadata('DC', 'title')
    if titles and titles[0][0]:
        book_name = titles[0][0]
    else:
        book_name = os.path.splitext(os.path.basename(file_path))[0]
    # keep the name short and drop characters that are illegal in file names
    book_name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', book_name[0:24]).strip(' .')
    if not book_name:
        book_name = 'untitled'
    print('extracting from',book_name)

    pages = []
    for item in book.get_items():
        if item.get_type() != ebooklib.ITEM_DOCUMENT:
            continue
        # Non-ASCII characters are dropped, as before. Decoding as ASCII with
        # errors='ignore' gives the same result for valid UTF-8 and no longer
        # fails on documents that are not valid UTF-8.
        page = item.get_content().decode('ascii', errors='ignore')
        # strip tags BEFORE unescaping, otherwise escaped text such as
        # "&lt;b&gt;" would be turned into "<b>" and removed as if it were a tag
        page = re.sub(r'<[^<]+?>', '', page)  # remove html tags
        page = html.unescape(page)            # unescape html chars
        page = re.sub(r'[\r\n]+', ' ', page)  # replace runs of newlines with a space
        page = re.sub(r' {2,}', '\n', page)   # replace 2 or more spaces with a newline
        pages.append(page)
    book_content = ''.join('\n\n' + page for page in pages)

    # os.path.join uses the separator of the running platform; make sure the
    # output folder exists so the function also works when called on its own
    os.makedirs('epub_text', exist_ok=True)
    book_file_path = os.path.join('epub_text', book_name + '.txt')

    # html.unescape can reintroduce non-ASCII characters, so write UTF-8
    # explicitly instead of relying on the platform default encoding
    with open(book_file_path, 'w', encoding='utf-8') as file:
        file.write(book_content)
        print('done writing', len(book_content), ' characters to', file.name)

# collect every epub file of the configured folder
files = look_through(dir_to_epubs,'.epub')

# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir
os.makedirs('epub_text', exist_ok=True)

# convert the books one by one; each call writes one text file
for file in files:
    extract_text(file)

print('created',len(files),'files')
	from ebooklib import epub
	import ebooklib
	import re
	import html
	import os

	#folder to epub files
	dir_to_epubs = ''

	#generate list of epub-files in specified directory
	def look_through(dirname, suffix = ""):
	    """Return the paths of the entries in *dirname* whose name ends with *suffix*.

	    Args:
	        dirname: Directory to scan (not recursive). An empty string means the
	            current working directory.
	        suffix: Required ending of the entry name, e.g. ``'.epub'``. The
	            default ``""`` matches every entry.

	    Returns:
	        A list of paths sorted by entry name. Each path is *dirname*
	        (normalised for the running platform) joined with the entry name.

	    Raises:
	        OSError: If *dirname* does not exist or cannot be listed.
	    """
	    # os.listdir('') raises FileNotFoundError, so fall back to the current directory
	    names = sorted(os.listdir(dirname or os.curdir))
	    # normpath turns '/' into '\\' on Windows only; forcing backslashes
	    # unconditionally produced unusable paths on every other platform
	    base = os.path.normpath(dirname) if dirname else ""
	    return [os.path.join(base, name) for name in names if name.endswith(suffix)]

	#extract text from specified file
	def extract_text(file_path: str):
	    """Extract the text of one epub file and write it to ``epub_text/<title>.txt``.

	    The output file is named after the first 24 characters of the book's
	    ``DC:title`` metadata (characters that are not allowed in file names are
	    replaced by ``_``). If the book declares no title, the name of the epub
	    file is used instead. Every document item of the book is reduced to
	    plain text and the pages are written one after another, each preceded by
	    a blank line.

	    Args:
	        file_path: Path of the epub file to read.

	    Returns:
	        None. The text is written to disk and progress is printed.

	    Raises:
	        OSError: If the output file cannot be written. Errors raised by
	            ``epub.read_epub`` for unreadable input are passed through.
	    """
	    book = epub.read_epub(file_path)

	    # not every epub declares a title; fall back to the epub's own file name
	    titles = book.get_metadata('DC', 'title')
	    if titles and titles[0][0]:
	        book_name = titles[0][0]
	    else:
	        book_name = os.path.splitext(os.path.basename(file_path))[0]
	    # keep the name short and drop characters that are illegal in file names
	    book_name = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', book_name[0:24]).strip(' .')
	    if not book_name:
	        book_name = 'untitled'
	    print('extracting from',book_name)

	    pages = []
	    for item in book.get_items():
	        if item.get_type() != ebooklib.ITEM_DOCUMENT:
	            continue
	        # Non-ASCII characters are dropped, as before. Decoding as ASCII with
	        # errors='ignore' gives the same result for valid UTF-8 and no longer
	        # fails on documents that are not valid UTF-8.
	        page = item.get_content().decode('ascii', errors='ignore')
	        # strip tags BEFORE unescaping, otherwise escaped text such as
	        # "&lt;b&gt;" would be turned into "<b>" and removed as if it were a tag
	        page = re.sub(r'<[^<]+?>', '', page)  # remove html tags
	        # the pattern used to contain an escaped pipe ('\|'), which only
	        # matched the literal sequence newline, '|', carriage return
	        page = html.unescape(page)            # unescape html chars
	        page = re.sub(r'[\r\n]+', ' ', page)  # replace runs of newlines with a space
	        page = re.sub(r' {2,}', '\n', page)   # replace 2 or more spaces with a newline
	        pages.append(page)
	    book_content = ''.join('\n\n' + page for page in pages)

	    # os.path.join uses the separator of the running platform; make sure the
	    # output folder exists so the function also works when called on its own
	    os.makedirs('epub_text', exist_ok=True)
	    book_file_path = os.path.join('epub_text', book_name + '.txt')

	    # html.unescape can reintroduce non-ASCII characters, so write UTF-8
	    # explicitly instead of relying on the platform default encoding
	    with open(book_file_path, 'w', encoding='utf-8') as file:
	        file.write(book_content)
	        print('done writing', len(book_content), ' characters to', file.name)

	# collect every epub file of the configured folder
	files = look_through(dir_to_epubs,'.epub')

	# exist_ok avoids the check-then-create race of os.path.exists + os.mkdir
	os.makedirs('epub_text', exist_ok=True)

	# convert the books one by one; each call writes one text file
	for file in files:
	    extract_text(file)

	print('created',len(files),'files')