|
from typing import Pattern, Dict, Tuple, Union |
|
from pathlib import Path |
|
|
|
import pyttsx3 |
|
import fitz |
|
import argparse |
|
import re |
|
|
|
# CHAPTER_RE: matches a chapter heading and captures the chapter number (group 1).
# The digits must be preceded and followed by two consecutive
# "one whitespace character + newline" pairs (``\s\n\s\n``).
CHAPTER_RE: Pattern = re.compile(r'\s\n\s\n(\d+)\s\n\s\n')


# CHAPTER_PAGE_RE: matches a page number immediately followed by a chapter heading.
# Group 1 is the page number; it is followed by four "whitespace + newline" pairs,
# then group 2 (the chapter number) and two more "whitespace + newline" pairs.
CHAPTER_PAGE_RE: Pattern = re.compile(r'(\d+)\s\n\s\n\s\n\s\n(\d+)\s\n\s\n')


# PAGE_RE: matches a page number on its own and captures it (group 1).
# The digits must be followed by two "whitespace + newline" pairs. Note that this
# is a prefix of CHAPTER_PAGE_RE, so the more specific pattern has to be tried first.
PAGE_RE: Pattern = re.compile(r'(\d+)\s\n\s\n')
|
|
|
|
|
def on_end(name: str, _=None, **_kwargs) -> None:
    """
    Callback function to be called when an audiobook finishes processing.

    pyttsx3 invokes ``finished-utterance`` callbacks with keyword arguments
    (``name=...``, ``completed=...``) and silently discards any exception the
    callback raises. The signature therefore has to tolerate the ``completed``
    keyword, otherwise the call fails with a ``TypeError`` and nothing is printed.

    :param name: The name of the audiobook (the utterance name given when queuing).
    :param _: Unused; kept so the callback can still be called positionally.
    :param _kwargs: Extra keyword arguments supplied by the engine
                    (e.g. ``completed``); ignored.
    :return: None
    """
    print(f'Audiobook {name} finished!')
|
|
|
|
|
def parse_page(content: str) -> Tuple[int, int, str]:
    """
    Split the raw text of one page into chapter number, page number and body text.

    Three header layouts are tried, most specific first, and the first one that
    matches decides the result:

    1. page number followed by chapter number (``CHAPTER_PAGE_RE``),
    2. chapter number only (``CHAPTER_RE``) - the page number is reported as 0,
    3. page number only (``PAGE_RE``) - the chapter number is reported as 1.

    Every occurrence of the matching pattern is removed from the returned text.
    When nothing matches, chapter 1, page 0 and the untouched text are returned.

    :param content: The text content of a page.
    :return: A tuple ``(chapter, page, text)`` with the header stripped from ``text``.
    """
    page_and_chapter = CHAPTER_PAGE_RE.search(content)
    if page_and_chapter is not None:
        page_no, chapter_no = page_and_chapter.groups()
        return int(chapter_no), int(page_no), CHAPTER_PAGE_RE.sub('', content)

    chapter_only = CHAPTER_RE.search(content)
    if chapter_only is not None:
        return int(chapter_only.group(1)), 0, CHAPTER_RE.sub('', content)

    page_only = PAGE_RE.search(content)
    if page_only is not None:
        return 1, int(page_only.group(1)), PAGE_RE.sub('', content)

    return 1, 0, content
|
|
|
|
|
def open_book(filename: str) -> Dict[int, str]:
    """
    Read a PDF file and group the text of its pages by chapter.

    Each page is run through ``parse_page``. The running chapter number never
    decreases: a page is appended to the highest chapter number seen so far.
    Once a page containing the marker ``FIM`` has been appended, the next page
    (if there is one) is stored under key ``0`` and reading stops.

    :param filename: The path of the PDF file to be processed.
    :return: A dictionary mapping chapter numbers to the concatenated text
             of the pages assigned to that chapter.
    """
    book: Dict[int, str] = {}

    with fitz.Document(filename) as doc:
        latest = 0
        ended = False

        for page in doc:
            number, _, body = parse_page(page.get_text())

            if ended:
                # First page after the end marker: keep it under key 0 and finish.
                book[0] = body
                break

            latest = max(latest, number)
            book[latest] = book.get(latest, '') + body
            ended = 'FIM' in body

    return book
|
|
|
|
|
def read_book(book: str, chapters: Dict[int, str], folder: str) -> None:
    """
    Reads the given book and saves each chapter as an MP3 file in the specified folder.

    All chapters are queued on a single pyttsx3 engine and rendered by one
    ``runAndWait()`` call at the end. Nothing is done (no engine, no folder)
    when ``chapters`` is empty.

    :param book: The name of the book; used in the spoken heading and in file names.
    :param chapters: A dictionary containing the chapters with their content.
                     The keys represent the chapter numbers and the values are
                     the chapter text as a single string.
    :param folder: The output folder where the MP3 files will be saved.
                   It is created (including parents) if it does not exist.
    :return: None
    """
    if not chapters:
        return

    engine = pyttsx3.init()
    engine.connect('finished-utterance', on_end)

    path = Path(folder)
    path.mkdir(parents=True, exist_ok=True)

    for chapter, pages in chapters.items():
        content = f'{book}. Capítulo {chapter}.\n\n' + pages
        filename = f'{book} - Capitulo {str(chapter).zfill(3)}.mp3'

        # pyttsx3 documents ``filename`` as a string; hand it a plain str
        # instead of a Path object so every driver backend can open it.
        engine.save_to_file(content, str(path / filename), name=filename)

    engine.runAndWait()
|
|
|
|
|
class FileAction(argparse.Action):
    """
    Custom argparse action to validate and process a file argument.

    This class is designed to be used with argparse to ensure that the given
    file argument exists, is a file, and has a '.pdf' extension. The extension
    check ignores case, so ``BOOK.PDF`` is accepted as well.

    On failure ``parser.error`` is called, which prints a usage message and
    exits. On success the original, unmodified value is stored on the namespace.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        path = Path(str(values))
        if not path.exists():
            parser.error(f'file not found: {path}')
        if not path.is_file():
            parser.error(f'not a file: {path}')
        # Compare case-insensitively: '.PDF' / '.Pdf' are valid PDF extensions too.
        if path.suffix.lower() != '.pdf':
            parser.error(f'not a pdf file: {path}')
        setattr(namespace, self.dest, values)
|
|
|
|
|
class FolderAction(argparse.Action):
    """
    Custom argparse action to validate and process a folder argument.

    This class is designed to be used with argparse to ensure that the given
    folder argument can serve as a folder: a path that does not exist yet is
    accepted, while a path that already exists must be a directory. Regular
    files and any other existing non-directory (device node, FIFO, ...) are
    rejected, because creating the folder there later could not succeed.

    On failure ``parser.error`` is called, which prints a usage message and
    exits. On success the original, unmodified value is stored on the namespace.
    """

    def __call__(self, parser, namespace, values, option_string=None):
        path = Path(str(values))
        # Anything that already exists but is not a directory cannot be used.
        if path.exists() and not path.is_dir():
            parser.error(f'not a folder: {path}')
        setattr(namespace, self.dest, values)
|
|
|
|
|
# Script entry point: parse the command line, extract the chapters from the PDF
# and render each chapter to an audio file in the output folder.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='audiobook', description='audiobook from pdf file')
    parser.add_argument('book', action=FileAction, help='pdf file')
    parser.add_argument(
        '-o', '--output', action=FolderAction, default='output', help='output folder')

    args = parser.parse_args()

    chapters = open_book(args.book)

    # Use only the file's base name without its extension as the book name.
    # Stripping '.pdf' from the raw argument would keep directory components
    # (sending output files to a missing sub-folder or outside the output
    # folder) and would also remove '.pdf' from the middle of a name.
    name = Path(args.book).stem
    read_book(name, chapters, args.output)
🐍
🍎