Skip to content

Instantly share code, notes, and snippets.

@cuckookernel
Last active February 2, 2024 18:09
Show Gist options
  • Save cuckookernel/d7ccf214a000c025b723554cbb5fac59 to your computer and use it in GitHub Desktop.
Save cuckookernel/d7ccf214a000c025b723554cbb5fac59 to your computer and use it in GitHub Desktop.
#!/bin/env python
"""
Turn a long text into a sound file, using Coqui TTS: text-to-speech
Example usage:
./doc_2_audio.py -e utf16 my_dir/my_document.txt
Will produce output my_dir/my_document.wav
For other options:
./doc_2_audio.py -h
Requirements:
pip install TTS # install coqui-tts library
It could be a good idea to create venv first (before pip install):
python3 -m venv venv-d2a; venv-d2a/bin/activate.sh; pip install wheel
TODO:
- Automatically extract plain text from a pdf document.
- Generate mp3 instead of wav
"""
import re
import argparse
from pathlib import Path
import torch
from TTS.api import TTS
TTS_MODELS_BY_KEY = {
'vits': 'tts_models/en/vctk/vits'
}
# %%
def main():
args = get_cli_args()
run(in_txt_file=Path(args.in_txt_file),
encoding=args.encoding,
tts_model_key=args.tts_model_key,
speaker=args.speaker)
def get_cli_args() -> argparse.Namespace:
arg_parser = argparse.ArgumentParser(description=__doc__)
arg_parser.add_argument('in_txt_file', help='input text file')
arg_parser.add_argument('-e', '--encoding', help="input text file's encoding",
default="utf8")
arg_parser.add_argument('-k', '--tts_model_key',
help=f'tts model key, possible values: '
f'{list(TTS_MODELS_BY_KEY.keys())}',
default="vits")
arg_parser.add_argument('-s', '--speaker',
help=f'speaker used in the case of a multispeaker model',
default="p225")
return arg_parser.parse_args()
def run(*, in_txt_file: Path, encoding: str, tts_model_key: str,
speaker: str) -> None:
# Get device
tts_model_name = TTS_MODELS_BY_KEY[tts_model_key]
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS(tts_model_name).to(device)
sections = read_text_file(in_txt_file, encoding=encoding)
text = "\n".join(sections)
out_wav_file = in_txt_file.with_suffix(".wav")
tts.tts_to_file(text=text, speaker=speaker, file_path=out_wav_file)
print(f"Done generating wav file: {out_wav_file}")
# %%
def read_text_file(in_txt_file: Path, encoding) -> list[str]:
"""Return one string for each section"""
whole_text = in_txt_file.read_text(encoding=encoding)
sections = re.split("\n\n", whole_text, flags=re.MULTILINE)
print(f"{len(sections)} sections found:")
for i, section_raw in enumerate(sections):
section = clean_section(section_raw)
print(f" {i}: {len(section)} : {section[:16]} .. {section[-16:]}")
return sections
# %%
def clean_section(section: str) -> str:
lines = section.split("\n")
new_lines = [line.strip().strip('"') for line in lines]
return " ".join(new_lines)
# %%
def _interactive_testing():
# %%
in_txt_file = Path("/home/teo/gdrive_rclone/EBooks/Machine Learning & Statistics/"
"Incorporating_Ethics_into_Artificial_Intelligence_.txt")
encoding = "utf8"
run(in_txt_file=in_txt_file, encoding=encoding,
tts_model_key="vits", speaker="p225")
# %%
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment