Last active
August 26, 2018 09:04
-
-
Save ms8r/8efa83a8c7a688ffdeed72619c71b01c to your computer and use it in GitHub Desktop.
Simple script to synthesize text to speech from text file using Google Cloud TTS
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.pyc | |
*~ | |
*.log | |
*.mp3 | |
archive | |
non-git | |
.ipynb_checkpoints | |
ipython_log.py | |
gce_cred.json | |
*.md | |
ipython_log.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Synthesizes text input file to speech in an mp3 file. Uses Google | |
Text-to-Speech API. Authenticate by setting environment variable | |
GOOGLE_APPLICATION_CREDENTIALS to path to credentials JSON file. | |
Run script with -h option to see more details. | |
""" | |
import sys | |
import re | |
from io import BytesIO | |
import json | |
from concurrent import futures | |
from functools import partial | |
import argparse | |
import logging | |
from google.cloud import texttospeech_v1beta1 as texttospeech | |
from mutagen.mp3 import MP3 | |
from mutagen.id3 import ID3, TIT2, SYLT | |
# Log at INFO level with a timestamp and severity prefix on every message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Speech rate (speed) as percentage value
SSML_RATE_DEFAULT = 90
# Default value for the `strength` attribute of the SSML <break> element that
# closes each synthesized paragraph.
SSML_BREAK_STRENGTH_DEFAULT = 'medium'
# Default value for the `time` attribute of that same <break> element.
SSML_BREAK_TIME_DEFAULT = '300ms'
# Default number of worker threads used for the concurrent TTS API calls.
DEFAULT_THREADS = 8

# SSML document wrapped around each paragraph. Placeholders: {rate} (a bare
# number -- the '%' sign is supplied by the template itself), {text},
# {strength} and {time}.
SSML_TEMPLATE = """\
<speak>
<prosody rate="{rate}%">
{text}
</prosody>
<break strength="{strength}" time="{time}" />
</speak>"""
def gen_pars(src_file):
    """
    Generator function that iterates over the lines in `src_file`, skipping
    empty (whitespace-only) lines. Assumes that each line represents a
    paragraph of text, with paragraphs being separated by empty lines.

    Yields tuples (raw_text, text processed by prep_input), where raw_text is
    the line with surrounding whitespace stripped.

    Note that the processed text is plain text: it is NOT XML-escaped, so
    characters such as '&' or '<' that are not part of an entity or a tag are
    passed through unchanged.
    """
    # Patterns whose matches are deleted from the text, applied in order.
    # The entity and tag patterns are anchored to what entities/tags really
    # look like; looser patterns such as '&[^;]+;' or '<[^>]+>' would also
    # delete ordinary text like "&A session;" in "Q&A session; ..." or
    # "< 2 but 3 >" in "1 < 2 but 3 > 2".
    drop_res = [
        # markdown emphasis markers and backslashes
        r'[*\\]',
        # html entities: named (&amp;), decimal (&#38;) or hex (&#x26;)
        r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);',
        # html tags: opening/closing tags, comments and declarations
        r'<(?:/?[A-Za-z]|!)[^>]*>',
    ]

    def prep_input(text):
        """
        Pre-processes input text and returns result as string.
        Here simply removes '*', '\\', any html tags, and html entities.
        """
        for dr in drop_res:
            text = re.sub(dr, '', text)
        return text

    with open(src_file) as fp:
        for par in fp:
            raw = par.strip()
            if not raw:
                # blank separator line between paragraphs
                continue
            yield (raw, prep_input(raw))
def gen_tts_text_inputs(pars, rate='100%', break_strength='strong',
                        break_time='300ms'):
    """
    Generator function that yields Google TTS input text of proper type for
    each par in `pars` (an iterable that yields (raw, processed) tuples).

    `rate` (a non-negative percentage) controls the speed of speech. It may be
    given as a number (90) or as a string with or without a trailing percent
    sign ('90', '90%'); the percent sign is added by the SSML template.

    `break_strength` indicates the strength of the prosodic break between pars.
    Valid values are "none", "x-weak", "weak", "medium", "strong" (default
    value), or "x-strong". The value "none" indicates that no prosodic break
    boundary should be outputted, which can be used to prevent a prosodic break
    which the processor would otherwise produce.

    `break_time` indicates the duration of a pause to be inserted between
    paragraphs in the output in seconds or milliseconds.

    Yields (raw_text, TTS input) tuples.

    See https://www.w3.org/TR/speech-synthesis for details.
    """
    # Imported locally so this function carries its own dependency.
    from xml.sax.saxutils import escape

    # SSML_TEMPLATE already appends '%' to the rate, so strip one supplied by
    # the caller (including the default '100%'); otherwise the attribute
    # would be rendered as rate="100%%".
    rate = str(rate).strip().rstrip('%')
    for raw, p in pars:
        # The paragraph is embedded in an XML document, so '&', '<' and '>'
        # must be escaped or the resulting SSML is malformed.
        ssml = SSML_TEMPLATE.format(text=escape(p), rate=rate,
                                    strength=break_strength, time=break_time)
        yield (raw, texttospeech.types.SynthesisInput(ssml=ssml))
def synth_text(text_input, tts_client, tts_voice, tts_audio_config):
    """
    Speech synthesizes the TTS input in `text_input`, which must be a tuple
    (raw_text, TTS input) with the TTS input being of the proper Google TTS
    type.

    `tts_client` is the Google TTS client whose `synthesize_speech` method is
    called with the TTS input, `tts_voice` and `tts_audio_config`.

    Returns a tuple (raw_text, binary string with mp3 audio).
    """
    raw_text, tts_input = text_input
    response = tts_client.synthesize_speech(tts_input, tts_voice,
                                            tts_audio_config)
    # The audio payload lives in the `audio_content` field of the response.
    # Serializing the whole response message instead would wrap the audio in
    # protobuf framing bytes (and any other response fields), which is not a
    # clean mp3 stream.
    return (raw_text, response.audio_content)
if __name__ == '__main__':
    # Script entry point: parse the command line, synthesize the optional
    # heading and every paragraph of the source file, then write a single mp3
    # file (ID3 tags followed by the concatenated audio) and print the
    # synchronized-text list as JSON.

    # Imported locally so the script body carries its own dependency.
    from xml.sax.saxutils import escape

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('src', help="""input text file; assumes each lines
            represents a paragraph, with paragraphs separated by an empty line
            (markdown-like)""")
    parser.add_argument('--heading', help="""heading text to prepend to
            output""")
    parser.add_argument('--out', required=True, help="""path to mp3 output
            file""")
    parser.add_argument('--sylt', action='store_true', help="""if specified
            synchronized text info will be stored with the output mp3 file as a
            SYLT ID3 frame""")
    parser.add_argument('--rate', type=int, default=SSML_RATE_DEFAULT,
            help="""rate (speed) of speech in output as a positive integer
            percent value (default: {})""".format(SSML_RATE_DEFAULT))
    parser.add_argument('--break_strength', default=SSML_BREAK_STRENGTH_DEFAULT,
            help="""used to indicate the strength of the prosodic break between
            paragraphs in the speech output; valid values are "none", "x-weak",
            "weak", "medium", "strong", or "x-strong"; default: {}; see
            https://www.w3.org/TR/speech-synthesis""".format(
                SSML_BREAK_STRENGTH_DEFAULT))
    parser.add_argument('--break_time', default=SSML_BREAK_TIME_DEFAULT,
            help="""indicates the duration of a pause to be inserted in the
            output between paragraphs in seconds or milliseconds (e.g. "3s",
            "250ms"); defaults to {}""".format(SSML_BREAK_TIME_DEFAULT))
    parser.add_argument('--threads', type=int, default=DEFAULT_THREADS,
            help="""number of threads to spawn as workers when calling Google
            TTS API; defaults to {}""".format(DEFAULT_THREADS))
    args = parser.parse_args()

    logging.info('running with parameters: heading=%s, rate=%d, strength=%s, '
                 'time=%s, threads=%d', args.heading, args.rate,
                 args.break_strength, args.break_time, args.threads)

    tts_client = texttospeech.TextToSpeechClient()
    tts_voice = texttospeech.types.VoiceSelectionParams(
        language_code='en-US',
        ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE)
    tts_audio_config = texttospeech.types.AudioConfig(
        audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    # Audio is collected as a list of chunks and joined once at the end;
    # repeatedly concatenating bytes objects copies the whole buffer each time.
    # `sync_text` holds (text, start offset in ms) pairs and `elapsed_time`
    # the running audio length in seconds.
    if args.heading:
        # The heading is embedded in an XML document, so it must be escaped.
        ssml = SSML_TEMPLATE.format(text=escape(args.heading), rate=args.rate,
                strength=args.break_strength, time=args.break_time)
        text_input = texttospeech.types.SynthesisInput(ssml=ssml)
        response = tts_client.synthesize_speech(text_input, tts_voice,
                                                tts_audio_config)
        # Use the audio payload itself, not the serialized response message
        # (which would add protobuf framing bytes around the mp3 data).
        heading_audio = response.audio_content
        audio_chunks = [heading_audio]
        sync_text = [(args.heading, 0)]
        elapsed_time = MP3(BytesIO(heading_audio)).info.length
    else:
        audio_chunks = []
        elapsed_time = 0
        sync_text = []

    pars = gen_pars(args.src)
    tts_text_inputs = gen_tts_text_inputs(pars, rate=args.rate,
            break_strength=args.break_strength, break_time=args.break_time)
    synth_text_configured = partial(synth_text,
            tts_client=tts_client, tts_voice=tts_voice,
            tts_audio_config=tts_audio_config)

    with futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
        par_count = 0
        # executor.map yields results in input order, so paragraphs are
        # appended in the order they appear in the source file.
        for raw, sp in executor.map(synth_text_configured, tts_text_inputs):
            # Each paragraph starts where the audio accumulated so far ends.
            sync_text.append((raw, round(elapsed_time * 1000)))
            audio_chunks.append(sp)
            elapsed_time += MP3(BytesIO(sp)).info.length
            par_count += 1
    logging.info('synthesized %d paragraphs', par_count)

    tags = ID3()
    if args.heading:
        tags.add(TIT2(encoding=3, text=args.heading))
    if args.sylt:
        tags.add(SYLT(encoding=3, lang='eng', format=2, type=1,
                      desc='Narrated text', text=sync_text))
    # Render the ID3 tag into memory so it can be written ahead of the audio.
    fp_id3 = BytesIO()
    tags.save(fp_id3)

    logging.info('writing audio to %s', args.out)
    with open(args.out, 'wb') as fp:
        fp.write(fp_id3.getvalue())
        fp.write(b''.join(audio_chunks))

    print(json.dumps(sync_text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment