Skip to content

Instantly share code, notes, and snippets.

@ms8r
Last active August 26, 2018 09:04
Show Gist options
  • Save ms8r/8efa83a8c7a688ffdeed72619c71b01c to your computer and use it in GitHub Desktop.
Save ms8r/8efa83a8c7a688ffdeed72619c71b01c to your computer and use it in GitHub Desktop.
Simple script to synthesize text to speech from text file using Google Cloud TTS
*.pyc
*~
*.log
*.mp3
archive
non-git
.ipynb_checkpoints
ipython_log.py
gce_cred.json
*.md
ipython_log.py
#!/usr/bin/env python
"""
Synthesizes text input file to speech in an mp3 file. Uses Google
Text-to-Speech API. Authenticate by setting environment variable
GOOGLE_APPLICATION_CREDENTIALS to path to credentials JSON file.
Run script with -h option to see more details.
"""
import sys
import re
from io import BytesIO
import json
from concurrent import futures
from functools import partial
import argparse
import logging
from xml.sax.saxutils import escape

from google.cloud import texttospeech_v1beta1 as texttospeech
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, TIT2, SYLT
# Configure the root logger: timestamped messages at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Speech rate (speed) as percentage value
SSML_RATE_DEFAULT = 90
# Default prosodic break strength and pause duration used between paragraphs
SSML_BREAK_STRENGTH_DEFAULT = 'medium'
SSML_BREAK_TIME_DEFAULT = '300ms'
# Default number of worker threads
DEFAULT_THREADS = 8

# SSML document for a single paragraph: the text wrapped in a <prosody>
# element that sets the rate, followed by a <break>. Placeholders: {rate}
# (number only, the '%' sign is part of the template), {text}, {strength}
# and {time}.
SSML_TEMPLATE = '\n'.join((
    '<speak>',
    '<prosody rate="{rate}%">',
    '{text}',
    '</prosody>',
    '<break strength="{strength}" time="{time}" />',
    '</speak>',
))
def gen_pars(src_file):
    """
    Generator function that iterates over the lines in `src_file`, skipping
    empty (whitespace-only) lines. Assumes that each line represents a
    paragraph of text, with paragraphs being separated by empty lines.

    Yields tuples (raw_text, text processed by prep_input), where raw_text is
    the line with leading and trailing whitespace stripped.
    """
    # Compiled once per generator instead of once per paragraph. Applied in
    # this order: markdown emphasis/escape characters, HTML entities, HTML
    # tags.
    drop_res = [
        re.compile(r'[*\\]'),
        # Only match well-formed entities (&amp; &#39; &#x27;). A looser
        # pattern such as r'&[^;]+;' also deletes ordinary text between a
        # bare '&' and the next ';' (e.g. "AT&T is big; ..." would lose
        # "&T is big;").
        re.compile(r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);'),
        re.compile(r'<[^>]+>'),
    ]

    def prep_input(text):
        r"""
        Pre-processes input text and returns result as string.
        Here simply removes '*', '\', any html tags, and html entities.
        """
        for dr in drop_res:
            text = dr.sub('', text)
        return text

    with open(src_file) as fp:
        for par in fp:
            raw = par.strip()
            if not raw:
                # blank separator line between paragraphs
                continue
            yield (raw, prep_input(raw))
def gen_tts_text_inputs(pars, rate='100%', break_strength='strong',
                        break_time='300ms'):
    """
    Generator function that yields Google TTS input text of proper type for
    each par in `pars` (an iterable that yields (raw, processed) tuples).

    `rate` (a non-negative percentage) controls the speed of speech. It may
    be given as a number (e.g. 90) or as a string with or without a trailing
    percent sign (e.g. '90' or '90%').

    `break_strength` indicates the strength of the prosodic break between pars.
    Valid values are "none", "x-weak", "weak", "medium", "strong", or
    "x-strong"; this function defaults to "strong". The value "none" indicates
    that no prosodic break boundary should be outputted, which can be used to
    prevent a prosodic break which the processor would otherwise produce.

    `break_time` indicates the duration of a pause to be inserted between
    paragraphs in the output in seconds or milliseconds.

    The processed paragraph text is treated as plain text: '&', '<' and '>'
    are XML-escaped before being placed into the SSML document.

    Yields (raw_text, TTS input) tuples.

    See https://www.w3.org/TR/speech-synthesis for details.
    """
    # SSML_TEMPLATE appends the '%' sign itself; strip one supplied by the
    # caller (including this function's own default) so that the attribute
    # does not end up as rate="100%%".
    rate = str(rate).strip().rstrip('%')
    for raw, p in pars:
        ssml = SSML_TEMPLATE.format(text=escape(p), rate=rate,
                                    strength=break_strength,
                                    time=break_time)
        yield (raw, texttospeech.types.SynthesisInput(ssml=ssml))
def synth_text(text_input, tts_client, tts_voice, tts_audio_config):
    """
    Speech synthesizes the TTS input in `text_input`, which must be a tuple
    (raw_text, TTS input) with the TTS input being of the proper Google TTS
    type.

    `tts_client`, `tts_voice` and `tts_audio_config` are the client, voice
    selection and audio configuration passed to the client's
    `synthesize_speech` call.

    Returns a tuple (raw_text, binary string with mp3 audio).
    """
    raw_text, tts_input = text_input
    response = tts_client.synthesize_speech(tts_input, tts_voice,
                                            tts_audio_config)
    # Return the audio payload itself. Serializing the whole response message
    # would prepend protobuf wire-format framing to the audio bytes.
    return (raw_text, response.audio_content)
if __name__ == '__main__':
    # Command-line entry point: synthesize the optional heading and every
    # paragraph of the source file, concatenate the resulting mp3 segments
    # behind an ID3 tag, write them to --out and print the synchronized text
    # list as JSON.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('src', help="""input text file; assumes each lines
            represents a paragraph, with paragraphs separated by an empty line
            (markdown-like)""")
    parser.add_argument('--heading', help="""heading text to prepend to
            output""")
    parser.add_argument('--out', required=True, help="""path to mp3 output
            file""")
    parser.add_argument('--sylt', action='store_true', help="""if specified
            synchronized text info will be stored with the output mp3 file as a
            SYLT ID3 frame""")
    parser.add_argument('--rate', type=int, default=SSML_RATE_DEFAULT,
            help="""rate (speed) of speech in output as a positive integer
            percent value (default: {})""".format(SSML_RATE_DEFAULT))
    parser.add_argument('--break_strength', default=SSML_BREAK_STRENGTH_DEFAULT,
            help="""used to indicate the strength of the prosodic break between
            paragraphs in the speech output; valid values are "none", "x-weak",
            "weak", "medium", "strong", or "x-strong"; default: {}; see
            https://www.w3.org/TR/speech-synthesis""".format(
                SSML_BREAK_STRENGTH_DEFAULT))
    parser.add_argument('--break_time', default=SSML_BREAK_TIME_DEFAULT,
            help="""indicates the duration of a pause to be inserted in the
            output between paragraphs in seconds or milliseconds (e.g. "3s",
            "250ms"); defaults to {}""".format(SSML_BREAK_TIME_DEFAULT))
    parser.add_argument('--threads', type=int, default=DEFAULT_THREADS,
            help="""number of threads to spawn as workers when calling Google
            TTS API; defaults to {}""".format(DEFAULT_THREADS))
    args = parser.parse_args()

    # Fail early with a usage message instead of sending a bad rate to the
    # API or letting ThreadPoolExecutor raise on a non-positive worker count.
    if args.rate <= 0:
        parser.error('--rate must be a positive integer percent value')
    if args.threads < 1:
        parser.error('--threads must be at least 1')

    logging.info('running with parameters: heading=%s, rate=%d, strength=%s, '
                 'time=%s, threads=%d', args.heading, args.rate,
                 args.break_strength, args.break_time, args.threads)

    tts_client = texttospeech.TextToSpeechClient()
    tts_voice = texttospeech.types.VoiceSelectionParams(
        language_code='en-US',
        ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE)
    tts_audio_config = texttospeech.types.AudioConfig(
        audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    # mp3 segments in playback order; joined once when writing the output
    # rather than growing a bytes object with repeated concatenation.
    audio_chunks = []
    # (text, start time in milliseconds) entries, one per spoken segment
    sync_text = []
    # running length of the audio collected so far, in seconds
    elapsed_time = 0

    if args.heading:
        # The heading is plain text from the command line; escape XML
        # metacharacters so that it cannot break the SSML document.
        ssml = SSML_TEMPLATE.format(text=escape(args.heading), rate=args.rate,
                                    strength=args.break_strength,
                                    time=args.break_time)
        text_input = texttospeech.types.SynthesisInput(ssml=ssml)
        response = tts_client.synthesize_speech(text_input, tts_voice,
                                                tts_audio_config)
        # Use the audio payload, not the serialized response message (which
        # carries protobuf framing in front of the mp3 data).
        heading_audio = response.audio_content
        audio_chunks.append(heading_audio)
        sync_text.append((args.heading, 0))
        elapsed_time = MP3(BytesIO(heading_audio)).info.length

    pars = gen_pars(args.src)
    tts_text_inputs = gen_tts_text_inputs(
        pars, rate=args.rate, break_strength=args.break_strength,
        break_time=args.break_time)
    synth_text_configured = partial(
        synth_text, tts_client=tts_client, tts_voice=tts_voice,
        tts_audio_config=tts_audio_config)

    par_count = 0
    with futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
        # executor.map yields results in input order, so the segments and
        # their start times line up with the paragraphs of the source file.
        for raw, sp in executor.map(synth_text_configured, tts_text_inputs):
            sync_text.append((raw, round(elapsed_time * 1000)))
            audio_chunks.append(sp)
            elapsed_time += MP3(BytesIO(sp)).info.length
            par_count += 1
    logging.info('synthesized %d paragraphs', par_count)

    tags = ID3()
    if args.heading:
        tags.add(TIT2(encoding=3, text=args.heading))
    if args.sylt:
        tags.add(SYLT(encoding=3, lang='eng', format=2, type=1,
                      desc='Narrated text', text=sync_text))
    # Render the ID3 tag into memory so it can be written in front of the
    # concatenated audio.
    fp_id3 = BytesIO()
    tags.save(fp_id3)

    logging.info('writing audio to %s', args.out)
    with open(args.out, 'wb') as fp:
        fp.write(fp_id3.getvalue())
        fp.writelines(audio_chunks)
    print(json.dumps(sync_text))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment