ms8r/.gitignore

## .gitignore
*.pyc
*~
*.log
*.mp3
archive
non-git
.ipynb_checkpoints
ipython_log.py
gce_cred.json
*.md
ipython_log.py

## tts.py
#!/usr/bin/env python

"""
Synthesizes text input file to speech in an mp3 file. Uses Google
Text-to-Speech API. Authenticate by setting environment variable
GOOGLE_APPLICATION_CREDENTIALS to path to credentials JSON file.
Run script with -h option to see more details.
"""

import sys
import re
from io import BytesIO
import json
from concurrent import futures
from functools import partial
import argparse
import logging

from google.cloud import texttospeech_v1beta1 as texttospeech
from mutagen.mp3 import MP3
from mutagen.id3 import ID3, TIT2, SYLT


# Log at INFO level and above, formatted as "timestamp : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO)


# Speech rate (speed) as percentage value; default of the --rate option.
SSML_RATE_DEFAULT = 90

# Defaults of the --break_strength and --break_time options. Both values end
# up in the <break> element of SSML_TEMPLATE.
SSML_BREAK_STRENGTH_DEFAULT = 'medium'
SSML_BREAK_TIME_DEFAULT = '300ms'

# Default of the --threads option (size of the worker thread pool).
DEFAULT_THREADS = 8

# SSML document wrapped around every piece of text that is synthesized.
# Filled in via str.format() with the fields `rate`, `text`, `strength` and
# `time`. Note that the template appends the '%' sign to `rate` itself, so
# `rate` is expected to be a bare number.
SSML_TEMPLATE = """\
<speak>
  <prosody rate="{rate}%">
    {text}
  </prosody>
  <break strength="{strength}" time="{time}" />
</speak>"""


def gen_pars(src_file):
    """
    Generator function that iterates over the lines in `src_file`, skipping
    empty (whitespace-only) lines. Assumes that each line represents a
    paragraph of text, with paragraphs being separated by empty lines.

    Yields tuples (raw_text, processed_text): `raw_text` is the line with
    surrounding whitespace stripped, `processed_text` is `raw_text` after
    being passed through `prep_input`.
    """
    from xml.sax.saxutils import escape

    # Applied in this order; compiled once per call rather than once per
    # paragraph. The entity pattern only matches real entity syntax (named,
    # decimal or hex) so that ordinary text between a stray '&' and some
    # later ';' (e.g. "AT&T rocks; really") is no longer deleted.
    drop_res = [re.compile(pattern) for pattern in (
        r'[*\\]',
        r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);',
        r'<[^>]+>',
    )]

    def prep_input(text):
        """
        Pre-processes input text and returns the result as a string.
        Removes '*' and backslash characters, html entities and html tags,
        then XML-escapes any '&', '<' or '>' that is left over so the text
        can be embedded in an SSML (XML) document without breaking it.
        """
        for dr in drop_res:
            text = dr.sub('', text)
        return escape(text)

    with open(src_file) as fp:
        for par in fp:
            raw = par.strip()
            if raw:
                yield (raw, prep_input(raw))


def gen_tts_text_inputs(pars, rate='100%', break_strength='strong',
        break_time='300ms'):
    """
    Generator function that yields Google TTS input text of proper type for
    each par in `pars` (an iterable that yields (raw, processed) tuples).

    `rate` (a non-negative percentage) controls the speed of speech. It may
    be given as a number (90) or as a string with or without a trailing
    percent sign ('90', '90%').

    `break_strength` indicates the strength of the prosodic break between pars.
    Valid values are "none", "x-weak", "weak", "medium", "strong" (default
    value), or "x-strong". The value "none" indicates that no prosodic break
    boundary should be outputted, which can be used to prevent a prosodic break
    which the processor would otherwise produce.

    `break_time` indicates the duration of a pause to be inserted between
    paragraphs in the output in seconds or milliseconds.

    Yields (raw_text, TTS input) tuples.

    See https://www.w3.org/TR/speech-synthesis for details.
    """
    # SSML_TEMPLATE appends the '%' sign itself; strip one supplied by the
    # caller (as in the default value) so we never emit rate="100%%".
    rate = str(rate).strip().rstrip('% ')
    for raw, p in pars:
        ssml = SSML_TEMPLATE.format(text=p, rate=rate, strength=break_strength,
                time=break_time)
        yield (raw, texttospeech.types.SynthesisInput(ssml=ssml))


def synth_text(text_input, tts_client, tts_voice, tts_audio_config):
    """
    Speech synthesizes the TTS input in `text_input`, which must be a tuple
    (raw_text, TTS input) with the TTS input being of the proper Google TTS
    type.

    `tts_client`, `tts_voice` and `tts_audio_config` are the Google TTS
    client, voice selection and audio configuration used for the request.

    Returns a tuple (raw_text, bytes with the synthesized audio).
    """
    raw_text, tts_input = text_input
    response = tts_client.synthesize_speech(tts_input, tts_voice,
            tts_audio_config)
    # The audio payload lives in the `audio_content` field of the response.
    # SerializeToString() would instead return the protobuf wire encoding of
    # the whole response message (field tag and length prefix included).
    return (raw_text, response.audio_content)


# Script entry point: parse the command line, synthesize the optional heading
# and every paragraph of the input file, then write one mp3 file (ID3 tags
# followed by the concatenated audio) and print the sync info as JSON.
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('src', help="""input text file; assumes each lines
            represents a paragraph, with paragraphs separated by an empty line
            (markdown-like)""")
    parser.add_argument('--heading', help="""heading text to prepend to
            output""")
    parser.add_argument('--out', required=True, help="""path to mp3 output
            file""")
    parser.add_argument('--sylt', action='store_true', help="""if specified
            synchronized text info will be stored with the output mp3 file as a
            SYLT ID3 frame""")
    parser.add_argument('--rate', type=int, default=SSML_RATE_DEFAULT,
            help="""rate (speed) of speech in output as a positive integer
            percent value (default: {})""".format(SSML_RATE_DEFAULT))
    parser.add_argument('--break_strength', default=SSML_BREAK_STRENGTH_DEFAULT,
            help="""used to indicate the strength of the prosodic break between
            paragraphs in the speech output; valid values are "none", "x-weak",
            "weak", "medium", "strong", or "x-strong"; default: {}; see
            https://www.w3.org/TR/speech-synthesis""".format(
                SSML_BREAK_STRENGTH_DEFAULT))
    parser.add_argument('--break_time', default=SSML_BREAK_TIME_DEFAULT,
            help="""indicates the duration of a pause to be inserted in the
            output between paragraphs in seconds or milliseconds (e.g. "3s",
            "250ms"); defaults to {}""".format(SSML_BREAK_TIME_DEFAULT))
    parser.add_argument('--threads', type=int, default=DEFAULT_THREADS,
            help="""number of threads to spawn as workers when calling Google
            TTS API; defaults to {}""".format(DEFAULT_THREADS))

    args = parser.parse_args()

    logging.info('running with parameters: heading=%s, rate=%d, strength=%s, '
                 'time=%s, threads=%d', args.heading, args.rate,
                 args.break_strength, args.break_time, args.threads)

    tts_client = texttospeech.TextToSpeechClient()
    tts_voice = texttospeech.types.VoiceSelectionParams(
            language_code='en-US',
            ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE)
    tts_audio_config = texttospeech.types.AudioConfig(
            audio_encoding=texttospeech.enums.AudioEncoding.MP3)

    # Audio is collected as a list of chunks and written out once at the end;
    # repeatedly extending one bytes object copies all previous audio on
    # every paragraph.
    audio_chunks = []
    # List of (text, start time in ms) tuples, one per synthesized chunk.
    sync_text = []
    # Running length in seconds of all audio collected so far.
    elapsed_time = 0

    if args.heading:
        ssml = SSML_TEMPLATE.format(text=args.heading, rate=args.rate,
                strength=args.break_strength, time=args.break_time)
        text_input = texttospeech.types.SynthesisInput(ssml=ssml)
        response = tts_client.synthesize_speech(text_input, tts_voice,
                tts_audio_config)
        # Take the audio payload itself, not the serialized protobuf message.
        heading_audio = response.audio_content
        audio_chunks.append(heading_audio)
        sync_text.append((args.heading, 0))
        elapsed_time = MP3(BytesIO(heading_audio)).info.length

    pars = gen_pars(args.src)
    tts_text_inputs = gen_tts_text_inputs(pars, rate=args.rate,
            break_strength=args.break_strength, break_time=args.break_time)

    synth_text_configured = partial(synth_text,
            tts_client=tts_client, tts_voice=tts_voice,
            tts_audio_config=tts_audio_config)

    par_count = 0
    with futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
        # executor.map() returns results in input order, so paragraphs are
        # appended in the order they appear in the source file.
        for raw, sp in executor.map(synth_text_configured, tts_text_inputs):
            sync_text.append((raw, round(elapsed_time * 1000)))
            audio_chunks.append(sp)
            elapsed_time += MP3(BytesIO(sp)).info.length
            par_count += 1

    logging.info('synthesized %d paragraphs', par_count)

    tags = ID3()
    if args.heading:
        tags.add(TIT2(encoding=3, text=args.heading))
    if args.sylt:
        # format=2: time stamps are in milliseconds; type=1: lyrics.
        tags.add(SYLT(encoding=3, lang='eng', format=2, type=1,
                 desc='Narrated text', text=sync_text))
    fp_id3 = BytesIO()
    tags.save(fp_id3)

    logging.info('writing audio to %s', args.out)
    with open(args.out, 'wb') as fp:
        # ID3 tag block first, then the audio chunks back to back.
        fp.write(fp_id3.getvalue())
        fp.writelines(audio_chunks)

    print(json.dumps(sync_text))
	*.pyc
	*~
	*.log
	*.mp3
	archive
	non-git
	.ipynb_checkpoints
	ipython_log.py
	gce_cred.json
	*.md
	ipython_log.py
	#!/usr/bin/env python

	"""
	Synthesizes text input file to speech in an mp3 file. Uses Google
	Text-to-Speech API. Authenticate by setting environment variable
	GOOGLE_APPLICATION_CREDENTIALS to path to credentials JSON file.
	Run script with -h option to see more details.
	"""

	import sys
	import re
	from io import BytesIO
	import json
	from concurrent import futures
	from functools import partial
	import argparse
	import logging

	from google.cloud import texttospeech_v1beta1 as texttospeech
	from mutagen.mp3 import MP3
	from mutagen.id3 import ID3, TIT2, SYLT


	# Log at INFO level and above, formatted as "timestamp : level : message".
	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
	level=logging.INFO)


	# Speech rate (speed) as percentage value; default of the --rate option.
	SSML_RATE_DEFAULT = 90

	# Defaults of the --break_strength and --break_time options. Both values
	# end up in the <break> element of SSML_TEMPLATE.
	SSML_BREAK_STRENGTH_DEFAULT = 'medium'
	SSML_BREAK_TIME_DEFAULT = '300ms'

	# Default of the --threads option (size of the worker thread pool).
	DEFAULT_THREADS = 8

	# SSML document wrapped around every piece of text that is synthesized.
	# Filled in via str.format() with the fields `rate`, `text`, `strength`
	# and `time`. Note that the template appends the '%' sign to `rate`
	# itself, so `rate` is expected to be a bare number.
	SSML_TEMPLATE = """\
	<speak>
	<prosody rate="{rate}%">
	{text}
	</prosody>
	<break strength="{strength}" time="{time}" />
	</speak>"""


	def gen_pars(src_file):
		"""
		Generator function that iterates over the lines in `src_file`,
		skipping empty (whitespace-only) lines. Assumes that each line
		represents a paragraph of text, with paragraphs being separated by
		empty lines.

		Yields tuples (raw_text, processed_text): `raw_text` is the line with
		surrounding whitespace stripped, `processed_text` is `raw_text` after
		being passed through `prep_input`.
		"""
		from xml.sax.saxutils import escape

		# Applied in this order; compiled once per call rather than once per
		# paragraph. The entity pattern only matches real entity syntax
		# (named, decimal or hex) so that ordinary text between a stray '&'
		# and some later ';' (e.g. "AT&T rocks; really") is no longer deleted.
		drop_res = [re.compile(pattern) for pattern in (
			r'[*\\]',
			r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);',
			r'<[^>]+>',
		)]

		def prep_input(text):
			"""
			Pre-processes input text and returns the result as a string.
			Removes '*' and backslash characters, html entities and html
			tags, then XML-escapes any '&', '<' or '>' that is left over so
			the text can be embedded in an SSML (XML) document.
			"""
			for dr in drop_res:
				text = dr.sub('', text)
			return escape(text)

		with open(src_file) as fp:
			for par in fp:
				raw = par.strip()
				if raw:
					yield (raw, prep_input(raw))


	def gen_tts_text_inputs(pars, rate='100%', break_strength='strong',
			break_time='300ms'):
		"""
		Generator function that yields Google TTS input text of proper type
		for each par in `pars` (an iterable that yields (raw, processed)
		tuples).

		`rate` (a non-negative percentage) controls the speed of speech. It
		may be given as a number (90) or as a string with or without a
		trailing percent sign ('90', '90%').

		`break_strength` indicates the strength of the prosodic break between
		pars. Valid values are "none", "x-weak", "weak", "medium", "strong"
		(default value), or "x-strong". The value "none" indicates that no
		prosodic break boundary should be outputted, which can be used to
		prevent a prosodic break which the processor would otherwise produce.

		`break_time` indicates the duration of a pause to be inserted between
		paragraphs in the output in seconds or milliseconds.

		Yields (raw_text, TTS input) tuples.

		See https://www.w3.org/TR/speech-synthesis for details.
		"""
		# SSML_TEMPLATE appends the '%' sign itself; strip one supplied by the
		# caller (as in the default value) so we never emit rate="100%%".
		rate = str(rate).strip().rstrip('% ')
		for raw, p in pars:
			ssml = SSML_TEMPLATE.format(text=p, rate=rate,
					strength=break_strength, time=break_time)
			yield (raw, texttospeech.types.SynthesisInput(ssml=ssml))



	def synth_text(text_input, tts_client, tts_voice, tts_audio_config):
		"""
		Speech synthesizes the TTS input in `text_input`, which must be a
		tuple (raw_text, TTS input) with the TTS input being of the proper
		Google TTS type.

		`tts_client`, `tts_voice` and `tts_audio_config` are the Google TTS
		client, voice selection and audio configuration used for the request.

		Returns a tuple (raw_text, bytes with the synthesized audio).
		"""
		raw_text, tts_input = text_input
		response = tts_client.synthesize_speech(tts_input, tts_voice,
				tts_audio_config)
		# The audio payload lives in the `audio_content` field of the
		# response. SerializeToString() would instead return the protobuf
		# wire encoding of the whole response message.
		return (raw_text, response.audio_content)


	# Script entry point: parse the command line, synthesize the optional
	# heading and every paragraph of the input file, then write one mp3 file
	# (ID3 tags followed by the concatenated audio) and print the sync info
	# as JSON.
	if __name__ == '__main__':

		parser = argparse.ArgumentParser(description=__doc__)
		parser.add_argument('src', help="""input text file; assumes each lines
	represents a paragraph, with paragraphs separated by an empty line
	(markdown-like)""")
		parser.add_argument('--heading', help="""heading text to prepend to
	output""")
		parser.add_argument('--out', required=True, help="""path to mp3 output
	file""")
		parser.add_argument('--sylt', action='store_true', help="""if specified
	synchronized text info will be stored with the output mp3 file as a
	SYLT ID3 frame""")
		parser.add_argument('--rate', type=int, default=SSML_RATE_DEFAULT,
				help="""rate (speed) of speech in output as a positive integer
	percent value (default: {})""".format(SSML_RATE_DEFAULT))
		parser.add_argument('--break_strength',
				default=SSML_BREAK_STRENGTH_DEFAULT,
				help="""used to indicate the strength of the prosodic break between
	paragraphs in the speech output; valid values are "none", "x-weak",
	"weak", "medium", "strong", or "x-strong"; default: {}; see
	https://www.w3.org/TR/speech-synthesis""".format(
					SSML_BREAK_STRENGTH_DEFAULT))
		parser.add_argument('--break_time', default=SSML_BREAK_TIME_DEFAULT,
				help="""indicates the duration of a pause to be inserted in the
	output between paragraphs in seconds or milliseconds (e.g. "3s",
	"250ms"); defaults to {}""".format(SSML_BREAK_TIME_DEFAULT))
		parser.add_argument('--threads', type=int, default=DEFAULT_THREADS,
				help="""number of threads to spawn as workers when calling Google
	TTS API; defaults to {}""".format(DEFAULT_THREADS))

		args = parser.parse_args()

		logging.info('running with parameters: heading=%s, rate=%d, strength=%s, '
				'time=%s, threads=%d', args.heading, args.rate,
				args.break_strength, args.break_time, args.threads)

		tts_client = texttospeech.TextToSpeechClient()
		tts_voice = texttospeech.types.VoiceSelectionParams(
				language_code='en-US',
				ssml_gender=texttospeech.enums.SsmlVoiceGender.FEMALE)
		tts_audio_config = texttospeech.types.AudioConfig(
				audio_encoding=texttospeech.enums.AudioEncoding.MP3)

		# Audio is collected as a list of chunks and written out once at the
		# end; repeatedly extending one bytes object copies all previous
		# audio on every paragraph.
		audio_chunks = []
		# List of (text, start time in ms) tuples, one per synthesized chunk.
		sync_text = []
		# Running length in seconds of all audio collected so far.
		elapsed_time = 0

		if args.heading:
			ssml = SSML_TEMPLATE.format(text=args.heading, rate=args.rate,
					strength=args.break_strength, time=args.break_time)
			text_input = texttospeech.types.SynthesisInput(ssml=ssml)
			response = tts_client.synthesize_speech(text_input, tts_voice,
					tts_audio_config)
			# Take the audio payload itself, not the serialized protobuf
			# message.
			heading_audio = response.audio_content
			audio_chunks.append(heading_audio)
			sync_text.append((args.heading, 0))
			elapsed_time = MP3(BytesIO(heading_audio)).info.length

		pars = gen_pars(args.src)
		tts_text_inputs = gen_tts_text_inputs(pars, rate=args.rate,
				break_strength=args.break_strength,
				break_time=args.break_time)

		synth_text_configured = partial(synth_text,
				tts_client=tts_client, tts_voice=tts_voice,
				tts_audio_config=tts_audio_config)

		par_count = 0
		with futures.ThreadPoolExecutor(max_workers=args.threads) as executor:
			# executor.map() returns results in input order, so paragraphs
			# are appended in the order they appear in the source file.
			for raw, sp in executor.map(synth_text_configured,
					tts_text_inputs):
				sync_text.append((raw, round(elapsed_time * 1000)))
				audio_chunks.append(sp)
				elapsed_time += MP3(BytesIO(sp)).info.length
				par_count += 1

		logging.info('synthesized %d paragraphs', par_count)

		tags = ID3()
		if args.heading:
			tags.add(TIT2(encoding=3, text=args.heading))
		if args.sylt:
			# format=2: time stamps are in milliseconds; type=1: lyrics.
			tags.add(SYLT(encoding=3, lang='eng', format=2, type=1,
					desc='Narrated text', text=sync_text))
		fp_id3 = BytesIO()
		tags.save(fp_id3)

		logging.info('writing audio to %s', args.out)
		with open(args.out, 'wb') as fp:
			# ID3 tag block first, then the audio chunks back to back.
			fp.write(fp_id3.getvalue())
			fp.writelines(audio_chunks)

		print(json.dumps(sync_text))