# jarmitage/latex_to_speech.py

## latex_to_speech.py
# super rough code!
# only supports simple tex + ssml
# requires sox for audio concatenation

# %autoawait - if in .ipynb, this is needed for async to work

import pypandoc
import html
import re
import os
import sys
from IPython.core.display import HTML
from google.cloud import texttospeech
import sox
from html.parser import HTMLParser
from html.entities import name2codepoint

def html_to_ssml(_html):
    """Rewrite HTML (e.g. pypandoc output) into SSML-flavoured markup.

    Newlines are removed, a fixed set of HTML tags is mapped onto SSML
    ``<emphasis>`` / ``<break>`` elements or dropped, and citation spans,
    footnote sections, superscripts and link tags are stripped.

    Args:
        _html: HTML source as a string.

    Returns:
        The rewritten markup as a string. It is not wrapped in ``<speak>``.
    """
    # Literal substitutions, applied strictly in this order (for example the
    # empty '<p><br /></p>' paragraph must go before '</p>' gains a break).
    literal_rules = (
        ('\n', ''),
        ('<p><br /></p>', ''),
        ('</a>', ''),
        ('<em>', '<emphasis level="strong">'),
        ('</em>', '</emphasis><break time="150ms"/>'),
        # Headings keep whatever attributes follow the tag name.
        ('<h1', '<emphasis level="strong"'),
        ('</h1>', '</emphasis><break time="2000ms"/>'),
        ('<h2', '<emphasis level="strong"'),
        ('</h2>', '</emphasis><break time="1500ms"/>'),
        ('<h3', '<emphasis level="strong"'),
        ('</h3>', '</emphasis><break time="1200ms"/>'),
        ('</p>', '</p><break time="750ms"/>'),
        ('<ul>', ''),
        ('</ul>', ''),
        ('<li>', ''),
        ('</li>', '<break time="200ms"/>'),
        ('<strong>', '<emphasis level="strong">'),
        ('</strong>', '</emphasis><break time="150ms"/>'),
    )
    tmp = _html
    for old, new in literal_rules:
        tmp = tmp.replace(old, new)

    tmp = re.sub(r'<span.*?span>', '', tmp)  # spans (citations)
    tmp = re.sub(r'<section.*?section>', '', tmp)  # footnotes
    tmp = re.sub(r'<sup.*?sup>', '', tmp)  # superscripts
    # The word boundary restricts this to real <a ...> tags; without it any
    # tag starting with "a" (<abbr>, <aside>, SSML <audio>) was deleted too.
    tmp = re.sub(r'<a\b[^>]*>', '', tmp)  # links
    return tmp

class SSMLParser(HTMLParser):
    """Re-emit SSML markup as a list of size-limited, self-contained batches.

    Markup fed to the parser is copied into ``self.content``, keeping only
    the ``level`` and ``time`` attributes of each start tag. When adding a
    text node would bring the current batch to ``self.interval`` characters
    or more, the batch is closed (open elements are closed innermost-first),
    stored in ``self.batches``, and a new batch is started with the same
    elements reopened around the text that did not fit.

    Every batch is wrapped in ``self.start`` / ``self.end``.

    Limitation: only text nodes trigger the size check, and a text node is
    never split, so a single text node longer than ``self.interval`` still
    produces an oversized batch.
    """

    def __init__(self):
        super().__init__()
        # Wrapper placed around every batch.
        self.start = '<speak><prosody rate="125%" pitch="+2st">'
        self.end = '</prosody></speak>'
        # Characters of text added to the current batch (bookkeeping only;
        # the size check itself uses len(self.content)).
        self.count = len(self.start)
        # Exclusive upper bound on batch length, in characters.
        # NOTE(review): presumably chosen to stay below the TTS request size
        # limit - confirm against the service quota.
        self.interval = 4500
        self.content = self.start  # batch currently being built
        self.batches = []  # finished batches
        self.open_tags = []  # names of the open elements, outermost first
        self._open_markup = []  # start-tag text for each open_tags entry
        self.dbg = False  # print msgs

    def handle_starttag(self, tag, attrs):
        """Copy a start tag into the batch, keeping only level/time attrs."""
        if self.dbg:
            print("Open", tag)
        # Attributes given without a value arrive as None and are skipped;
        # values are re-escaped because HTMLParser hands them over unescaped.
        kept = ''.join(
            ' ' + name + '="' + html.escape(value) + '"'
            for name, value in attrs
            if name in ('level', 'time') and value is not None
        )
        markup = '<' + tag + kept + '>'
        self.open_tags.append(tag)
        self._open_markup.append(markup)
        self.content += markup

    def handle_endtag(self, tag):
        """Close the innermost open element named ``tag``; ignore strays."""
        if tag in self.open_tags:
            # Index of the last (innermost) occurrence of this tag name.
            idx = len(self.open_tags) - 1 - self.open_tags[::-1].index(tag)
            del self.open_tags[idx]
            del self._open_markup[idx]
            self.content += '</' + tag + '>'
        elif self.dbg:
            print('Error: end tag not found in self.open_tags:', tag)
        if self.dbg:
            print("Close", tag)

    def handle_data(self, data):
        """Append a text node, starting a new batch if it would not fit."""
        # The parser delivers text with character references already
        # decoded; escape it again so '&', '<' and '>' stay valid in SSML.
        text = html.escape(data, quote=False)
        if len(self.content) + len(text) + len(self.end) < self.interval:
            self.count += len(text)
            self.content += text
            if self.dbg:
                print('Data: ('+str(len(self.content))+')\n', data,'\n')
        else:
            self.handle_batch(text)

    def handle_batch(self, data):
        """Finish the current batch and start a new one containing ``data``.

        ``data`` is appended verbatim to the new batch, after the start tags
        of every element that was still open.
        """
        if self.dbg:
            print('handle_batch() tags still open:', self.open_tags)
        # Close innermost-first so the finished batch is well nested.
        closing = ''.join('</' + tag + '>' for tag in reversed(self.open_tags))
        # Reopen with the original start-tag text so attributes survive.
        leftover = ''.join(self._open_markup) + data
        self.content += closing + self.end
        self.batches.append(self.content)
        self.count = 0
        if self.dbg:
            print('handle_batch() adding batch\n\n', self.content,'\n\nleftover:',leftover)
        self.content = self.start + leftover

    def handle_comment(self, data):
        """Report an HTML comment; it is not copied into the batch."""
        print("Comment  :", data)

    # The two reference handlers below are only invoked by HTMLParser when
    # convert_charrefs is disabled. With the default used by __init__,
    # references reach handle_data already decoded.

    def handle_entityref(self, name):
        """Report a named character reference such as ``&amp;``."""
        c = chr(name2codepoint[name])
        print("Named ent:", c)

    def handle_charref(self, name):
        """Report a numeric character reference such as ``&#62;``."""
        if name.startswith('x'):
            c = chr(int(name[1:], 16))
        else:
            c = chr(int(name))
        print("Num ent  :", c)

    def handle_decl(self, data):
        """Report a declaration such as a doctype; it is not copied."""
        print("Decl     :", data)

def batch_ssml(ssml):
    """Split SSML-flavoured markup into batches using ``SSMLParser``.

    Args:
        ssml: markup string, e.g. the output of ``html_to_ssml``.

    Returns:
        The parser's list of batch strings. The final batch is always
        emitted, so the list contains at least one entry.
    """
    parser = SSMLParser()
    parser.feed(ssml)
    # HTMLParser may hold back trailing text (for instance text ending in an
    # unterminated character reference) until close() is called; flush it so
    # it is not lost from the final batch.
    parser.close()
    parser.handle_batch('')  # turn leftover content into final batch
    return parser.batches

async def render_batches_tmp(batches, tmp_dir):
    """Render each SSML batch to its own temporary MP3 file, in order.

    Args:
        batches: sequence of SSML strings.
        tmp_dir: path prefix for the temporary files. It is concatenated
            verbatim, so it needs its own trailing separator.

    Returns:
        List of the written file paths (``<tmp_dir>tmp_<i>.mp3``).
    """
    rendered = []
    for index, ssml in enumerate(batches):
        path = tmp_dir + 'tmp_%d.mp3' % index
        ssml_to_audio(ssml, path)
        rendered.append(path)
        print('Rendered batch', str(index + 1), '/', str(len(batches)))
    return rendered

async def render_batches_to_mp3(batches, filename='out'):
    """Render SSML batches and join them into ``mp3/<filename>.mp3``.

    Each batch is rendered to a temporary MP3 through ``render_batches_tmp``.
    The parts are then concatenated with sox and the temporary files removed.

    Args:
        batches: list of SSML strings.
        filename: base name (without extension) of the combined output file.

    Returns:
        None.
    """
    tmp_dir = 'mp3/'
    if not batches:
        # Nothing to synthesize; avoid calling sox with an empty input list.
        print('No batches to render.')
        return
    # The per-batch files are opened for writing inside tmp_dir, so it has
    # to exist before rendering starts.
    os.makedirs(tmp_dir, exist_ok=True)
    print('About to render',str(len(batches)),'batches...')
    files = await render_batches_tmp(batches, tmp_dir)
    target = tmp_dir+filename+'.mp3'
    if len(files) == 1:
        # pysox's Combiner requires at least two input files, so a lone
        # rendered batch is simply moved into place.
        os.replace(files[0], target)
        print('Finished!')
        return
    print('Finished rendering batches:',files,' concatenating...')
    cbn = sox.Combiner()
    cbn.build(files, target, 'concatenate')
    print('Finished concatenating, deleting tmp files...')
    for f in files:
        os.remove(f)
    print('Finished!')

def ssml_to_audio(ssml_text, outfile):
    """Synthesize SSML with the Text-to-Speech API and save it as MP3.

    Sends ``ssml_text`` to ``texttospeech.TextToSpeechClient`` using the
    ``en-US-Wavenet-C`` voice and MP3 encoding, then writes the returned
    audio bytes to ``outfile``.

    Args:
        ssml_text: string of SSML text.
        outfile: name of the file under which to save the audio output.

    Returns:
        None.
    """
    tts_client = texttospeech.TextToSpeechClient()

    # Request pieces: what to say, which voice to use, which format to return.
    request_input = texttospeech.SynthesisInput(ssml=ssml_text)
    wavenet_voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-Wavenet-C",
        ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
    )
    mp3_output = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    result = tts_client.synthesize_speech(
        input=request_input,
        voice=wavenet_voice,
        audio_config=mp3_output,
    )

    # The response carries the encoded audio as raw bytes.
    with open(outfile, "wb") as audio_file:
        audio_file.write(result.audio_content)
        print("Audio content written to file " + outfile)

output_filename = 'chapter_01'
base_path = '/tex/path/'
tmp_tex = base_path+'tts.tex'
tmp_html = pypandoc.convert_file(tmp_tex, 'html')
tmp_ssml = html_to_ssml(tmp_html)
tmp_ssml_batches = batch_ssml(tmp_ssml)

await render_batches_to_mp3(tmp_ssml_batches, output_filename)
	# super rough code!
	# only supports simple tex + ssml
	# requires sox for audio concatenation

	# %autoawait - if in .ipynb, this is needed for async to work

	import pypandoc
	import html
	import re
	import os
	import sys
	from IPython.core.display import HTML
	from google.cloud import texttospeech
	import sox
	from html.parser import HTMLParser
	from html.entities import name2codepoint

	def html_to_ssml(_html):
	    """Rewrite HTML (e.g. pypandoc output) into SSML-flavoured markup.

	    Newlines are removed, a fixed set of HTML tags is mapped onto SSML
	    ``<emphasis>`` / ``<break>`` elements or dropped, and citation spans,
	    footnote sections, superscripts and link tags are stripped.

	    Args:
	        _html: HTML source as a string.

	    Returns:
	        The rewritten markup as a string. It is not wrapped in ``<speak>``.
	    """
	    # Literal substitutions, applied strictly in this order (for example
	    # the empty '<p><br /></p>' paragraph must go before '</p>' gains a
	    # break).
	    literal_rules = (
	        ('\n', ''),
	        ('<p><br /></p>', ''),
	        ('</a>', ''),
	        ('<em>', '<emphasis level="strong">'),
	        ('</em>', '</emphasis><break time="150ms"/>'),
	        # Headings keep whatever attributes follow the tag name.
	        ('<h1', '<emphasis level="strong"'),
	        ('</h1>', '</emphasis><break time="2000ms"/>'),
	        ('<h2', '<emphasis level="strong"'),
	        ('</h2>', '</emphasis><break time="1500ms"/>'),
	        ('<h3', '<emphasis level="strong"'),
	        ('</h3>', '</emphasis><break time="1200ms"/>'),
	        ('</p>', '</p><break time="750ms"/>'),
	        ('<ul>', ''),
	        ('</ul>', ''),
	        ('<li>', ''),
	        ('</li>', '<break time="200ms"/>'),
	        ('<strong>', '<emphasis level="strong">'),
	        ('</strong>', '</emphasis><break time="150ms"/>'),
	    )
	    tmp = _html
	    for old, new in literal_rules:
	        tmp = tmp.replace(old, new)

	    tmp = re.sub(r'<span.*?span>', '', tmp)  # spans (citations)
	    tmp = re.sub(r'<section.*?section>', '', tmp)  # footnotes
	    tmp = re.sub(r'<sup.*?sup>', '', tmp)  # superscripts
	    # The word boundary restricts this to real <a ...> tags; without it
	    # any tag starting with "a" (<abbr>, <aside>, SSML <audio>) was
	    # deleted too.
	    tmp = re.sub(r'<a\b[^>]*>', '', tmp)  # links
	    return tmp

	class SSMLParser(HTMLParser):
	    """Re-emit SSML markup as a list of size-limited, self-contained batches.

	    Markup fed to the parser is copied into ``self.content``, keeping only
	    the ``level`` and ``time`` attributes of each start tag. When adding a
	    text node would bring the current batch to ``self.interval`` characters
	    or more, the batch is closed (open elements are closed innermost-first),
	    stored in ``self.batches``, and a new batch is started with the same
	    elements reopened around the text that did not fit.

	    Every batch is wrapped in ``self.start`` / ``self.end``.

	    Limitation: only text nodes trigger the size check, and a text node is
	    never split, so a single text node longer than ``self.interval`` still
	    produces an oversized batch.
	    """

	    def __init__(self):
	        super().__init__()
	        # Wrapper placed around every batch.
	        self.start = '<speak><prosody rate="125%" pitch="+2st">'
	        self.end = '</prosody></speak>'
	        # Characters of text added to the current batch (bookkeeping
	        # only; the size check itself uses len(self.content)).
	        self.count = len(self.start)
	        # Exclusive upper bound on batch length, in characters.
	        # NOTE(review): presumably chosen to stay below the TTS request
	        # size limit - confirm against the service quota.
	        self.interval = 4500
	        self.content = self.start  # batch currently being built
	        self.batches = []  # finished batches
	        self.open_tags = []  # names of the open elements, outermost first
	        self._open_markup = []  # start-tag text for each open_tags entry
	        self.dbg = False  # print msgs

	    def handle_starttag(self, tag, attrs):
	        """Copy a start tag into the batch, keeping only level/time attrs."""
	        if self.dbg:
	            print("Open", tag)
	        # Attributes given without a value arrive as None and are skipped;
	        # values are re-escaped because HTMLParser hands them over
	        # unescaped.
	        kept = ''.join(
	            ' ' + name + '="' + html.escape(value) + '"'
	            for name, value in attrs
	            if name in ('level', 'time') and value is not None
	        )
	        markup = '<' + tag + kept + '>'
	        self.open_tags.append(tag)
	        self._open_markup.append(markup)
	        self.content += markup

	    def handle_endtag(self, tag):
	        """Close the innermost open element named ``tag``; ignore strays."""
	        if tag in self.open_tags:
	            # Index of the last (innermost) occurrence of this tag name.
	            idx = len(self.open_tags) - 1 - self.open_tags[::-1].index(tag)
	            del self.open_tags[idx]
	            del self._open_markup[idx]
	            self.content += '</' + tag + '>'
	        elif self.dbg:
	            print('Error: end tag not found in self.open_tags:', tag)
	        if self.dbg:
	            print("Close", tag)

	    def handle_data(self, data):
	        """Append a text node, starting a new batch if it would not fit."""
	        # The parser delivers text with character references already
	        # decoded; escape it again so '&', '<' and '>' stay valid in SSML.
	        text = html.escape(data, quote=False)
	        if len(self.content) + len(text) + len(self.end) < self.interval:
	            self.count += len(text)
	            self.content += text
	            if self.dbg:
	                print('Data: ('+str(len(self.content))+')\n', data,'\n')
	        else:
	            self.handle_batch(text)

	    def handle_batch(self, data):
	        """Finish the current batch and start a new one containing ``data``.

	        ``data`` is appended verbatim to the new batch, after the start
	        tags of every element that was still open.
	        """
	        if self.dbg:
	            print('handle_batch() tags still open:', self.open_tags)
	        # Close innermost-first so the finished batch is well nested.
	        closing = ''.join('</' + tag + '>' for tag in reversed(self.open_tags))
	        # Reopen with the original start-tag text so attributes survive.
	        leftover = ''.join(self._open_markup) + data
	        self.content += closing + self.end
	        self.batches.append(self.content)
	        self.count = 0
	        if self.dbg:
	            print('handle_batch() adding batch\n\n', self.content,'\n\nleftover:',leftover)
	        self.content = self.start + leftover

	    def handle_comment(self, data):
	        """Report an HTML comment; it is not copied into the batch."""
	        print("Comment :", data)

	    # The two reference handlers below are only invoked by HTMLParser when
	    # convert_charrefs is disabled. With the default used by __init__,
	    # references reach handle_data already decoded.

	    def handle_entityref(self, name):
	        """Report a named character reference such as ``&amp;``."""
	        c = chr(name2codepoint[name])
	        print("Named ent:", c)

	    def handle_charref(self, name):
	        """Report a numeric character reference such as ``&#62;``."""
	        if name.startswith('x'):
	            c = chr(int(name[1:], 16))
	        else:
	            c = chr(int(name))
	        print("Num ent :", c)

	    def handle_decl(self, data):
	        """Report a declaration such as a doctype; it is not copied."""
	        print("Decl :", data)

	def batch_ssml(ssml):
	    """Split SSML-flavoured markup into batches using ``SSMLParser``.

	    Args:
	        ssml: markup string, e.g. the output of ``html_to_ssml``.

	    Returns:
	        The parser's list of batch strings. The final batch is always
	        emitted, so the list contains at least one entry.
	    """
	    parser = SSMLParser()
	    parser.feed(ssml)
	    # HTMLParser may hold back trailing text (for instance text ending in
	    # an unterminated character reference) until close() is called; flush
	    # it so it is not lost from the final batch.
	    parser.close()
	    parser.handle_batch('')  # turn leftover content into final batch
	    return parser.batches

	async def render_batches_tmp(batches, tmp_dir):
	    """Render each SSML batch to its own temporary MP3 file, in order.

	    Args:
	        batches: sequence of SSML strings.
	        tmp_dir: path prefix for the temporary files. It is concatenated
	            verbatim, so it needs its own trailing separator.

	    Returns:
	        List of the written file paths (``<tmp_dir>tmp_<i>.mp3``).
	    """
	    files = []
	    for i, batch in enumerate(batches):
	        f = tmp_dir+'tmp_'+str(i)+'.mp3'
	        ssml_to_audio(batch, f)
	        files.append(f)
	        print('Rendered batch', str(i+1),'/',str(len(batches)))
	    return files

	async def render_batches_to_mp3(batches, filename='out'):
	    """Render SSML batches and join them into ``mp3/<filename>.mp3``.

	    Each batch is rendered to a temporary MP3 through
	    ``render_batches_tmp``. The parts are then concatenated with sox and
	    the temporary files removed.

	    Args:
	        batches: list of SSML strings.
	        filename: base name (without extension) of the combined output.

	    Returns:
	        None.
	    """
	    tmp_dir = 'mp3/'
	    if not batches:
	        # Nothing to synthesize; avoid calling sox with an empty list.
	        print('No batches to render.')
	        return
	    # The per-batch files are opened for writing inside tmp_dir, so it
	    # has to exist before rendering starts.
	    os.makedirs(tmp_dir, exist_ok=True)
	    print('About to render',str(len(batches)),'batches...')
	    files = await render_batches_tmp(batches, tmp_dir)
	    target = tmp_dir+filename+'.mp3'
	    if len(files) == 1:
	        # pysox's Combiner requires at least two input files, so a lone
	        # rendered batch is simply moved into place.
	        os.replace(files[0], target)
	        print('Finished!')
	        return
	    print('Finished rendering batches:',files,' concatenating...')
	    cbn = sox.Combiner()
	    cbn.build(files, target, 'concatenate')
	    print('Finished concatenating, deleting tmp files...')
	    for f in files:
	        os.remove(f)
	    print('Finished!')

	def ssml_to_audio(ssml_text, outfile):
	    """Synthesize SSML with the Text-to-Speech API and save it as MP3.

	    Sends ``ssml_text`` to ``texttospeech.TextToSpeechClient`` using the
	    ``en-US-Wavenet-C`` voice and MP3 encoding, then writes the returned
	    audio bytes to ``outfile``.

	    Args:
	        ssml_text: string of SSML text.
	        outfile: name of the file under which to save the audio output.

	    Returns:
	        None.
	    """
	    # Instantiates a client
	    client = texttospeech.TextToSpeechClient()

	    # Sets the text input to be synthesized
	    synthesis_input = texttospeech.SynthesisInput(ssml=ssml_text)

	    # Selects the voice by name ("en-US-Wavenet-C")
	    voice = texttospeech.VoiceSelectionParams(
	        language_code="en-US",
	        name="en-US-Wavenet-C",
	        ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
	    )

	    # Selects the type of audio file to return
	    audio_config = texttospeech.AudioConfig(
	        audio_encoding=texttospeech.AudioEncoding.MP3
	    )

	    # Performs the text-to-speech request with the selected voice
	    # parameters and audio file type
	    response = client.synthesize_speech(
	        input=synthesis_input, voice=voice, audio_config=audio_config
	    )

	    # Writes the synthetic audio to the output file.
	    with open(outfile, "wb") as out:
	        out.write(response.audio_content)
	        print("Audio content written to file " + outfile)

	output_filename = 'chapter_01'
	base_path = '/tex/path/'
	tmp_tex = base_path+'tts.tex'
	tmp_html = pypandoc.convert_file(tmp_tex, 'html')
	tmp_ssml = html_to_ssml(tmp_html)
	tmp_ssml_batches = batch_ssml(tmp_ssml)

	await render_batches_to_mp3(tmp_ssml_batches, output_filename)