Skip to content

Instantly share code, notes, and snippets.

@jarmitage
Created June 5, 2021 19:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jarmitage/fa2aa8d2cbfb70bb292c1f8d89ac96e8 to your computer and use it in GitHub Desktop.
Save jarmitage/fa2aa8d2cbfb70bb292c1f8d89ac96e8 to your computer and use it in GitHub Desktop.
Latex to audiobook via Google Cloud TTS (tex to HTML to SSML to batches to gcloud)
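# Very rough proof-of-concept code!
# Only supports simple TeX input and a small subset of SSML.
# Requires SoX (used through the `sox` Python package) for audio concatenation.
# If running in a .ipynb notebook, enable `%autoawait` so the top-level `await` at the end works.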
import pypandoc
import html
import re
import os
import sys
from IPython.core.display import HTML
from google.cloud import texttospeech
import sox
from html.parser import HTMLParser
from html.entities import name2codepoint
def html_to_ssml(_html):
    """Rewrite pandoc-style HTML into a rough SSML body.

    Emphasis-like tags (``em``, ``strong``, ``h1``-``h3``) become
    ``<emphasis level="strong">`` followed by a pause, list markup is dropped,
    and spans (citations), sections (footnotes), superscripts and links are
    removed.  Attributes left on rewritten heading tags (e.g. ``id``) are not
    stripped here.

    Args:
        _html: HTML source text.

    Returns:
        The rewritten markup as a single-line string (without the enclosing
        ``<speak>`` element).
    """
    # Join wrapped source lines with a space rather than nothing: deleting the
    # newline outright would glue the last word of one line to the first word
    # of the next ("hello\nworld" -> "helloworld").
    tmp = re.sub(r'\r?\n', ' ', _html)

    # Drop paragraphs that contain only a line break, tolerating the
    # whitespace that the line-joining above may have left between the tags.
    tmp = re.sub(r'<p>\s*<br />\s*</p>', '', tmp)

    # Plain substring rewrites, applied in this order.
    literal_rewrites = (
        ('</a>', ''),
        ('<em>', '<emphasis level="strong">'),
        ('</em>', '</emphasis><break time="150ms"/>'),
        ('<h1', '<emphasis level="strong"'),
        ('</h1>', '</emphasis><break time="2000ms"/>'),
        ('<h2', '<emphasis level="strong"'),
        ('</h2>', '</emphasis><break time="1500ms"/>'),
        ('<h3', '<emphasis level="strong"'),
        ('</h3>', '</emphasis><break time="1200ms"/>'),
        ('</p>', '</p><break time="750ms"/>'),
        ('<ul>', ''),
        ('</ul>', ''),
        ('<li>', ''),
        ('</li>', '<break time="200ms"/>'),
        ('<strong>', '<emphasis level="strong">'),
        ('</strong>', '</emphasis><break time="150ms"/>'),
    )
    for old, new in literal_rewrites:
        tmp = tmp.replace(old, new)

    # Elements removed together with their content, then bare link openers.
    removal_patterns = (
        r'<span.*?span>',        # spans (citations)
        r'<section.*?section>',  # footnotes
        r'<sup.*?sup>',          # superscripts
        r'<a.*?>',               # links (opening tag only; text is kept)
    )
    for pattern in removal_patterns:
        tmp = re.sub(pattern, '', tmp)
    return tmp
class SSMLParser(HTMLParser):
    """Split SSML-like markup into self-contained ``<speak>`` documents.

    Markup is fed in through ``feed()``.  Tags and text are appended to
    ``content``; when the next text chunk would push ``content`` past
    ``interval`` characters, the current document is closed, stored in
    ``batches``, and a new one is started with the still-open tags reopened.

    Attributes:
        start / end: wrapper placed around every batch.
        interval: maximum length (in characters) allowed for a batch before
            the next text chunk is moved to a new one.
        content: the batch currently being built.
        batches: completed batches, in order.
        open_tags: names of tags opened but not yet closed, outermost first.
        count: running character counter kept for debugging only.
        dbg: when true, progress messages are printed.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.start = '<speak><prosody rate="125%" pitch="+2st">'
        self.end = '</prosody></speak>'
        self.count = len(self.start)
        self.interval = 4500
        self.content = self.start
        self.batches = []
        self.open_tags = []
        # Rendered opening tag for each entry of open_tags (same order), so a
        # tag reopened in the next batch keeps its attributes.
        self._open_markup = []
        self.dbg = False  # print msgs

    def handle_starttag(self, tag, attrs):
        """Append the opening tag, keeping only ``level`` and ``time`` attributes."""
        if self.dbg:
            print("Open", tag)
        # Attribute values arrive unescaped from HTMLParser, so escape them
        # again; valueless attributes (value None) are skipped.
        kept = ''.join(
            ' ' + name + '="' + html.escape(value) + '"'
            for name, value in attrs
            if name in ('level', 'time') and value is not None
        )
        markup = '<' + tag + kept + '>'
        self.open_tags.append(tag)
        self._open_markup.append(markup)
        self.content += markup

    def handle_endtag(self, tag):
        """Append the closing tag if a matching tag is currently open."""
        if tag in self.open_tags:
            # Close the innermost (most recently opened) tag of that name.
            idx = len(self.open_tags) - 1 - self.open_tags[::-1].index(tag)
            del self.open_tags[idx]
            del self._open_markup[idx]
            self.content += '</' + tag + '>'
        elif self.dbg:
            print('Error: end tag not found in self.open_tags:', tag)
        if self.dbg:
            print("Close", tag)

    def handle_data(self, data):
        """Append text, starting a new batch first if it would not fit.

        NOTE: a single text chunk longer than ``interval`` is not split; it
        ends up whole in the next batch.
        """
        # HTMLParser hands over text with character references already
        # decoded (default convert_charrefs), so '&' and '<' must be escaped
        # again to keep the generated SSML well-formed.
        escaped = html.escape(data, quote=False)
        if len(self.content) + len(escaped) + len(self.end) < self.interval:
            self.count += len(escaped)
            self.content += escaped
            if self.dbg:
                print('Data: ('+str(len(self.content))+')\n', data, '\n')
        else:
            self.handle_batch(escaped)

    def handle_batch(self, data):
        """Finish the current batch and start a new one beginning with ``data``.

        ``data`` is appended verbatim (it must already be valid markup/text).
        Open tags are closed innermost-first in the finished batch and
        reopened, with their attributes, at the start of the new one.
        """
        if self.dbg:
            print('handle_batch() tags still open:', self.open_tags)
        closing = ''.join('</' + tag + '>' for tag in reversed(self.open_tags))
        leftover = ''.join(self._open_markup) + data
        self.content += closing + self.end
        self.batches.append(self.content)
        self.count = 0
        if self.dbg:
            print('handle_batch() adding batch\n\n', self.content, '\n\nleftover:', leftover)
        self.content = self.start + leftover

    def handle_comment(self, data):
        """Print comments; they are not copied into the output."""
        print("Comment :", data)

    # The two reference handlers below are only invoked by HTMLParser when
    # convert_charrefs is disabled; they print the character and do not add it
    # to ``content``.
    def handle_entityref(self, name):
        c = chr(name2codepoint[name])
        print("Named ent:", c)

    def handle_charref(self, name):
        if name.startswith('x'):
            c = chr(int(name[1:], 16))
        else:
            c = chr(int(name))
        print("Num ent :", c)

    def handle_decl(self, data):
        """Print declarations; they are not copied into the output."""
        print("Decl :", data)
def batch_ssml(ssml):
    """Split SSML-like markup into a list of complete ``<speak>`` documents.

    Args:
        ssml: markup string, e.g. the output of ``html_to_ssml``.

    Returns:
        The list of batches collected by an ``SSMLParser``; the content left
        over after parsing is always emitted as the last batch.
    """
    parser = SSMLParser()
    parser.feed(ssml)
    # feed() may hold back trailing text (e.g. input ending in an incomplete
    # character reference); close() forces it through the handlers.
    parser.close()
    parser.handle_batch('')  # turn leftover content into final batch
    return parser.batches
async def render_batches_tmp(batches, tmp_dir):
    """Render each SSML batch to ``<tmp_dir>/tmp_<index>.mp3``.

    Batches are rendered one after another with the synchronous
    ``ssml_to_audio``; nothing is awaited inside this coroutine.

    Args:
        batches: sequence of SSML documents.
        tmp_dir: directory for the temporary files, with or without a
            trailing path separator.

    Returns:
        List of the written file paths, in batch order.
    """
    total = len(batches)
    files = []
    for i, batch in enumerate(batches):
        # os.path.join yields the same path as plain concatenation when
        # tmp_dir ends with a separator, and a correct one when it does not.
        f = os.path.join(tmp_dir, 'tmp_' + str(i) + '.mp3')
        ssml_to_audio(batch, f)
        files.append(f)
        print('Rendered batch', str(i+1), '/', str(total))
    return files
async def render_batches_to_mp3(batches, filename='out'):
    """Render all batches and concatenate them into ``mp3/<filename>.mp3``.

    Args:
        batches: sequence of SSML documents.
        filename: base name (without extension) of the combined output file.

    The per-batch temporary files are deleted only after the combined file
    has been produced, so already-rendered audio survives a failure.
    """
    tmp_dir = 'mp3/'
    # The output directory is not guaranteed to exist yet.
    os.makedirs(tmp_dir, exist_ok=True)
    print('About to render',str(len(batches)),'batches...')
    files = await render_batches_tmp(batches, tmp_dir)
    out_path = tmp_dir+filename+'.mp3'
    print('Finished rendering batches:',files,' concatenating...')
    if len(files) == 1:
        # NOTE(review): pysox's Combiner appears to require at least two
        # input files (confirm against the pysox docs); a single rendered
        # batch simply becomes the output file.
        os.replace(files[0], out_path)
        leftovers = []
    else:
        if files:
            cbn = sox.Combiner()
            cbn.build(files, out_path, 'concatenate')
        leftovers = files
    print('Finished concatenating, deleting tmp files...')
    for f in leftovers:
        os.remove(f)
    print('Finished!')
def ssml_to_audio(ssml_text, outfile):
    """Synthesize SSML to speech and save the audio as an MP3 file.

    Sends ``ssml_text`` to the Text-to-Speech API using the
    ``en-US-Wavenet-C`` voice and writes the returned audio bytes to
    ``outfile``.

    Args:
        ssml_text: string of SSML text.
        outfile: name of the file under which to save the audio output.

    Returns:
        Nothing.
    """
    tts_client = texttospeech.TextToSpeechClient()

    # The three parts of the synthesis request: what to say, which voice to
    # use, and which audio format to return.
    request_parts = {
        'input': texttospeech.SynthesisInput(ssml=ssml_text),
        'voice': texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Wavenet-C",
            ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
        ),
        'audio_config': texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3
        ),
    }
    result = tts_client.synthesize_speech(**request_parts)

    # Write the synthesized audio to the output file.
    with open(outfile, "wb") as mp3_file:
        mp3_file.write(result.audio_content)
        print("Audio content written to file " + outfile)
output_filename = 'chapter_01'
base_path = '/tex/path/'
tmp_tex = base_path+'tts.tex'
tmp_html = pypandoc.convert_file(tmp_tex, 'html')
tmp_ssml = html_to_ssml(tmp_html)
tmp_ssml_batches = batch_ssml(tmp_ssml)
await render_batches_to_mp3(tmp_ssml_batches, output_filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment