Created
June 5, 2021 19:51
-
-
Save jarmitage/fa2aa8d2cbfb70bb292c1f8d89ac96e8 to your computer and use it in GitHub Desktop.
Latex to audiobook via Google Cloud TTS (tex to HTML to SSML to batches to gcloud)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Very rough prototype code!
# Only supports simple TeX and a small subset of SSML.
# Requires sox for audio concatenation.
# %autoawait - when run in a .ipynb notebook, this is needed for the top-level `await` to work.
import pypandoc | |
import html | |
import re | |
import os | |
import sys | |
from IPython.core.display import HTML | |
from google.cloud import texttospeech | |
import sox | |
from html.parser import HTMLParser | |
from html.entities import name2codepoint | |
def html_to_ssml(_html):
    """Rewrite an HTML string into a rough SSML fragment.

    The conversion is purely textual: a fixed list of literal tag
    substitutions is applied in order, then a few regular expressions strip
    elements that should not be read aloud.

    Args:
        _html: HTML markup as a string (the script feeds it pandoc output).

    Returns:
        The transformed string. It is a fragment only; no ``<speak>`` wrapper
        is added here.
    """
    # Line breaks normally sit between two words, so they are collapsed to a
    # space. Deleting them outright would glue neighbouring words together.
    tmp = _html.replace('\n', ' ')

    # Order matters: the empty-paragraph removal has to run before '</p>'
    # gets a trailing break appended.
    literal_swaps = (
        ('<p><br /></p>', ''),
        ('</a>', ''),
        ('<em>', '<emphasis level="strong">'),
        ('</em>', '</emphasis><break time="150ms"/>'),
        # Heading openers are matched without '>' so attributes such as id=
        # stay attached to the rewritten tag.
        ('<h1', '<emphasis level="strong"'),
        ('</h1>', '</emphasis><break time="2000ms"/>'),
        ('<h2', '<emphasis level="strong"'),
        ('</h2>', '</emphasis><break time="1500ms"/>'),
        ('<h3', '<emphasis level="strong"'),
        ('</h3>', '</emphasis><break time="1200ms"/>'),
        ('</p>', '</p><break time="750ms"/>'),
        ('<ul>', ''),
        ('</ul>', ''),
        ('<li>', ''),
        ('</li>', '<break time="200ms"/>'),
        ('<strong>', '<emphasis level="strong">'),
        ('</strong>', '</emphasis><break time="150ms"/>'),
    )
    for old, new in literal_swaps:
        tmp = tmp.replace(old, new)

    tmp = re.sub(r'<span.*?span>', '', tmp)        # spans (citations)
    tmp = re.sub(r'<section.*?section>', '', tmp)  # footnotes
    tmp = re.sub(r'<sup.*?sup>', '', tmp)          # superscripts
    # Only real anchor tags: the word boundary keeps tags that merely start
    # with "a" (e.g. <abbr>) from being stripped as if they were links.
    tmp = re.sub(r'<a\b[^>]*>', '', tmp)           # links
    return tmp
class SSMLParser(HTMLParser):
    """Split a stream of SSML-like markup into self-contained batches.

    Markup is fed through ``HTMLParser``; tags and text are re-emitted into
    ``self.content``. When the next piece of text would push the current
    batch to ``self.interval`` characters or more, the batch is closed
    (open tags are closed and the ``<speak>`` wrapper is terminated) and a
    new batch is started with the same tags reopened.

    Attributes:
        start / end: wrapper markup placed around every batch.
        interval: batch size threshold in characters (presumably chosen to
            stay below the TTS request size limit -- confirm).
        content: markup of the batch currently being assembled.
        batches: list of finished batch strings.
        open_tags: names of the tags currently open, outermost first.
        dbg: when True, print trace messages.

    NOTE: a single text run longer than ``interval`` is not split; it ends
    up whole in one (oversized) batch.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.start = '<speak><prosody rate="125%" pitch="+2st">'
        self.end = '</prosody></speak>'
        self.count = len(self.start)
        self.interval = 4500
        self.content = self.start
        self.batches = []
        self.open_tags = []
        # Rendered opening markup for each entry of open_tags, kept in step
        # with it so reopened tags retain their attributes.
        self._open_markup = []
        self.dbg = False  # print msgs

    def handle_starttag(self, tag, attrs):
        """Emit an opening tag, keeping only the 'level' and 'time' attributes."""
        if self.dbg:
            print("Open", tag)
        kept = ''.join(
            ' ' + name + '="' + html.escape(value or '') + '"'
            for name, value in attrs
            if name in ('level', 'time')
        )
        markup = '<' + tag + kept + '>'
        self.open_tags.append(tag)
        self._open_markup.append(markup)
        self.content += markup

    def handle_endtag(self, tag):
        """Emit a closing tag if a matching tag is open; otherwise drop it."""
        if tag in self.open_tags:
            # Close the innermost open occurrence of this tag.
            idx = len(self.open_tags) - 1 - self.open_tags[::-1].index(tag)
            del self.open_tags[idx]
            del self._open_markup[idx]
            self.content += '</' + tag + '>'
        elif self.dbg:
            print('Error: end tag not found in self.open_tags:', tag)
        if self.dbg:
            print("Close", tag)

    def handle_data(self, data):
        """Append text to the current batch, starting a new batch if needed."""
        # The parser hands over text with character references already
        # decoded, so '&' and '<' must be re-escaped to keep the output
        # well-formed XML.
        text = html.escape(data, quote=False)
        if len(self.content) + len(text) + len(self.end) < self.interval:
            self.count += len(text)
            self.content += text
            if self.dbg:
                print('Data: (' + str(len(self.content)) + ')\n', text, '\n')
        else:
            self.handle_batch(text)

    def handle_batch(self, data):
        """Finish the current batch and start a new one containing ``data``.

        ``data`` is appended verbatim (it is expected to be already escaped)
        after the reopened tags of the new batch.
        """
        if self.dbg:
            print('handle_batch() tags still open:', self.open_tags)
        # Close innermost-first so the finished batch is properly nested.
        closing = ''.join('</' + tag + '>' for tag in reversed(self.open_tags))
        # Reopen outermost-first, with the attributes the tags were opened with.
        leftover = ''.join(self._open_markup) + data
        self.content += closing + self.end
        self.batches.append(self.content)
        self.count = 0
        if self.dbg:
            print('handle_batch() adding batch\n\n', self.content, '\n\nleftover:', leftover)
        self.content = self.start + leftover

    def handle_comment(self, data):
        print("Comment :", data)

    # The two reference handlers below are only invoked by HTMLParser when
    # convert_charrefs is False; with the default constructor used here the
    # decoded characters arrive through handle_data instead.
    def handle_entityref(self, name):
        c = chr(name2codepoint[name])
        print("Named ent:", c)

    def handle_charref(self, name):
        if name.startswith('x'):
            c = chr(int(name[1:], 16))
        else:
            c = chr(int(name))
        print("Num ent :", c)

    def handle_decl(self, data):
        print("Decl :", data)
def batch_ssml(ssml):
    """Split an SSML string into a list of wrapped batches.

    Args:
        ssml: SSML-like markup as a string.

    Returns:
        The parser's list of batch strings; the last entry holds whatever
        content remained after the final split.
    """
    parser = SSMLParser()
    parser.feed(ssml)
    # Flush any text the parser is still buffering (for example a trailing
    # "&..." it was holding back) before the final batch is cut.
    parser.close()
    parser.handle_batch('')  # turn leftover content into final batch
    return parser.batches
async def render_batches_tmp(batches, tmp_dir):
    """Render every batch to its own temporary MP3 file.

    Files are named ``tmp_<index>.mp3`` inside ``tmp_dir``. The directory is
    created if it does not exist, and ``tmp_dir`` may be given with or
    without a trailing path separator.

    NOTE: ``ssml_to_audio`` is called synchronously, so batches are rendered
    one after another even though this is a coroutine.

    Args:
        batches: iterable-with-length of SSML strings.
        tmp_dir: directory to write the temporary files into.

    Returns:
        List of the file paths written, in batch order.
    """
    if tmp_dir:
        os.makedirs(tmp_dir, exist_ok=True)
    total = len(batches)
    files = []
    for i, batch in enumerate(batches):
        f = os.path.join(tmp_dir, 'tmp_' + str(i) + '.mp3')
        ssml_to_audio(batch, f)
        files.append(f)
        print('Rendered batch', str(i + 1), '/', str(total))
    return files
async def render_batches_to_mp3(batches, filename='out'):
    """Render all batches and join them into ``mp3/<filename>.mp3``.

    Each batch is rendered to a temporary file via ``render_batches_tmp``;
    the temporary files are then concatenated with sox and removed.

    Args:
        batches: list of SSML strings.
        filename: base name (without extension) of the combined output file.

    Returns:
        None.
    """
    tmp_dir = 'mp3/'
    os.makedirs(tmp_dir, exist_ok=True)
    out_path = tmp_dir + filename + '.mp3'
    print('About to render', str(len(batches)), 'batches...')
    files = await render_batches_tmp(batches, tmp_dir)
    if not files:
        print('No batches were rendered, nothing to concatenate.')
        return
    print('Finished rendering batches:', files, ' concatenating...')
    if len(files) == 1:
        # sox's Combiner needs at least two inputs; a lone batch already is
        # the complete audio, so it is simply moved into place.
        os.replace(files[0], out_path)
    else:
        cbn = sox.Combiner()
        cbn.build(files, out_path, 'concatenate')
        print('Finished concatenating, deleting tmp files...')
        for f in files:
            os.remove(f)
    print('Finished!')
def ssml_to_audio(ssml_text, outfile):
    """Synthesize SSML through the Text-to-Speech API and save it as MP3.

    A new client is created on every call. The request uses the
    "en-US-Wavenet-C" voice (language "en-US", female SSML gender) and asks
    for MP3-encoded audio; the returned bytes are written to ``outfile``.

    Args:
        ssml_text: string of SSML text.
        outfile: string name of the file under which to save the audio.

    Returns:
        Nothing.
    """
    tts_client = texttospeech.TextToSpeechClient()

    # Assemble the three parts of the request in one place: what to say,
    # which voice says it, and which audio format comes back.
    request_parts = dict(
        input=texttospeech.SynthesisInput(ssml=ssml_text),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-US",
            name="en-US-Wavenet-C",
            ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3
        ),
    )
    result = tts_client.synthesize_speech(**request_parts)

    # Binary mode: the response payload is raw audio bytes.
    with open(outfile, "wb") as audio_file:
        audio_file.write(result.audio_content)
        print("Audio content written to file " + outfile)
output_filename = 'chapter_01' | |
base_path = '/tex/path/' | |
tmp_tex = base_path+'tts.tex' | |
tmp_html = pypandoc.convert_file(tmp_tex, 'html') | |
tmp_ssml = html_to_ssml(tmp_html) | |
tmp_ssml_batches = batch_ssml(tmp_ssml) | |
await render_batches_to_mp3(tmp_ssml_batches, output_filename) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment