# Dash Text to Speech
# -*- coding: utf-8 -*-
"""
Speech synthesis with Dash. This will only work with an NVIDIA GPU.

Requirements (put them in requirements.txt):
    apex
    dash
    numpy
    scipy
    torch
"""
import base64 | |
import time | |
import io | |
from apex import amp | |
import dash | |
import dash_core_components as dcc | |
import dash_html_components as html | |
from dash.dependencies import Input, Output, State | |
import numpy as np | |
from scipy.io.wavfile import write | |
import torch | |
# Load the pretrained models from the NVIDIA torch.hub repository. Each model
# is moved to the GPU, wrapped by apex amp at opt_level "O1", and then put
# into eval mode.
tacotron2 = torch.hub.load(
    'nvidia/DeepLearningExamples:torchhub', 'nvidia_tacotron2'
).to('cuda')
tacotron2 = amp.initialize(tacotron2, opt_level="O1")
tacotron2.eval()

waveglow = torch.hub.load(
    'nvidia/DeepLearningExamples:torchhub', 'nvidia_waveglow'
)
# Weight normalization is removed before the model is moved to the GPU.
waveglow = waveglow.remove_weightnorm(waveglow).to('cuda')
waveglow = amp.initialize(waveglow, opt_level="O1")
waveglow.eval()
# Dash app starts here
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

# Text shown in the textarea when the page first loads. It must be defined
# before the layout is built, since the layout reads it at import time.
DEFAULT_TEXT = (
    "Hello! Type some text in this box and press Generate "
    "to hear it spoken aloud."
)

# Page layout: a title, the text input, and (inside a loading indicator)
# the button that triggers synthesis plus the audio player for the result.
app.layout = html.Div(children=[
    html.H1(children='Dash Text-to-Speech'),
    dcc.Textarea(
        id='textarea-input',
        value=DEFAULT_TEXT,
        style={'width': '100%', 'height': '45vh'}
    ),
    dcc.Loading([
        html.Button("Generate", id='button'),
        html.Audio(id='audio-out', controls=True)
    ])
])
@app.callback(Output("audio-out", "src"),
              [Input("button", "n_clicks")],
              [State("textarea-input", "value")])
def generate_audio(n_clicks, text):
    """Synthesize speech for ``text`` and return it as a WAV data URI.

    Dash callback triggered by the "Generate" button; the textarea content
    is passed in as state.

    Args:
        n_clicks: Click count of the "Generate" button. Not used in the
            body; it only serves as the callback trigger.
        text: Current value of the textarea. ``None``, an empty string, or
            whitespace-only input is replaced by a fallback sentence so the
            models always receive something to speak.

    Returns:
        A ``data:audio/x-wav;base64,...`` string assigned to the ``src``
        property of the ``audio-out`` element.
    """
    # Guard against a missing or blank value, not just the empty string.
    if text is None or not text.strip():
        text = "Sorry, there's nothing in the text input. Please write something."

    # perf_counter is monotonic, so the measured intervals cannot go negative.
    t0 = time.perf_counter()

    # preprocessing: encode the text and add a leading batch axis
    sequence = np.array(tacotron2.text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device='cuda', dtype=torch.int64)

    # run the models: text -> mel spectrogram -> waveform, without autograd
    with torch.no_grad():
        _, mel, _, _ = tacotron2.infer(sequence)
        audio = waveglow.infer(mel)

    # Take the first (only) item of the batch and bring it back to the CPU.
    audio_numpy = audio[0].data.cpu().numpy()
    rate = 22050  # sample rate passed to the WAV writer
    t1 = time.perf_counter()

    # Write the WAV into memory and embed it in a base64 data URI so that no
    # file has to be served separately.
    buffer = io.BytesIO()
    write(buffer, rate, audio_numpy)
    b64 = base64.b64encode(buffer.getvalue())
    sound = "data:audio/x-wav;base64," + b64.decode("ascii")
    t2 = time.perf_counter()

    print(f"Completed in {t2-t0:.3f}s. Generation took {t1-t0:.3f}s, file creation took {t2-t1:.3f}s")
    return sound
# Start the Dash server only when this file is executed as a script.
if __name__ == "__main__":
    app.run_server(
        debug=False,
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment