Skip to content

Instantly share code, notes, and snippets.

@anotherdirtbag
Created September 19, 2019 17:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anotherdirtbag/a03a8826ddf04bed0f1433bbc84032b0 to your computer and use it in GitHub Desktop.
Save anotherdirtbag/a03a8826ddf04bed0f1433bbc84032b0 to your computer and use it in GitHub Desktop.
Working example of Mozilla TTS tacotron2+wavernn
#following instructions from https://github.com/mozilla/TTS/blob/master/notebooks/Benchmark.ipynb
#%load_ext autoreload
#%autoreload 2
import os
import sys
import io
import time
import numpy as np
#pip3 install --user numpy
from collections import OrderedDict
#from matplotlib import pylab as plt
import torch
#To install with CUDA 9.2. This worked for me
#https://developer.nvidia.com/cuda-92-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exenetwork
#pip3 install --user --no-cache-dir torch==1.2.0+cu92 torchvision==0.4.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html
#For trying CUDA 10.0. This didn't work for me
#https://developer.nvidia.com/cuda-10.0-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exelocal
#pip3 install --no-cache-dir --user torch==1.2.0 torchvision==0.4.0 -f https://download.pytorch.org/whl/cu100/torch_stable.html
TTS_PATH = os.path.join(r'C:\Users\sokka\Documents\tts\TTS')
WAVERNN_PATH = os.path.join(TTS_PATH, 'WaveRNN')
#%pylab inline
#rcParams["figure.figsize"] = (16,5)
# add libraries into environment
#import importlib
#importlib.import_module('TTS')
sys.path.append(TTS_PATH) # set this if TTS is not installed globally
sys.path.append(WAVERNN_PATH) # set this if TTS is not installed globally
import librosa
import librosa.display
from models.tacotron import Tacotron
from layers import *
from utils.data import *
from utils.audio import AudioProcessor
from utils.generic_utils import load_config, setup_model
from utils.text import text_to_sequence, cleaners
from utils.synthesis import synthesis
#from utils.visual import visualize
#import IPython
#from IPython.display import Audio
#pip3 install --user ipython
import os
import re
#os.environ['CUDA_VISIBLE_DEVICES']='1'
#os.environ['OMP_NUM_THREADS']='1'
# Number of tts() calls so far; tts() appends it to each output file name.
runcounter = 0

# Report up front whether PyTorch can see a CUDA device.
iscuda = torch.cuda.is_available()
print('torch.cuda.is_available()=', iscuda, sep='')
def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):
    """Synthesize ``text`` and save the resulting waveform as a WAV file.

    ``synthesis`` produces the spectrograms and an initial waveform. Unless
    ``use_gl`` is true, the waveform is replaced by the output of the
    module-level ``wavernn`` vocoder. The file is written into the
    module-level ``OUT_FOLDER`` and the module-level ``runcounter`` is
    incremented on every call.

    Args:
        model: TTS model, passed through to ``synthesis``.
        text: Text to synthesize; also used to derive the output file name.
        CONFIG: TTS configuration, passed to ``synthesis``; ``CONFIG.model``
            is read to detect the "Tacotron" model.
        use_cuda: Passed to ``synthesis``; also decides whether the vocoder
            input tensor is moved to the GPU.
        ap: Audio processor providing ``out_linear_to_mel`` and ``save_wav``.
        use_gl: If true, keep the waveform returned by ``synthesis`` and
            skip the WaveRNN vocoder.
        speaker_id: Accepted for call compatibility; currently unused.
        figures: Accepted for call compatibility; currently unused.

    Returns:
        Tuple ``(alignment, mel_postnet_spec, stop_tokens, waveform)``.
    """
    global runcounter
    t_1 = time.time()

    # Name the file after the first "<token> <digits>" found in the text,
    # falling back to a generic stem when the text has no such pattern.
    submatch = re.sub(r'\s+', ' ', text)
    file_namematch = re.search(r'([^\s]+\s?\d+)', submatch)
    stem = file_namematch.group(0) if file_namematch else 'tempout'
    # The stem comes from free text: replace characters that are not valid in
    # file names (path separators, Windows-reserved punctuation, control
    # characters) so saving cannot fail or land in another directory.
    stem = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', stem)
    file_name = stem + '_' + str(runcounter) + '.wav'
    runcounter += 1

    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(model, text, CONFIG, use_cuda, ap, truncated=False)

    if not use_gl:
        if CONFIG.model == "Tacotron":
            # NOTE(review): presumably the "Tacotron" postnet output is a
            # linear spectrogram that must be mapped to mel first - confirm.
            mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
        vocoder_input = torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0)
        # Only move the vocoder input to the GPU when CUDA was requested;
        # the original called .cuda() unconditionally.
        if use_cuda:
            vocoder_input = vocoder_input.cuda()
        waveform = wavernn.generate(vocoder_input, batched=batched_wavernn, target=11000, overlap=550)

    print(" > Run-time: {}".format(time.time() - t_1))

    os.makedirs(OUT_FOLDER, exist_ok=True)
    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return alignment, mel_postnet_spec, stop_tokens, waveform
# Set constants
ROOT_PATH = TTS_PATH
# tts() writes the synthesized WAV files into this folder.
OUT_FOLDER = os.path.join(ROOT_PATH, 'AudioSamples/benchmark_samples/')

# TTS model checkpoint and its config are stored side by side.
_TTS_MODEL_DIR = r'C:\Users\sokka\Documents\tts\wavernn_mold\ljspeech_8a47b46'
MODEL_PATH = os.path.join(_TTS_MODEL_DIR, 'checkpoint_261000.pth.tar')
CONFIG_PATH = os.path.join(_TTS_MODEL_DIR, 'config.json')
CONFIG = load_config(CONFIG_PATH)

# Vocoder (WaveRNN) checkpoint and its config are stored side by side.
_VOCODER_MODEL_DIR = r'C:\Users\sokka\Documents\tts\wavernn_mold\wavernn_mold_8a1c152'
VOCODER_MODEL_PATH = os.path.join(_VOCODER_MODEL_DIR, 'checkpoint_433000.pth.tar')
VOCODER_CONFIG_PATH = os.path.join(_VOCODER_MODEL_DIR, 'config.json')
VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)

# Use the GPU only when PyTorch actually reports one, instead of hard-coding
# True. Set this to False to force CPU.
use_cuda = torch.cuda.is_available()

# Set some config fields manually for testing
# CONFIG.windowing = False
# CONFIG.prenet_dropout = False
# CONFIG.separate_stopnet = True
# CONFIG.stopnet = True

# Set the vocoder
use_gl = False  # use GL if True
batched_wavernn = True  # use batched wavernn inference if True
# LOAD TTS MODEL
from utils.text.symbols import symbols, phonemes

# Size the model's input vocabulary from whichever symbol set the config selects.
if CONFIG.use_phonemes:
    num_chars = len(phonemes)
else:
    num_chars = len(symbols)
model = setup_model(num_chars, CONFIG)

# The audio processor is built from the "audio" section of the config.
ap = AudioProcessor(**CONFIG.audio)

# Read the checkpoint. Without CUDA, the map_location callback returns each
# storage unchanged so tensors saved from a GPU are kept on the CPU.
if not use_cuda:
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)
else:
    cp = torch.load(MODEL_PATH)

# Restore the weights, then move the model to the GPU if requested.
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()

# Show the step count stored in the checkpoint.
print(cp['step'])
# LOAD WAVERNN
# The vocoder is only needed when GL is not used (same test as in tts()).
if not use_gl:
    from WaveRNN.models.wavernn import Model
    bits = 10

    # Build the model on the CPU first; it is moved to the GPU below only
    # when use_cuda is set (the original called .cuda() unconditionally).
    wavernn = Model(
        rnn_dims=512,
        fc_dims=512,
        mode="mold",
        pad=2,
        upsample_factors=VOCODER_CONFIG.upsample_factors,  # set this depending on dataset
        feat_dims=VOCODER_CONFIG.audio["num_mels"],
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=ap.hop_length,
        sample_rate=ap.sample_rate,
    )

    # Without CUDA, keep tensors that were saved from a GPU on the CPU,
    # mirroring how the TTS checkpoint is loaded.
    if use_cuda:
        check = torch.load(VOCODER_MODEL_PATH)
    else:
        check = torch.load(VOCODER_MODEL_PATH, map_location=lambda storage, loc: storage)

    wavernn.load_state_dict(check['model'])
    if use_cuda:
        wavernn.cuda()
    wavernn.eval()

    # Show the step count stored in the vocoder checkpoint.
    print(check['step'])
# Typographic single quotes are folded to the ASCII apostrophe so that
# contractions ("wasn’t") are not split into separate words.
_APOSTROPHE_TABLE = str.maketrans({'\u2018': "'", '\u2019': "'"})
# Matches every character that is NOT a word character, whitespace,
# an apostrophe, or one of . , ; ! ?
illegalchars_exclusive = re.compile(r"[^\w.,;!?'\s]")
# Matches runs of two or more whitespace characters.
repitiion = re.compile(r'\s{2,}')


def custom_text_fix(sentence):
    """Return ``sentence`` with unsupported characters replaced by spaces.

    Typographic single quotes become ASCII apostrophes, which are kept.
    Every other character that is not a word character, whitespace, or one
    of ``. , ; ! ? '`` is replaced by a space. Runs of two or more
    whitespace characters are then collapsed into a single space. Leading
    and trailing whitespace is not stripped.

    Args:
        sentence: Raw input text.

    Returns:
        The cleaned text.
    """
    cleaned = sentence.translate(_APOSTROPHE_TABLE)
    cleaned = illegalchars_exclusive.sub(' ', cleaned)
    return repitiion.sub(' ', cleaned)
# Put the model in eval mode and set the decoder step limit before synthesis.
model.eval()
model.decoder.max_decoder_steps = 2000
speaker_id = 0

# Texts to synthesize; each one produces a WAV file through tts().
sentences = [
    "Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go.",
]

for raw_sentence in sentences:
    # Strip unsupported characters first, then apply the English cleaners.
    cleaned_sentence = cleaners.english_cleaners(custom_text_fix(raw_sentence))
    alizgn, spec, stop_tokens, wav = tts(
        model,
        cleaned_sentence,
        CONFIG,
        use_cuda,
        ap,
        speaker_id=speaker_id,
        use_gl=use_gl,
        figures=True,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment