# anotherdirtbag/synthesize2.py

## synthesize2.py
#following instructions from https://github.com/mozilla/TTS/blob/master/notebooks/Benchmark.ipynb

#%load_ext autoreload
#%autoreload 2
import os
import sys
import io
import time
import numpy as np
#pip3 install --user numpy


from collections import OrderedDict
#from matplotlib import pylab as plt

import torch
#To install with CUDA 9.2. This worked for me
#https://developer.nvidia.com/cuda-92-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exenetwork
#pip3 install --user --no-cache-dir torch==1.2.0+cu92 torchvision==0.4.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html

#For trying CUDA 10.0. This didn't work for me
#https://developer.nvidia.com/cuda-10.0-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exelocal
#pip3 install --no-cache-dir --user torch==1.2.0 torchvision==0.4.0 -f https://download.pytorch.org/whl/cu100/torch_stable.html


# Local checkouts: the mozilla/TTS repository and the WaveRNN directory
# nested inside it.
TTS_PATH = r'C:\Users\sokka\Documents\tts\TTS'
WAVERNN_PATH = os.path.join(TTS_PATH, 'WaveRNN')

# Notebook-only setup from the original Benchmark notebook stays disabled in
# this script version:
#%pylab inline
#rcParams["figure.figsize"] = (16,5)
#import importlib
#importlib.import_module('TTS')

# Make both checkouts importable; only needed when TTS / WaveRNN are not
# installed globally.
sys.path.extend([TTS_PATH, WAVERNN_PATH])

import librosa
import librosa.display

from models.tacotron import Tacotron
from layers import *
from utils.data import *
from utils.audio import AudioProcessor
from utils.generic_utils import load_config, setup_model
from utils.text import text_to_sequence, cleaners
from utils.synthesis import synthesis
#from utils.visual import visualize


#import IPython
#from IPython.display import Audio
#pip3 install --user ipython

import os
import re
#os.environ['CUDA_VISIBLE_DEVICES']='1'
#os.environ['OMP_NUM_THREADS']='1'

# Report whether PyTorch can see a CUDA device; the result is kept in `iscuda`.
iscuda = torch.cuda.is_available()
print('torch.cuda.is_available()=', iscuda, sep='')

# Number of tts() calls so far; appended to output file names to keep them unique.
runcounter = 0


def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):
    """Synthesize ``text`` and save the audio as a wav file in ``OUT_FOLDER``.

    The spectrograms and a first waveform come from ``synthesis``. Unless
    ``use_gl`` is true, that waveform is replaced by one generated with the
    module-level ``wavernn`` vocoder (honouring ``batched_wavernn``).

    Args:
        model: TTS model handed to ``synthesis``.
        text: Text to synthesize; also used to derive the output file name.
        CONFIG: TTS config; ``CONFIG.model`` is compared against "Tacotron".
        use_cuda: Forwarded to ``synthesis``.
        ap: Audio processor providing ``out_linear_to_mel`` and ``save_wav``.
        use_gl: If true, keep the waveform returned by ``synthesis`` and skip
            the WaveRNN vocoder.
        speaker_id: Accepted for interface compatibility; currently unused.
        figures: Accepted for interface compatibility; currently unused
            (the visualization call is disabled).

    Returns:
        Tuple ``(alignment, mel_postnet_spec, stop_tokens, waveform)``.
    """
    global runcounter
    t_1 = time.time()

    # Name the file after the first "<token><optional space><digits>" run in
    # the whitespace-normalized text, falling back to "tempout".
    submatch = re.sub(r'\s+', ' ', text)
    file_namematch = re.search(r'([^\s]+\s?\d+)', submatch)
    stem = file_namematch.group(0) if file_namematch else 'tempout'
    # Characters that are reserved in Windows file names (or act as path
    # separators) would make the save below fail, so neutralize them.
    stem = re.sub(r'[<>:"/\\|?*]', '_', stem)
    file_name = stem + '_' + str(runcounter) + '.wav'
    runcounter += 1

    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
        model, text, CONFIG, use_cuda, ap, truncated=False)

    if not use_gl:
        if CONFIG.model == "Tacotron":
            mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
        # Put the vocoder input on whatever device the vocoder weights live on
        # instead of unconditionally calling .cuda().
        vocoder_device = next(wavernn.parameters()).device
        vocoder_input = torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).to(vocoder_device)
        waveform = wavernn.generate(vocoder_input, batched=batched_wavernn, target=11000, overlap=550)

    print(" >  Run-time: {}".format(time.time() - t_1))
    #if figures:
    #    visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, mel_spec)
    #IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate']))
    os.makedirs(OUT_FOLDER, exist_ok=True)

    out_path = os.path.join(OUT_FOLDER, file_name)
    ap.save_wav(waveform, out_path)
    return alignment, mel_postnet_spec, stop_tokens, waveform


# Set constants
ROOT_PATH = TTS_PATH
# Directories holding the checkpoint + config of the TTS model and of the
# WaveRNN vocoder; each was previously repeated as a literal.
TTS_MODEL_DIR = r'C:\Users\sokka\Documents\tts\wavernn_mold\ljspeech_8a47b46'
VOCODER_MODEL_DIR = r'C:\Users\sokka\Documents\tts\wavernn_mold\wavernn_mold_8a1c152'

MODEL_PATH = os.path.join(TTS_MODEL_DIR, 'checkpoint_261000.pth.tar')
CONFIG_PATH = os.path.join(TTS_MODEL_DIR, 'config.json')
OUT_FOLDER = os.path.join(ROOT_PATH, 'AudioSamples/benchmark_samples/')
CONFIG = load_config(CONFIG_PATH)
VOCODER_MODEL_PATH = os.path.join(VOCODER_MODEL_DIR, 'checkpoint_433000.pth.tar')
VOCODER_CONFIG_PATH = os.path.join(VOCODER_MODEL_DIR, 'config.json')
VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)
# Request the GPU only when PyTorch can actually see one, rather than
# hard-coding True.
use_cuda = torch.cuda.is_available()

# Set some config fields manually for testing
# CONFIG.windowing = False
# CONFIG.prenet_dropout = False
# CONFIG.separate_stopnet = True
# CONFIG.stopnet = True

# Set the vocoder
use_gl = False # use GL if True
batched_wavernn = True    # use batched wavernn inference if True


# LOAD TTS MODEL
from utils.text.symbols import symbols, phonemes

# Size the model input from whichever symbol set the config selects.
if CONFIG.use_phonemes:
    num_chars = len(phonemes)
else:
    num_chars = len(symbols)
model = setup_model(num_chars, CONFIG)

# Audio processor built from the audio section of the config.
ap = AudioProcessor(**CONFIG.audio)

# Read the checkpoint. Without CUDA, the map_location callback returns each
# storage as deserialized instead of restoring it to the device it was saved on.
if not use_cuda:
    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)
else:
    cp = torch.load(MODEL_PATH)

# Restore the weights, move to the GPU when requested, switch to eval mode
# and print the step stored in the checkpoint.
model.load_state_dict(cp['model'])
if use_cuda:
    model.cuda()
model.eval()
print(cp['step'])


# LOAD WAVERNN
# The vocoder is only needed when the waveform is not taken from synthesis().
if not use_gl:
    from WaveRNN.models.wavernn import Model
    bits = 10

    wavernn = Model(
        rnn_dims=512,
        fc_dims=512,
        mode="mold",
        pad=2,
        upsample_factors=VOCODER_CONFIG.upsample_factors,  # set this depending on dataset
        feat_dims=VOCODER_CONFIG.audio["num_mels"],
        compute_dims=128,
        res_out_dims=128,
        res_blocks=10,
        hop_length=ap.hop_length,
        sample_rate=ap.sample_rate,
    )

    # Mirror the TTS checkpoint handling: without CUDA, keep the deserialized
    # storages as they are instead of restoring them to their saved device.
    if use_cuda:
        check = torch.load(VOCODER_MODEL_PATH)
    else:
        check = torch.load(VOCODER_MODEL_PATH, map_location=lambda storage, loc: storage)
    wavernn.load_state_dict(check['model'])
    # Move to the GPU only when requested (previously .cuda() was also called
    # unconditionally right after construction).
    if use_cuda:
        wavernn.cuda()
    wavernn.eval()
    print(check['step'])


# Matches any character that is not a word character, digit, whitespace or
# one of . , ; ! ?
illegalchars_exclusive = re.compile(r'[^\w\d\.\,\;\!\?\s]')
# Matches runs of two or more whitespace characters.
repitiion = re.compile(r'\s{2,}')


def custom_text_fix(sentence):
    """Blank out unsupported characters and collapse whitespace runs.

    Every character matched by ``illegalchars_exclusive`` is replaced by a
    single space; afterwards each run of two or more whitespace characters is
    reduced to one space. Leading and trailing whitespace is not stripped.
    """
    blanked = illegalchars_exclusive.sub(' ', sentence)
    return repitiion.sub(' ', blanked)


# Inference setup: eval mode, decoder step limit of 2000, and the speaker id
# that is handed to tts().
model.eval()
model.decoder.max_decoder_steps = 2000
speaker_id = 0

sentences = [
    "Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go.",
]

for sentence in sentences:
    # Blank out unsupported characters first, then apply the English cleaners.
    sentence = cleaners.english_cleaners(custom_text_fix(sentence))
    alizgn, spec, stop_tokens, wav = tts(
        model, sentence, CONFIG, use_cuda, ap,
        use_gl=use_gl, speaker_id=speaker_id, figures=True,
    )
	#following instructions from https://github.com/mozilla/TTS/blob/master/notebooks/Benchmark.ipynb

	#%load_ext autoreload
	#%autoreload 2
	import os
	import sys
	import io
	import time
	import numpy as np
	#pip3 install --user numpy


	from collections import OrderedDict
	#from matplotlib import pylab as plt

	import torch
	#To install with CUDA 9.2. This worked for me
	#https://developer.nvidia.com/cuda-92-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exenetwork
	#pip3 install --user --no-cache-dir torch==1.2.0+cu92 torchvision==0.4.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html

	#For trying CUDA 10.0. This didn't work for me
	#https://developer.nvidia.com/cuda-10.0-download-archive?target_os=Windows&target_arch=x86_64&target_version=10&target_type=exelocal
	#pip3 install --no-cache-dir --user torch==1.2.0 torchvision==0.4.0 -f https://download.pytorch.org/whl/cu100/torch_stable.html


	# Local checkouts: the mozilla/TTS repository and the WaveRNN directory
	# nested inside it.
	TTS_PATH = r'C:\Users\sokka\Documents\tts\TTS'
	WAVERNN_PATH = os.path.join(TTS_PATH, 'WaveRNN')

	# Notebook-only setup from the original Benchmark notebook stays disabled
	# in this script version:
	#%pylab inline
	#rcParams["figure.figsize"] = (16,5)
	#import importlib
	#importlib.import_module('TTS')

	# Make both checkouts importable; only needed when TTS / WaveRNN are not
	# installed globally.
	sys.path.extend([TTS_PATH, WAVERNN_PATH])

	import librosa
	import librosa.display

	from models.tacotron import Tacotron
	from layers import *
	from utils.data import *
	from utils.audio import AudioProcessor
	from utils.generic_utils import load_config, setup_model
	from utils.text import text_to_sequence, cleaners
	from utils.synthesis import synthesis
	#from utils.visual import visualize




	#import IPython
	#from IPython.display import Audio
	#pip3 install --user ipython

	import os
	import re
	#os.environ['CUDA_VISIBLE_DEVICES']='1'
	#os.environ['OMP_NUM_THREADS']='1'

	# Report whether PyTorch can see a CUDA device; the result is kept in `iscuda`.
	iscuda = torch.cuda.is_available()
	print('torch.cuda.is_available()=', iscuda, sep='')

	# NOTE(review): this tab-indented copy duplicates the tts() definition
	# earlier in the file; its nested indentation had been lost and is restored.
	# Number of tts() calls so far; appended to output file names to keep them unique.
	runcounter = 0


	def tts(model, text, CONFIG, use_cuda, ap, use_gl, speaker_id=None, figures=True):
	    """Synthesize ``text`` and save the audio as a wav file in ``OUT_FOLDER``.

	    The spectrograms and a first waveform come from ``synthesis``. Unless
	    ``use_gl`` is true, that waveform is replaced by one generated with the
	    module-level ``wavernn`` vocoder (honouring ``batched_wavernn``).

	    Args:
	        model: TTS model handed to ``synthesis``.
	        text: Text to synthesize; also used to derive the output file name.
	        CONFIG: TTS config; ``CONFIG.model`` is compared against "Tacotron".
	        use_cuda: Forwarded to ``synthesis``.
	        ap: Audio processor providing ``out_linear_to_mel`` and ``save_wav``.
	        use_gl: If true, keep the waveform returned by ``synthesis`` and
	            skip the WaveRNN vocoder.
	        speaker_id: Accepted for interface compatibility; currently unused.
	        figures: Accepted for interface compatibility; currently unused
	            (the visualization call is disabled).

	    Returns:
	        Tuple ``(alignment, mel_postnet_spec, stop_tokens, waveform)``.
	    """
	    global runcounter
	    t_1 = time.time()

	    # Name the file after the first "<token><optional space><digits>" run
	    # in the whitespace-normalized text, falling back to "tempout".
	    submatch = re.sub(r'\s+', ' ', text)
	    file_namematch = re.search(r'([^\s]+\s?\d+)', submatch)
	    stem = file_namematch.group(0) if file_namematch else 'tempout'
	    # Characters that are reserved in Windows file names (or act as path
	    # separators) would make the save below fail, so neutralize them.
	    stem = re.sub(r'[<>:"/\\|?*]', '_', stem)
	    file_name = stem + '_' + str(runcounter) + '.wav'
	    runcounter += 1

	    waveform, alignment, mel_spec, mel_postnet_spec, stop_tokens = synthesis(
	        model, text, CONFIG, use_cuda, ap, truncated=False)

	    if not use_gl:
	        if CONFIG.model == "Tacotron":
	            mel_postnet_spec = ap.out_linear_to_mel(mel_postnet_spec.T).T
	        # Put the vocoder input on whatever device the vocoder weights live
	        # on instead of unconditionally calling .cuda().
	        vocoder_device = next(wavernn.parameters()).device
	        vocoder_input = torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0).to(vocoder_device)
	        waveform = wavernn.generate(vocoder_input, batched=batched_wavernn, target=11000, overlap=550)

	    print(" > Run-time: {}".format(time.time() - t_1))
	    #if figures:
	    # visualize(alignment, mel_postnet_spec, stop_tokens, text, ap.hop_length, CONFIG, mel_spec)
	    #IPython.display.display(Audio(waveform, rate=CONFIG.audio['sample_rate']))
	    os.makedirs(OUT_FOLDER, exist_ok=True)

	    out_path = os.path.join(OUT_FOLDER, file_name)
	    ap.save_wav(waveform, out_path)
	    return alignment, mel_postnet_spec, stop_tokens, waveform



	# Set constants
	ROOT_PATH = TTS_PATH
	# Directories holding the checkpoint + config of the TTS model and of the
	# WaveRNN vocoder; each was previously repeated as a literal.
	TTS_MODEL_DIR = r'C:\Users\sokka\Documents\tts\wavernn_mold\ljspeech_8a47b46'
	VOCODER_MODEL_DIR = r'C:\Users\sokka\Documents\tts\wavernn_mold\wavernn_mold_8a1c152'

	MODEL_PATH = os.path.join(TTS_MODEL_DIR, 'checkpoint_261000.pth.tar')
	CONFIG_PATH = os.path.join(TTS_MODEL_DIR, 'config.json')
	OUT_FOLDER = os.path.join(ROOT_PATH, 'AudioSamples/benchmark_samples/')
	CONFIG = load_config(CONFIG_PATH)
	VOCODER_MODEL_PATH = os.path.join(VOCODER_MODEL_DIR, 'checkpoint_433000.pth.tar')
	VOCODER_CONFIG_PATH = os.path.join(VOCODER_MODEL_DIR, 'config.json')
	VOCODER_CONFIG = load_config(VOCODER_CONFIG_PATH)
	# Request the GPU only when PyTorch can actually see one, rather than
	# hard-coding True.
	use_cuda = torch.cuda.is_available()

	# Set some config fields manually for testing
	# CONFIG.windowing = False
	# CONFIG.prenet_dropout = False
	# CONFIG.separate_stopnet = True
	# CONFIG.stopnet = True

	# Set the vocoder
	use_gl = False # use GL if True
	batched_wavernn = True # use batched wavernn inference if True



	# LOAD TTS MODEL
	# NOTE(review): this tab-indented copy duplicates the loading code earlier
	# in the file; the if/else bodies had lost their indentation and are restored.
	from utils.text.symbols import symbols, phonemes

	# Size the model input from whichever symbol set the config selects.
	num_chars = len(phonemes) if CONFIG.use_phonemes else len(symbols)
	model = setup_model(num_chars, CONFIG)

	# Audio processor built from the audio section of the config.
	ap = AudioProcessor(**CONFIG.audio)

	# Read the checkpoint. Without CUDA, the map_location callback returns each
	# storage as deserialized instead of restoring it to its saved device.
	if use_cuda:
	    cp = torch.load(MODEL_PATH)
	else:
	    cp = torch.load(MODEL_PATH, map_location=lambda storage, loc: storage)

	# Restore the weights, move to the GPU when requested, switch to eval mode
	# and print the step stored in the checkpoint.
	model.load_state_dict(cp['model'])
	if use_cuda:
	    model.cuda()
	model.eval()
	print(cp['step'])



	# LOAD WAVERNN
	# NOTE(review): this tab-indented copy duplicates the vocoder loading code
	# earlier in the file; its nested indentation had been lost and is restored.
	# The vocoder is only needed when the waveform is not taken from synthesis().
	if not use_gl:
	    from WaveRNN.models.wavernn import Model
	    bits = 10

	    wavernn = Model(
	        rnn_dims=512,
	        fc_dims=512,
	        mode="mold",
	        pad=2,
	        upsample_factors=VOCODER_CONFIG.upsample_factors,  # set this depending on dataset
	        feat_dims=VOCODER_CONFIG.audio["num_mels"],
	        compute_dims=128,
	        res_out_dims=128,
	        res_blocks=10,
	        hop_length=ap.hop_length,
	        sample_rate=ap.sample_rate,
	    )

	    # Mirror the TTS checkpoint handling: without CUDA, keep the
	    # deserialized storages as they are instead of restoring them to
	    # their saved device.
	    if use_cuda:
	        check = torch.load(VOCODER_MODEL_PATH)
	    else:
	        check = torch.load(VOCODER_MODEL_PATH, map_location=lambda storage, loc: storage)
	    wavernn.load_state_dict(check['model'])
	    # Move to the GPU only when requested (previously .cuda() was also
	    # called unconditionally right after construction).
	    if use_cuda:
	        wavernn.cuda()
	    wavernn.eval()
	    print(check['step'])



	# NOTE(review): this tab-indented copy duplicates custom_text_fix() from
	# earlier in the file; the function body had lost its indentation and is
	# restored.
	# Matches any character that is not a word character, digit, whitespace or
	# one of . , ; ! ?
	illegalchars_exclusive = re.compile(r'[^\w\d\.\,\;\!\?\s]')
	# Matches runs of two or more whitespace characters.
	repitiion = re.compile(r'\s{2,}')


	def custom_text_fix(sentence):
	    """Blank out unsupported characters and collapse whitespace runs.

	    Every character matched by ``illegalchars_exclusive`` is replaced by a
	    single space; afterwards each run of two or more whitespace characters
	    is reduced to one space. Leading and trailing whitespace is not stripped.
	    """
	    newsentance = illegalchars_exclusive.sub(' ', sentence)
	    newsentance = repitiion.sub(' ', newsentance)
	    return newsentance


	# NOTE(review): this tab-indented copy duplicates the synthesis loop
	# earlier in the file; the loop body had lost its indentation and is restored.
	# Inference setup: eval mode, decoder step limit of 2000, and the speaker
	# id that is handed to tts().
	model.eval()
	model.decoder.max_decoder_steps = 2000
	speaker_id = 0

	sentences = ["Bill got in the habit of asking himself “Is that thought true?” And if he wasn’t absolutely certain it was, he just let it go." ]

	for sentence in sentences:
	    # Blank out unsupported characters first, then apply the English cleaners.
	    sentence = custom_text_fix(sentence)
	    sentence = cleaners.english_cleaners(sentence)
	    alizgn, spec, stop_tokens, wav = tts(model, sentence, CONFIG, use_cuda, ap, speaker_id=speaker_id, use_gl=use_gl, figures=True)