Style transfer gist
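A style-transfer experiment built on NVIDIA's Flowtron inference script: instead of sampling the latent z around zero, it averages the Flowtron residuals of a set of reference utterances (data/happy/*.wav here) and samples around that mean before vocoding with WaveGlow. Assuming the file is saved as style_transfer.py inside a Flowtron checkout (the script name and model paths are placeholders), a typical run would look like: python style_transfer.py -c config.json -f models/flowtron_ljs.pt -w models/waveglow_256channels.pt -t "Dogs are sitting by the door!" -i 0 -n 400 -s 0.5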
###############################################################################
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
import matplotlib
#matplotlib.use("Agg")
#import matplotlib.pylab as plt
import matplotlib.pyplot as plt
import os
import argparse
import json
import sys
import numpy as np
import torch
from flowtron import Flowtron
from torch.utils.data import DataLoader
from data import Data, load_wav_to_torch
from train import update_params

sys.path.insert(0, "tacotron2")
sys.path.insert(0, "tacotron2/waveglow")
from glow import WaveGlow
from scipy.io.wavfile import write
# unused leftover from a replication-padding attempt; tile() below is used instead
from torch.nn import ReplicationPad1d, ReflectionPad1d
from glob import glob
from torch.distributions import Normal
def tile(a, dim, n_tile):
    """Repeat each slice of tensor `a` along dimension `dim` n_tile times."""
    init_dim = a.size(dim)
    repeat_idx = [1] * a.dim()
    repeat_idx[dim] = n_tile
    a = a.repeat(*repeat_idx)
    order_index = torch.LongTensor(
        np.concatenate([init_dim * np.arange(n_tile) + i for i in range(init_dim)]))
    return torch.index_select(a, dim, order_index)
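# Note: despite the name, tile() performs element-wise repetition, i.e. it is
# equivalent to torch.repeat_interleave(a, n_tile, dim). With hypothetical
# shapes: a residual of shape (1, 80, 100) tiled along dim=2 with n_tile=4
# becomes (1, 80, 400), every frame repeated four times in place rather than
# the whole sequence repeated block-wise.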
def infer(flowtron_path, waveglow_path, text, speaker_id, n_frames, sigma,
          seed, utterance=None):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # load waveglow
    waveglow = torch.load(waveglow_path)['model'].cuda().eval()
    waveglow.cuda().half()
    for k in waveglow.convinv:
        k.float()  # keep the invertible 1x1 convolutions in fp32 for numerical stability
    waveglow.eval()
    # load flowtron
    model = Flowtron(**model_config).cuda()
    state_dict = torch.load(flowtron_path, map_location='cpu')['state_dict']
    model.load_state_dict(state_dict)
    model.eval()
    print("Loaded checkpoint '{}'".format(flowtron_path))
    ignore_keys = ['training_files', 'validation_files']
    trainset = Data(
        data_config['training_files'],
        **dict((k, v) for k, v in data_config.items() if k not in ignore_keys))
    speaker_vecs = trainset.get_speaker_id(speaker_id).cuda()
    text = trainset.get_text(text).cuda()
    # add batch dimensions
    speaker_vecs = speaker_vecs[None]
    text = text[None]
    # style category: reference utterances are read from data/<category>/*.wav
    category = "happy"
    with torch.no_grad():
        files = glob("data/" + category + "/*.wav")
        # accumulator for the latent residuals (80 mel channels assumed)
        residual_accumulator = torch.zeros((1, 80, n_frames)).to("cuda")
        for utterance in files:
            # note: the loop variable shadows the `utterance` argument, so this
            # branch is effectively dead (glob never yields None)
            if utterance is None:
                residual = torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma
            else:
                # transcript of the reference utterances
                utt_text = "Dogs are sitting by the door!"
                utt_text = trainset.get_text(utt_text).cuda()
                utt_text = utt_text[None]
                # load the reference audio and compute its mel spectrogram
                audio, _ = load_wav_to_torch(utterance)
                mel = trainset.get_mel(audio).to(device="cuda")
                # add a batch dimension before the forward pass
                mel = mel[None]
                out_lens = torch.LongTensor(1).to(device="cuda")
                out_lens[0] = mel.size(2)  # number of mel frames
                in_lens = torch.LongTensor([utt_text.shape[1]]).to(device="cuda")
                # the first output of the forward pass is the latent residual z
                residual, _, _, _, _, _, _ = model.forward(mel, speaker_vecs, utt_text, in_lens, out_lens)
                residual = residual.permute(1, 2, 0)
                residual = residual[:, :, :n_frames]
                if residual.shape[2] < n_frames:
                    num_tile = int(np.ceil(n_frames / residual.shape[2]))
                    # tiling instead of replication padding to reach n_frames
                    residual = tile(residual.cpu(), 2, num_tile).to("cuda")
            residual_accumulator = residual_accumulator + residual[:, :, :n_frames]
        residual_accumulator = residual_accumulator / len(files)
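        # The mean of the reference residuals acts as the style vector:
        # sampling z around it, instead of around zero as in plain Flowtron
        # inference, should pull the synthesized speech toward the reference
        # style. Averaging over time as well is a design choice of this
        # experiment, toggled below.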
        average_over_time = True
        if not average_over_time:
            # sample around the per-frame mean residual
            dist = Normal(residual_accumulator, sigma)
            z_style = dist.sample()
        else:
            # collapse the time axis, then broadcast the mean over n_frames
            residual_accumulator = residual_accumulator.mean(dim=2)
            dist = Normal(residual_accumulator, sigma)
            z_style = dist.sample((n_frames,)).permute(1, 2, 0)
        mels, attentions = model.infer(z_style, speaker_vecs, text)

        for k in range(len(attentions)):
            attention = torch.cat(attentions[k]).cpu().numpy()
            fig, axes = plt.subplots(1, 2, figsize=(16, 4))
            axes[0].imshow(mels[0].cpu().numpy(), origin='lower', aspect='auto')
            axes[1].imshow(attention[:, 0].transpose(), origin='lower', aspect='auto')
            fig.savefig('sid{}_sigma{}_attnlayer{}.png'.format(speaker_id, sigma, k))
            plt.close("all")

        # the waveglow sigma is fixed here, independent of the flowtron sigma above
        audio = waveglow.infer(mels.half(), sigma=0.8).float()
        audio = audio.cpu().numpy()[0]
        # normalize audio for now
        audio = audio / np.abs(audio).max()
        print(audio.shape)
        write("sid{}_sigma{}_{}_timeav{}_2_seed{}.wav".format(
                  speaker_id, sigma, category, average_over_time, seed),
              data_config['sampling_rate'], audio)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str,
                        help='JSON file for configuration')
    parser.add_argument('-p', '--params', nargs='+', default=[])
    parser.add_argument('-f', '--flowtron_path',
                        help='Path to flowtron state dict', type=str)
    parser.add_argument('-w', '--waveglow_path',
                        help='Path to waveglow state dict', type=str)
    parser.add_argument('-t', '--text', help='Text to synthesize', type=str)
    parser.add_argument('-i', '--id', help='Speaker id', type=int)
    parser.add_argument('-u', '--utterance', help='Utterance', type=str)
    parser.add_argument('-n', '--n_frames', help='Number of frames',
                        default=400, type=int)
    parser.add_argument('-o', '--output_dir', default='results/')
    parser.add_argument('-s', '--sigma', default=0.5, type=float)
    parser.add_argument('--seed', default=1234, type=int)  # 0 1234
    args = parser.parse_args()

    # Parse configs. Globals are nicer in this case
    with open(args.config) as f:
        data = f.read()
    global config
    config = json.loads(data)
    update_params(config, args.params)

    data_config = config["data_config"]
    global model_config
    model_config = config["model_config"]

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False

    infer(args.flowtron_path, args.waveglow_path, args.text, args.id,
          args.n_frames, args.sigma, args.seed, args.utterance)
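The script writes one attention plot per flow (sid{id}_sigma{sigma}_attnlayer{k}.png) and a wav whose name encodes the speaker id, sigma, style category, time-averaging flag and seed. Note that -o/--output_dir is parsed but never used, and the -u/--utterance argument is shadowed by the loop variable inside infer(), so reference wavs always come from data/<category>/*.wav; the directory layout is an assumption of this experiment, not something fixed by Flowtron itself.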