Playing around with the demo_cli script from https://github.com/CorentinJ/Real-Time-Voice-Cloning to implement other ideas.
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
import numpy as np
import librosa
import argparse
import torch
import sys
import random
import string
import re
import math


def smooth(x, window_len=11, window='hamming'):
    """Smooth a 1-D signal by convolving it with a window of the given type."""
    if x.ndim != 1:
        raise ValueError("smooth only accepts 1 dimension arrays.")
    if x.size < window_len:
        raise ValueError("Input vector needs to be bigger than window size.")
    if window_len < 3:
        return x
    if window not in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']:
        raise ValueError("Window must be one of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'")
    if window == 'flat':  # moving average
        w = np.ones(window_len, 'd')
    else:
        # Look up the numpy window function by name instead of using eval().
        w = getattr(np, window)(window_len)
    y = np.convolve(w / w.sum(), x, mode='valid')
    return y
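
# A quick sanity check of smooth(), as a sketch only (values assume the 'flat'
# moving-average window; the output is shorter than the input because
# mode='valid' trims window_len - 1 samples):
#
#   >>> smooth(np.arange(10, dtype=float), window_len=5, window='flat')
#   array([2., 3., 4., 5., 6., 7.])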


def make_text_buckets(text, ideal=30):
    # First remove special characters (disabled for now).
    stripped = text  # re.sub(r"[^a-zA-Z0-9]+", ' ', text)
    # Split on spaces so we can bucket each word.
    split = stripped.split(' ')
    # Bucket size shrinks as the text gets longer (roughly ideal**2 / word count).
    bucket_size = math.ceil((ideal / len(split)) * ideal)
    items = [split[i:i + bucket_size] for i in range(0, len(split), bucket_size)]
    res = []
    for i in items:
        res.append(' '.join(i))
    return res
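
# A small worked example of the bucketing, as a sketch: with ideal=3 a six-word
# sentence gives bucket_size = ceil((3 / 6) * 3) = 2 words, so three buckets:
#
#   >>> make_text_buckets("the quick brown fox jumps over", ideal=3)
#   ['the quick', 'brown fox', 'jumps over']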


def random_string(N):
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=N))
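
# random_string is only used to name output files; random_string(16) might
# return something like 'K3G7QX0M2BV9W1ZD' (illustrative value only).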


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("-e", "--enc_model_fpath", type=Path,
                        default="encoder/saved_models/pretrained.pt",
                        help="Path to a saved encoder")
    parser.add_argument("-s", "--syn_model_dir", type=Path,
                        default="synthesizer/saved_models/logs-pretrained/",
                        help="Directory containing the synthesizer model")
    parser.add_argument("-v", "--voc_model_fpath", type=Path,
                        default="vocoder/saved_models/pretrained/pretrained.pt",
                        help="Path to a saved vocoder")
    parser.add_argument("--low_mem", action="store_true", help=\
        "If True, the memory used by the synthesizer will be freed after each use. Adds a large "
        "overhead but saves some GPU memory for lower-end GPUs.")
    parser.add_argument("--window", type=int, default=5)
    parser.add_argument("--length", type=int, default=30)
    parser.add_argument("--output")
    parser.add_argument("voice")
    parser.add_argument("text")
    args = parser.parse_args()

    output_fname = '{}/{}.wav'.format('outputs', random_string(16))
    if args.output:
        output_fname = args.output
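
    # A hypothetical invocation (the script filename is assumed; 'voice' is the
    # reference audio to clone and 'text' is what to synthesize):
    #   python clone_cli.py samples/reference.wav "Some text to read aloud" \
    #       --output outputs/result.wav --length 30 --window 11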

    ## Print some environment information (for debugging purposes)
    if not torch.cuda.is_available():
        print("Your PyTorch installation is not configured to use CUDA. If you have a GPU ready "
              "for deep learning, ensure that the drivers are properly installed, and that your "
              "CUDA version matches your PyTorch installation. CPU-only inference is currently "
              "not supported.", file=sys.stderr)
        quit(-1)
    device_id = torch.cuda.current_device()
    gpu_properties = torch.cuda.get_device_properties(device_id)
    print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
          "%.1fGb total memory.\n" %
          (torch.cuda.device_count(),
           device_id,
           gpu_properties.name,
           gpu_properties.major,
           gpu_properties.minor,
           gpu_properties.total_memory / 1e9))

    ## Load the models one by one.
    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(args.enc_model_fpath)
    synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
    vocoder.load_model(args.voc_model_fpath)

    try:
        # Get the reference audio filepath of the voice to be cloned (mp3, wav, m4a, flac, ...).
        in_fpath = Path(args.voice.replace("\"", "").replace("\'", ""))

        ## Computing the embedding
        # First, we load the wav using the function that the speaker encoder provides. This is
        # important: there is preprocessing that must be applied.
        # The following two methods are equivalent:
        # - Directly load from the filepath:
        preprocessed_wav = encoder.preprocess_wav(in_fpath)
        # - If the wav is already loaded:
        original_wav, sampling_rate = librosa.load(in_fpath)
        preprocessed_wav = encoder.preprocess_wav(original_wav, sampling_rate)
        print("Loaded file successfully")

        # Then we derive the embedding. There are many functions and parameters that the
        # speaker encoder interfaces. These are mostly for in-depth research. You will typically
        # only use this function (with its default parameters):
        embed = encoder.embed_utterance(preprocessed_wav)
        print("Created the embedding")

        # Split the text into optimally sized buckets and reuse the same embedding for each.
        texts = make_text_buckets(args.text, ideal=args.length)
        embeds = [embed for _ in texts]

        # If you know what the attention layer alignments are, you can retrieve them here by
        # passing return_alignments=True
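        # (Hypothetical sketch of that variant, not used below:)
        #   specs, alignments = synthesizer.synthesize_spectrograms(texts, embeds,
        #                                                           return_alignments=True)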
        specs = synthesizer.synthesize_spectrograms(texts, embeds)

        # Vocode each spectrogram and trim leading/trailing silence from each chunk.
        generated_wav = []
        for spec in specs:
            curr_wav = vocoder.infer_waveform(spec)
            yt, index = librosa.effects.trim(curr_wav, top_db=100)
            generated_wav.append(yt)
        combined = np.concatenate(generated_wav)

        # Re-split on silence and append half a second of silence to each spoken
        # interval so the chunks don't run into each other.
        intervals = librosa.effects.split(combined, top_db=100)
        res = []
        for start, end in intervals:
            res.append(
                np.pad(
                    combined[start:end],
                    (0, int(synthesizer.sample_rate / 2)),
                    mode='constant'
                )
            )
        output = np.concatenate(res).astype(np.float32)

        # Finally, smooth the output.
        output = smooth(output, window_len=args.window)
        # Note: librosa.output.write_wav was removed in librosa 0.8, so this
        # assumes an older librosa (as the upstream repo used at the time).
        librosa.output.write_wav(
            output_fname,
            output,
            synthesizer.sample_rate
        )
        print("\nSaved output as %s\n\n" % output_fname)
    except Exception as e:
        print("Caught exception: %s" % repr(e))
        print("Restarting\n")