Created
September 12, 2017 17:23
-
-
Save r9y9/ed5abc86fff3397b78ab68e57c2a4cb7 to your computer and use it in GitHub Desktop.
GMM voice conversion using https://github.com/r9y9/nnmnkwii
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""GMM-based voice conversion""" | |
import os
import sys
import time
from os.path import basename, expanduser, join, splitext

import numpy as np
import pysptk
import pyworld
from scipy.io import wavfile
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

from nnmnkwii.baseline.gmm import MLPG
from nnmnkwii.datasets import FileSourceDataset
from nnmnkwii.datasets.cmu_arctic import CMUArcticWavFileDataSource
from nnmnkwii.metrics import melcd
from nnmnkwii.preprocessing import remove_zeros_frames, delta_features
from nnmnkwii.preprocessing.alignment import DTWAligner
from nnmnkwii.util import apply_each2d_trim
from pysptk.synthesis import MLSADF, Synthesizer
# ---------------------------------------------------------------------------
# Global configuration
# ---------------------------------------------------------------------------
DATA_ROOT = join(expanduser("~"), "data", "cmu_arctic")

fs = 16000                                    # sampling rate [Hz] of CMU ARCTIC
fftlen = pyworld.get_cheaptrick_fft_size(fs)  # FFT size matched to fs by WORLD
alpha = pysptk.util.mcepalpha(fs)             # all-pass constant for this fs
order = 24                                    # mel-cepstrum analysis order
frame_period = 5                              # analysis frame shift [ms]
max_files = 100                               # utterances to load per speaker
use_delta = True                              # add dynamic features (enables MLPG)
diff = True                                   # synthesize via differential MLSA filtering

# Finite-difference windows for dynamic-feature computation: always the
# static window; delta and delta-delta are appended when use_delta is on.
windows = [(0, 0, np.array([1.0]))]
if use_delta:
    windows += [
        (1, 1, np.array([-0.5, 0.0, 0.5])),
        (1, 1, np.array([1.0, -2.0, 1.0])),
    ]
class CMUArcticSpectrumDataSource(CMUArcticWavFileDataSource):
    """Mel-cepstrum data source over CMU ARCTIC wav files.

    The collected file list is split 70/30 into train/test with a fixed
    seed; only the training paths are returned for feature collection,
    while the held-out paths are stashed on ``self.test_paths`` so the
    script can synthesize them later.
    """

    def __init__(self, *args, **kwargs):
        super(CMUArcticSpectrumDataSource, self).__init__(*args, **kwargs)
        # Populated by collect_files(): paths reserved for evaluation.
        self.test_paths = None

    def collect_files(self):
        all_paths = super(CMUArcticSpectrumDataSource, self).collect_files()
        train_paths, test_paths = train_test_split(
            all_paths, test_size=0.3, random_state=1234)
        # Remember the held-out paths for the synthesis stage.
        self.test_paths = test_paths
        return train_paths

    def collect_features(self, path):
        """Return the mel-cepstrum sequence extracted from one wav file."""
        sr, wav = wavfile.read(path)
        wav = wav.astype(np.float64)
        # WORLD analysis: F0 via DIO, spectral envelope via CheapTrick.
        f0, timeaxis = pyworld.dio(wav, sr, frame_period=frame_period)
        sp = pyworld.cheaptrick(wav, f0, timeaxis, sr)
        return pysptk.sp2mc(sp, order=order, alpha=alpha)
# ---------------------------------------------------------------------------
# Data sources
# Building a parallel corpus requires reading each speaker separately.
# ---------------------------------------------------------------------------
clb_source = CMUArcticSpectrumDataSource(
    data_root=DATA_ROOT, speakers=["clb"], max_files=max_files)
slt_source = CMUArcticSpectrumDataSource(
    data_root=DATA_ROOT, speakers=["slt"], max_files=max_files)

# Stack utterances into padded 3D tensors (N x T x D).
X = FileSourceDataset(clb_source).asarray(padded_length=1200)
Y = FileSourceDataset(slt_source).asarray(padded_length=1200)

# Time-align the two speakers frame-by-frame with DTW, using
# mel-cepstral distortion as the alignment cost.
X, Y = DTWAligner(verbose=0, dist=melcd).transform((X, Y))

# Drop the 0th mel-cepstral coefficient (c0); model spectral shape only.
X, Y = X[:, :, 1:], Y[:, :, 1:]
static_dim = X.shape[-1]

# Append dynamic (delta) features per utterance when requested.
if use_delta:
    X = apply_each2d_trim(delta_features, X, windows)
    Y = apply_each2d_trim(delta_features, Y, windows)

# Joint source/target features, flattened to frames; padded zero frames
# are removed before fitting.
XY = np.concatenate((X, Y), axis=-1).reshape(-1, X.shape[-1] * 2)
XY = remove_zeros_frames(XY)
print(XY.shape)

# Fit the joint-density GMM on the paired frames.
gmm = GaussianMixture(
    n_components=32, covariance_type="full", max_iter=100, verbose=1)
gmm.fit(XY)

# Parameter-generation frontend: GMM mapping followed by MLPG smoothing.
paramgen = MLPG(gmm, windows=windows, diff=diff)
# ---------------------------------------------------------------------------
# Waveform generation for the held-out test utterances
# ---------------------------------------------------------------------------
# Ensure the output directory exists before the first wavfile.write call
# (the original script crashed with FileNotFoundError if it was missing).
os.makedirs("samples", exist_ok=True)

for idx, path in enumerate(clb_source.test_paths):
    # WORLD analysis of the source utterance.  Note: this `fs` is the
    # file's own sampling rate, deliberately shadowing the global constant.
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum: keep c0 aside, convert only the spectral-shape part.
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)

    since = time.time()
    mc = paramgen.transform(mc)  # GMM mapping + MLPG trajectory smoothing
    print("{}, Elapsed time in conversion: {}s".format(idx, time.time() - since))
    assert mc.shape[-1] == static_dim

    # Put the source c0 coefficient back in front of the converted shape.
    mc = np.hstack((c0[:, None], mc))
    if diff:
        # Differential filtering: run the source waveform through an MLSA
        # filter driven by the *difference* mel-cepstrum (c0 zeroed so the
        # source power is preserved).
        mc[:, 0] = 0
        # Bug fixes: use the global `order` (was a hard-coded 24 that would
        # silently break if `order` were changed) and derive the hop size
        # from the frame period instead of hard-coding 80 (= 16000*5/1000).
        hopsize = int(fs * frame_period / 1000)
        engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hopsize)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Full resynthesis with WORLD from the converted spectral envelope.
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, frame_period)

    suffix = "mlpg" if use_delta else "mse"
    wavfile.write("samples/{}_{}.wav".format(splitext(basename(path))[0], suffix),
                  fs, waveform.astype(np.int16))
    print("Source range:", np.min(x), np.max(x))
    print("Converted range:", np.min(waveform), np.max(waveform))

sys.exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment