Created
September 12, 2017 17:23
-
-
Save r9y9/ed5abc86fff3397b78ab68e57c2a4cb7 to your computer and use it in GitHub Desktop.
GMM voice conversion using https://github.com/r9y9/nnmnkwii
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""GMM-based voice conversion""" | |
import os
import sys
import time
from os.path import basename, expanduser, join, splitext

import numpy as np
import pysptk
import pyworld
from scipy.io import wavfile
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split

from nnmnkwii.baseline.gmm import MLPG
from nnmnkwii.datasets import FileSourceDataset
from nnmnkwii.datasets.cmu_arctic import CMUArcticWavFileDataSource
from nnmnkwii.metrics import melcd
from nnmnkwii.preprocessing import remove_zeros_frames, delta_features
from nnmnkwii.preprocessing.alignment import DTWAligner
from nnmnkwii.util import apply_each2d_trim
from pysptk.synthesis import MLSADF, Synthesizer
# ---------------------------------------------------------------------------
# Global configuration
# ---------------------------------------------------------------------------
DATA_ROOT = join(expanduser("~"), "data", "cmu_arctic")

fs = 16000                                    # sampling rate [Hz] of CMU ARCTIC
fftlen = pyworld.get_cheaptrick_fft_size(fs)  # FFT size matched to fs by WORLD
alpha = pysptk.util.mcepalpha(fs)             # all-pass constant for this fs
order = 24                                    # mel-cepstrum analysis order
frame_period = 5                              # analysis frame shift [ms]
max_files = 100                               # utterances to load per speaker
use_delta = True                              # add dynamic features (enables MLPG)
diff = True                                   # synthesize via differential MLSA filtering

# Finite-difference windows for dynamic-feature computation: always the
# static window; delta and delta-delta are appended when use_delta is on.
windows = [(0, 0, np.array([1.0]))]
if use_delta:
    windows += [
        (1, 1, np.array([-0.5, 0.0, 0.5])),
        (1, 1, np.array([1.0, -2.0, 1.0])),
    ]
class CMUArcticSpectrumDataSource(CMUArcticWavFileDataSource):
    """Mel-cepstrum data source over CMU ARCTIC wav files.

    The collected file list is split 70/30 into train/test with a fixed
    seed; only the training paths are returned for feature collection,
    while the held-out paths are stashed on ``self.test_paths`` so the
    script can synthesize them later.
    """

    def __init__(self, *args, **kwargs):
        super(CMUArcticSpectrumDataSource, self).__init__(*args, **kwargs)
        # Populated by collect_files(): paths reserved for evaluation.
        self.test_paths = None

    def collect_files(self):
        all_paths = super(CMUArcticSpectrumDataSource, self).collect_files()
        train_paths, test_paths = train_test_split(
            all_paths, test_size=0.3, random_state=1234)
        # Remember the held-out paths for the synthesis stage.
        self.test_paths = test_paths
        return train_paths

    def collect_features(self, path):
        """Return the mel-cepstrum sequence extracted from one wav file."""
        sr, wav = wavfile.read(path)
        wav = wav.astype(np.float64)
        # WORLD analysis: F0 via DIO, spectral envelope via CheapTrick.
        f0, timeaxis = pyworld.dio(wav, sr, frame_period=frame_period)
        sp = pyworld.cheaptrick(wav, f0, timeaxis, sr)
        return pysptk.sp2mc(sp, order=order, alpha=alpha)
# ---------------------------------------------------------------------------
# Data sources
# Building a parallel corpus requires reading each speaker separately.
# ---------------------------------------------------------------------------
clb_source = CMUArcticSpectrumDataSource(
    data_root=DATA_ROOT, speakers=["clb"], max_files=max_files)
slt_source = CMUArcticSpectrumDataSource(
    data_root=DATA_ROOT, speakers=["slt"], max_files=max_files)

# Stack utterances into padded 3D tensors (N x T x D).
X = FileSourceDataset(clb_source).asarray(padded_length=1200)
Y = FileSourceDataset(slt_source).asarray(padded_length=1200)

# Time-align the two speakers frame-by-frame with DTW, using
# mel-cepstral distortion as the alignment cost.
X, Y = DTWAligner(verbose=0, dist=melcd).transform((X, Y))

# Drop the 0th mel-cepstral coefficient (c0); model spectral shape only.
X, Y = X[:, :, 1:], Y[:, :, 1:]
static_dim = X.shape[-1]

# Append dynamic (delta) features per utterance when requested.
if use_delta:
    X = apply_each2d_trim(delta_features, X, windows)
    Y = apply_each2d_trim(delta_features, Y, windows)

# Joint source/target features, flattened to frames; padded zero frames
# are removed before fitting.
XY = np.concatenate((X, Y), axis=-1).reshape(-1, X.shape[-1] * 2)
XY = remove_zeros_frames(XY)
print(XY.shape)

# Fit the joint-density GMM on the paired frames.
gmm = GaussianMixture(
    n_components=32, covariance_type="full", max_iter=100, verbose=1)
gmm.fit(XY)

# Parameter-generation frontend: GMM mapping followed by MLPG smoothing.
paramgen = MLPG(gmm, windows=windows, diff=diff)
# ---------------------------------------------------------------------------
# Waveform generation for the held-out test utterances
# ---------------------------------------------------------------------------
# Ensure the output directory exists before the first wavfile.write call
# (the original script crashed with FileNotFoundError if it was missing).
os.makedirs("samples", exist_ok=True)

for idx, path in enumerate(clb_source.test_paths):
    # WORLD analysis of the source utterance.  Note: this `fs` is the
    # file's own sampling rate, deliberately shadowing the global constant.
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum: keep c0 aside, convert only the spectral-shape part.
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)

    since = time.time()
    mc = paramgen.transform(mc)  # GMM mapping + MLPG trajectory smoothing
    print("{}, Elapsed time in conversion: {}s".format(idx, time.time() - since))
    assert mc.shape[-1] == static_dim

    # Put the source c0 coefficient back in front of the converted shape.
    mc = np.hstack((c0[:, None], mc))
    if diff:
        # Differential filtering: run the source waveform through an MLSA
        # filter driven by the *difference* mel-cepstrum (c0 zeroed so the
        # source power is preserved).
        mc[:, 0] = 0
        # Bug fixes: use the global `order` (was a hard-coded 24 that would
        # silently break if `order` were changed) and derive the hop size
        # from the frame period instead of hard-coding 80 (= 16000*5/1000).
        hopsize = int(fs * frame_period / 1000)
        engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hopsize)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Full resynthesis with WORLD from the converted spectral envelope.
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, frame_period)

    suffix = "mlpg" if use_delta else "mse"
    wavfile.write("samples/{}_{}.wav".format(splitext(basename(path))[0], suffix),
                  fs, waveform.astype(np.int16))
    print("Source range:", np.min(x), np.max(x))
    print("Converted range:", np.min(waveform), np.max(waveform))

sys.exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment