Skip to content

Instantly share code, notes, and snippets.

@r9y9
Created September 12, 2017 17:23
Show Gist options
  • Save r9y9/ed5abc86fff3397b78ab68e57c2a4cb7 to your computer and use it in GitHub Desktop.
Save r9y9/ed5abc86fff3397b78ab68e57c2a4cb7 to your computer and use it in GitHub Desktop.
GMM voice conversion using https://github.com/r9y9/nnmnkwii
"""GMM-based voice conversion"""
import os
import sys
import time
from os.path import join, expanduser, basename, splitext

import numpy as np
import pyworld
import pysptk
from nnmnkwii.datasets import FileSourceDataset
from nnmnkwii.datasets.cmu_arctic import CMUArcticWavFileDataSource
from nnmnkwii.preprocessing.alignment import DTWAligner
from nnmnkwii.util import apply_each2d_trim
from nnmnkwii.preprocessing import remove_zeros_frames, delta_features
from nnmnkwii.metrics import melcd
from nnmnkwii.baseline.gmm import MLPG
from pysptk.synthesis import MLSADF, Synthesizer
from scipy.io import wavfile
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
# Location of the CMU ARCTIC corpus on disk.
DATA_ROOT = join(expanduser("~"), "data", "cmu_arctic")

# Analysis settings shared by training and conversion.
fs = 16000
fftlen = pyworld.get_cheaptrick_fft_size(fs)
alpha = pysptk.util.mcepalpha(fs)
order = 24          # mel-cepstrum order handed to pysptk.sp2mc
frame_period = 5    # forwarded to pyworld as `frame_period`
max_files = 100     # per-speaker cap handed to the data sources

# Feature / synthesis switches.
use_delta = True    # append the dynamic windows below to the static features
diff = True         # selects the MLSA-filter synthesis path over WORLD synthesis

# Each window is a (left width, right width, coefficients) tuple. The static
# window is always present; the two dynamic windows are added on request.
windows = [(0, 0, np.array([1.0]))]
if use_delta:
    windows += [
        (1, 1, np.array([-0.5, 0.0, 0.5])),
        (1, 1, np.array([1.0, -2.0, 1.0])),
    ]
class CMUArcticSpectrumDataSource(CMUArcticWavFileDataSource):
    """CMU ARCTIC wav source that yields mel-cepstrum features.

    File collection keeps 70% of the wav paths for training and remembers
    the remaining 30% in ``test_paths`` so they can be converted later.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent source; no test split yet."""
        super(CMUArcticSpectrumDataSource, self).__init__(*args, **kwargs)
        # Filled in by collect_files().
        self.test_paths = None

    def collect_files(self):
        """Return the training paths and stash the held-out ones.

        The split uses a fixed ``random_state`` so it is reproducible.
        """
        all_paths = super(CMUArcticSpectrumDataSource, self).collect_files()
        train_paths, held_out_paths = train_test_split(
            all_paths, test_size=0.3, random_state=1234)
        self.test_paths = held_out_paths
        return train_paths

    def collect_features(self, path):
        """Compute the mel-cepstrum of the wav file at ``path``.

        Uses the module-level ``frame_period``, ``order`` and ``alpha``.
        The 0th coefficient is included in the result.
        """
        sample_rate, samples = wavfile.read(path)
        samples = samples.astype(np.float64)
        f0, timeaxis = pyworld.dio(
            samples, sample_rate, frame_period=frame_period)
        envelope = pyworld.cheaptrick(samples, f0, timeaxis, sample_rate)
        return pysptk.sp2mc(envelope, order=order, alpha=alpha)
# Data sources
# A parallel dataset is required, so each speaker is read through its own
# source object.
clb_source = CMUArcticSpectrumDataSource(
    data_root=DATA_ROOT, speakers=["clb"], max_files=max_files)
slt_source = CMUArcticSpectrumDataSource(
    data_root=DATA_ROOT, speakers=["slt"], max_files=max_files)

# Load each speaker's utterances into one 3D array (N x T x D), padded to a
# common length of 1200 frames.
X = FileSourceDataset(clb_source).asarray(padded_length=1200)
Y = FileSourceDataset(slt_source).asarray(padded_length=1200)

# Align source and target frames, using mel-cepstral distortion as the
# DTW distance.
X, Y = DTWAligner(verbose=0, dist=melcd).transform((X, Y))

# Drop the 0th coefficient of every frame.
X = X[:, :, 1:]
Y = Y[:, :, 1:]
static_dim = X.shape[-1]

if use_delta:
    X, Y = (apply_each2d_trim(delta_features, feats, windows)
            for feats in (X, Y))

# Joint source/target features, flattened to one row per frame; all-zero
# (padding) frames are removed before fitting.
XY = np.concatenate((X, Y), axis=-1)
XY = XY.reshape(-1, 2 * X.shape[-1])
XY = remove_zeros_frames(XY)
print(XY.shape)

gmm = GaussianMixture(
    n_components=32, covariance_type="full", max_iter=100, verbose=1)
gmm.fit(XY)

# Parameter generation
paramgen = MLPG(gmm, windows=windows, diff=diff)
# Waveform generation for test set
#
# Every held-out source-speaker utterance is analysed, converted with the
# trained GMM and written to samples/<utterance>_<suffix>.wav.
out_dir = "samples"
# wavfile.write does not create missing directories.
os.makedirs(out_dir, exist_ok=True)

suffix = "mlpg" if use_delta else "mse"
int16_range = np.iinfo(np.int16)

for idx, path in enumerate(clb_source.test_paths):
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)

    # Keep the 0th coefficient aside; only the remaining ones are converted.
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)

    since = time.time()
    mc = paramgen.transform(mc)
    print("{}, Elapsed time in conversion: {}s".format(idx, time.time() - since))
    assert mc.shape[-1] == static_dim

    mc = np.hstack((c0[:, None], mc))
    if diff:
        # The converted coefficients are filtered onto the source waveform,
        # with the 0th coefficient zeroed out.
        mc[:, 0] = 0
        # Derive the filter settings from the analysis configuration instead
        # of hard-coding order=24 / hopsize=80, so they stay in sync when
        # `order`, `frame_period` or the sampling rate change.
        # (assumes frame_period is in milliseconds -- TODO confirm)
        hopsize = int(fs * frame_period // 1000)
        engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hopsize)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, frame_period)

    # Casting floats to int16 does not saturate, so clip first to keep
    # out-of-range samples from turning into garbage values.
    pcm = np.clip(waveform, int16_range.min, int16_range.max).astype(np.int16)
    out_name = "{}_{}.wav".format(splitext(basename(path))[0], suffix)
    wavfile.write("{}/{}".format(out_dir, out_name), fs, pcm)

    # Ranges are reported before clipping so overshoot stays visible.
    print("Source range:", np.min(x), np.max(x))
    print("Converted range:", np.min(waveform), np.max(waveform))

# NOTE(review): the original indentation of this script was lost; the exit is
# placed after the loop so that every test utterance is converted -- confirm.
sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment