Create a gist now

Instantly share code, notes, and snippets.

@r9y9 /gmmmap.py
Last active Aug 29, 2015

GMM-based statistical voice conversion module (http://r9y9.github.io/blog/2014/07/13/statistical-voice-conversion-wakaran/)
#!/usr/bin/python
# coding: utf-8
import numpy as np
from numpy import linalg
from sklearn.mixture import GMM
import scipy.linalg
import scipy.sparse
import scipy.sparse.linalg
class GMMMap:
    """GMM-based frame-by-frame speech parameter mapping.

    GMMMap transforms spectral features of a source speaker into those of a
    target speaker, based on a Gaussian Mixture Model trained on joint
    (source, target) feature vectors.

    Notation
    --------
    Source speaker's feature: X = {x_t}, 0 <= t < T
    Target speaker's feature: Y = {y_t}, 0 <= t < T
    where T is the number of time frames.

    Parameters
    ----------
    gmm : sklearn.mixture.GMM
        Full-covariance GMM of the joint features. The object must expose
        `means_`, `covars_` and `weights_`; the first half of every joint
        vector is read as X and the second half as Y.
    swap : bool
        False (default): the first half of the joint vector is the source
        and the second half is the target.
        True: the two halves exchange roles, i.e. the mapping goes from the
        second half to the first half.

    Attributes
    ----------
    num_mixtures : int
        the number of Gaussian mixtures
    weights : array, shape (`num_mixtures`)
        weights for each gaussian
    src_means : array, shape (`num_mixtures`, `order of spectral feature`)
        means of GMM for a source speaker
    tgt_means : array, shape (`num_mixtures`, `order of spectral feature`)
        means of GMM for a target speaker
    covarXX, covarXY, covarYX, covarYY : array, shape (`num_mixtures`,
            `order of spectral feature`, `order of spectral feature`)
        the four blocks of every joint covariance matrix
        (source/source, source/target, target/source, target/target)
    D : array, shape (`num_mixtures`, `order of spectral feature`,
            `order of spectral feature`)
        conditional covariance of the target given the source for every
        mixture: covarYY - covarYX * covarXX^-1 * covarXY
    px : sklearn.mixture.GMM
        GMM of the source features only, used to get mixture posteriors

    Reference
    ---------
    - [Toda 2007] Voice Conversion Based on Maximum Likelihood Estimation
      of Spectral Parameter Trajectory.
      http://isw3.naist.jp/~tomoki/Tomoki/Journals/IEEE-Nov-2007_MLVC.pdf
    """

    def __init__(self, gmm, swap=False):
        self._split_joint_parameters(gmm, swap)

        # Eq.(12) in [Toda 2007]
        self.D = self._conditional_covariances()

        # p(x), which is used to compute posterior prob. for a given source
        # spectral feature in mapping stage.
        self.px = GMM(n_components=self.num_mixtures, covariance_type="full")
        self.px.means_ = self.src_means
        self.px.covars_ = self.covarXX
        self.px.weights_ = self.weights

    def _split_joint_parameters(self, gmm, swap):
        """Split the joint GMM parameters into source and target parts."""
        means, covars = gmm.means_, gmm.covars_
        self.num_mixtures, joint_dim = means.shape[0], means.shape[1]
        if joint_dim % 2:
            raise ValueError("joint feature dimension must be even, got %d"
                             % joint_dim)
        # Floor division keeps D an int (usable as a slice bound) on both
        # Python 2 and Python 3.
        D = joint_dim // 2

        self.weights = gmm.weights_
        self.src_means = means[:, :D]
        self.tgt_means = means[:, D:]
        self.covarXX = covars[:, :D, :D]
        self.covarXY = covars[:, :D, D:]
        self.covarYX = covars[:, D:, :D]
        self.covarYY = covars[:, D:, D:]

        if swap:
            # Exchange the roles of the two halves of the joint vector.
            self.src_means, self.tgt_means = self.tgt_means, self.src_means
            self.covarXX, self.covarYY = self.covarYY, self.covarXX
            self.covarXY, self.covarYX = self.covarYX, self.covarXY

    def _conditional_covariances(self):
        """Return covarYY - covarYX * covarXX^-1 * covarXY per mixture."""
        D = self.tgt_means.shape[1]
        cond = np.zeros((self.num_mixtures, D, D))
        for m in range(self.num_mixtures):
            # solve() instead of an explicit inverse of covarXX
            xx_inv_xy = np.linalg.solve(self.covarXX[m], self.covarXY[m])
            cond[m] = self.covarYY[m] - np.dot(self.covarYX[m], xx_inv_xy)
        return cond

    def convert(self, src):
        """
        Map a source spectral feature x to a target spectral feature y
        in the minimum mean squared error sense, i.e. return the
        conditional mean E[y|x].

        Parameters
        ----------
        src : array, shape (`order of spectral feature`)
            source speaker's spectral feature that will be transformed

        Return
        ------
        array, shape (1, `order of spectral feature`)
            converted spectral feature (a single row, because the posterior
            is computed for a one-frame batch)
        """
        src = np.asarray(src)

        # Eq.(11): conditional mean of y for every mixture
        E = np.zeros((self.num_mixtures, len(src)))
        for m in range(self.num_mixtures):
            xx = np.linalg.solve(self.covarXX[m], src - self.src_means[m])
            E[m] = self.tgt_means[m] + np.dot(self.covarYX[m], xx)

        # Eq.(9) p(m|x)
        posterior = self.px.predict_proba(np.atleast_2d(src))

        # Eq.(13): posterior-weighted sum of the per-mixture means
        return posterior.dot(E)
class TrajectoryGMMMap(GMMMap):
    """
    Trajectory-based speech parameter mapping for voice conversion
    based on the maximum likelihood criterion.

    Parameters
    ----------
    gmm : sklearn.mixture.GMM
        Gaussian Mixture Models of source and target speaker joint features.
        Every joint vector is laid out as
        [src static, src delta, tgt static, tgt delta].
    T : int
        the number of frames the weight matrix is initially built for
    gv : sklearn.mixture.GMM (default=None)
        Gaussian Mixture Models of target speaker's global variance of
        spectral feature. Only the first mixture is read.
    swap : bool (default=False)
        Forwarded unchanged to GMMMap.__init__; True exchanges the roles of
        the two halves of the joint feature.

    Attributes
    ----------
    T : int
        the number of frames the current weight matrix `W` was built for
    W : scipy.sparse.csr_matrix, shape (2*D*T, D*T)
        matrix mapping a static feature sequence to the interleaved
        static + delta sequence (D is the static feature order)
    gv_mean, gv_covar, Pv : array
        mean, covariance and inverse covariance of the global variance
        model; only set when `gv` is given
    E : array, shape (2*D*T)
        set by `convert`: per-frame conditional means, flattened
    D : scipy.sparse.csr_matrix, shape (2*D*T, 2*D*T)
        set by `convert`: block diagonal matrix of per-frame conditional
        precision matrices. Before the first `convert` call this attribute
        still holds whatever GMMMap.__init__ stored in it.

    Reference
    ---------
    - [Toda 2007] Voice Conversion Based on Maximum Likelihood Estimation
      of Spectral Parameter Trajectory.
      http://isw3.naist.jp/~tomoki/Tomoki/Journals/IEEE-Nov-2007_MLVC.pdf
    """

    def __init__(self, gmm, T, gv=None, swap=False):
        GMMMap.__init__(self, gmm, swap)
        self.T = T

        # shape[1] = d(src) + d(src_delta) + d(tgt) + d(tgt_delta)
        # Floor division keeps D an int on both Python 2 and Python 3.
        D = gmm.means_.shape[1] // 4

        ## Setup for Trajectory-based mapping
        self.__construct_weight_matrix(T, D)

        ## Setup for GV post-filtering
        # It is assumed that GV is modeled as a single mixture GMM
        if gv is not None:
            self.gv_mean = gv.means_[0]
            self.gv_covar = gv.covars_[0]
            self.Pv = np.linalg.inv(self.gv_covar)

    def __construct_weight_matrix(self, T, D):
        """Build the sparse weight matrix W of eq.(25) ~ (28) into self.W.

        For frame t, block row 2t selects the static feature y_t and block
        row 2t+1 computes the delta 0.5 * (y_{t+1} - y_{t-1}); neighbours
        outside [0, T) are dropped.

        Raises
        ------
        ValueError
            if T or D is smaller than 1
        """
        if T < 1 or D < 1:
            raise ValueError("T and D must be positive, got T=%r, D=%r"
                             % (T, D))

        frames = np.arange(T)
        # Frame-level (2T x T) coefficients; each entry is expanded to a
        # D x D scaled identity by the Kronecker product below.
        rows = np.concatenate([2 * frames,           # static:  +1.0 * y_t
                               2 * frames[1:] + 1,   # delta:   -0.5 * y_{t-1}
                               2 * frames[:-1] + 1])  # delta:  +0.5 * y_{t+1}
        cols = np.concatenate([frames, frames[:-1], frames[1:]])
        vals = np.concatenate([np.ones(T),
                               np.repeat(-0.5, T - 1),
                               np.repeat(0.5, T - 1)])
        coeffs = scipy.sparse.coo_matrix((vals, (rows, cols)),
                                         shape=(2 * T, T))

        # One Kronecker product instead of stacking T blocks one by one.
        self.W = scipy.sparse.kron(coeffs, scipy.sparse.identity(D),
                                   format="csr")

    def convert(self, src):
        """
        Map a sequence of source spectral features x to target spectral
        features y so that the likelihood of y given x is maximized.

        As a side effect `self.E` and `self.D` are overwritten with the
        quantities of eq.(40) and eq.(41) for this sequence, and `self.W` /
        `self.T` are rebuilt when the sequence length differs from the one
        the weight matrix was built for.

        Parameters
        ----------
        src : array, shape (`the number of frames`,
                2 * `the order of static spectral feature`)
            a sequence of source speaker's static + delta spectral features
            that will be transformed

        Return
        ------
        array, shape (`the number of frames`,
                `the order of static spectral feature`)
            a sequence of transformed static spectral features
        """
        src = np.asarray(src)
        T, D = src.shape[0], src.shape[1] // 2

        # Rebuild W whenever it does not fit this sequence, and remember the
        # new length so that a later call with the old length rebuilds too.
        if T != self.T or self.W.shape != (2 * D * T, D * T):
            self.__construct_weight_matrix(T, D)
            self.T = T

        # A suboptimum mixture sequence (eq.37)
        optimum_mix = self.px.predict(src)

        # Conditional mean (eq.22) for every frame and conditional precision
        # (inverse of eq.23) for every mixture that actually occurs. Frames
        # sharing a mixture are handled together, so each matrix is
        # factorised once per mixture instead of once per frame.
        E = np.zeros((T, 2 * D))
        precisions = {}
        for m in np.unique(optimum_mix):
            frames = np.flatnonzero(optimum_mix == m)
            centered = (src[frames] - self.src_means[m]).T
            shift = np.dot(self.covarYX[m],
                           np.linalg.solve(self.covarXX[m], centered))
            E[frames] = self.tgt_means[m] + shift.T

            xx_inv_xy = np.linalg.solve(self.covarXX[m], self.covarXY[m])
            cond_covar = self.covarYY[m] - np.dot(self.covarYX[m], xx_inv_xy)
            precisions[m] = np.linalg.inv(cond_covar)

        # E of eq.(40)
        self.E = E.flatten()

        # D^-1 of eq.(41), assembled directly as a sparse matrix so the
        # dense (2*D*T x 2*D*T) array is never materialised.
        self.D = scipy.sparse.block_diag(
            [precisions[m] for m in optimum_mix], format="csr")

        # Compute target static features, eq.(39):
        #   (W' D^-1 W) y = W' D^-1 E
        Wt_Dinv = self.W.T.dot(self.D)
        covar = scipy.sparse.csc_matrix(Wt_Dinv.dot(self.W))
        y = scipy.sparse.linalg.spsolve(covar, Wt_Dinv.dot(self.E),
                                        use_umfpack=False)
        return y.reshape((T, D))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment