Skip to content

Instantly share code, notes, and snippets.

@huchenxucs
Created August 21, 2020 07:53
Show Gist options
  • Save huchenxucs/1567469d30ece87eded5418754c3c4e7 to your computer and use it in GitHub Desktop.
Save huchenxucs/1567469d30ece87eded5418754c3c4e7 to your computer and use it in GitHub Desktop.
Extract a mel spectrogram from a wav file; the result can be used with the pwg and waveglow vocoders.
def process_utterance(wav_path,
                      fft_size=1024,
                      hop_size=256,
                      win_length=1024,
                      window="hann",
                      num_mels=80,
                      fmin=80,
                      fmax=7600,
                      eps=1e-10,
                      sample_rate=22050,
                      loud_norm=False,
                      min_level_db=-100,
                      return_linear=False,
                      trim_long_sil=False, vocoder='pwg',
                      change_loud=False,
                      loud_range_min=0.9, loud_range_max=1.1):
    """Compute a (log-)mel spectrogram and the frame-aligned waveform.

    Args:
        wav_path: Either a ``str`` path to an audio file, or an already
            loaded waveform (anything that is not a ``str`` is used as the
            waveform directly; it is assumed to be a 1-D numpy array).
        fft_size: ``n_fft`` for the STFT and the mel filterbank.
        hop_size: STFT hop length in samples.
        win_length: STFT window length in samples.
        window: STFT window specification passed to ``librosa.stft``.
        num_mels: Number of mel bands.
        fmin: Lowest mel filter frequency; ``-1`` means ``0``.
        fmax: Highest mel filter frequency; ``-1`` means ``sample_rate / 2``.
        eps: Floor applied to the mel magnitudes before ``log10``
            (``vocoder='pwg'`` only).
        sample_rate: Sampling rate used for loading, loudness metering and
            the mel filterbank.
        loud_norm: If true, normalize integrated loudness to -22.0 with
            ``pyln`` and rescale by the peak if the result exceeds 1.
            Cannot be combined with ``change_loud``.
        min_level_db: Passed to ``audio.normalize`` for the linear
            spectrogram (``return_linear=True`` only).
        return_linear: If true, additionally return the linear spectrogram.
        trim_long_sil: If true (and ``wav_path`` is a path), load the audio
            through ``trim_long_silences`` instead of ``librosa``.
        vocoder: ``'pwg'`` (``log10`` of the floored mel) or ``'waveglow'``
            (``audio.dynamic_range_compression``).
        change_loud: If true, multiply the waveform by a random,
            piecewise-linear gain curve and clip it to [-1, 1].
        loud_range_min: One bound of the attenuating gain range (the other
            bound is the hard-coded 0.5).
        loud_range_max: One bound of the amplifying gain range (the other
            bound is the hard-coded 2).

    Returns:
        ``(wav, mel)`` or, when ``return_linear`` is true,
        ``(wav, mel, spc)``. ``mel`` is ``(num_mels, T)``; ``wav`` is
        zero-padded and cut to at most ``T * hop_size`` samples.

    Raises:
        ValueError: If ``vocoder`` is not supported, or if ``loud_norm`` and
            ``change_loud`` are both enabled.
    """
    # Validate arguments up front with real exceptions: ``assert`` is removed
    # under ``python -O``, which would let bad arguments through silently.
    if vocoder not in ('pwg', 'waveglow'):
        raise ValueError(f'"{vocoder}" is not in ["pwg", "waveglow"].')
    if loud_norm and change_loud:
        raise ValueError('loud_norm and change_loud cannot both be enabled.')

    if isinstance(wav_path, str):
        if trim_long_sil:
            wav, _ = trim_long_silences(wav_path, sample_rate)
        else:
            wav, _ = librosa.core.load(wav_path, sr=sample_rate)
    else:
        wav = wav_path

    if change_loud:
        # One amplifying and one attenuating anchor per 2 seconds of audio.
        sample_num = wav.shape[0] // (sample_rate * 2) + 1
        random_point = np.random.permutation(wav.shape[0])
        sample_up = random_point[:sample_num]
        sample_down = random_point[sample_num:2 * sample_num]
        # np.random.uniform is documented as undefined when low > high, which
        # is what the defaults produce (2 vs 1.1, 0.9 vs 0.5); order the
        # bounds so the same range is sampled in a well-defined way.
        up_low, up_high = sorted((2, loud_range_max))
        down_low, down_high = sorted((loud_range_min, 0.5))
        fp_up = np.random.uniform(up_low, up_high, sample_num)
        fp_down = np.random.uniform(down_low, down_high, sample_num)
        fp = np.concatenate([fp_up, fp_down])
        xp = np.concatenate([sample_up, sample_down])
        # np.interp requires increasing x-coordinates.
        index = np.argsort(xp)
        xp = xp[index]
        fp = fp[index]
        change_curve = np.interp(np.arange(wav.shape[0]), xp, fp)
        wav = wav * change_curve
        # Warn when more than 0.5% of the samples will be clipped.
        if (np.abs(wav) > 1.0).sum() / wav.shape[0] > 1 / 200:
            print("too much wav out of 1", wav_path)
        wav = np.clip(wav, -1.0, 1.0)

    if loud_norm:
        meter = pyln.Meter(sample_rate)  # create BS.1770 meter
        loudness = meter.integrated_loudness(wav)
        wav = pyln.normalize.loudness(wav, loudness, -22.0)
        # Loudness normalization may push the peak above full scale.
        peak = np.abs(wav).max()
        if peak > 1:
            wav = wav / peak

    # get amplitude spectrogram
    x_stft = librosa.stft(wav, n_fft=fft_size, hop_length=hop_size,
                          win_length=win_length, window=window,
                          pad_mode="constant")
    spc = np.abs(x_stft)  # (n_bins, T)

    # get mel basis
    # Compare by value: ``is -1`` is an identity test that fails for -1.0 or
    # numpy integers and triggers a SyntaxWarning on Python 3.8+.
    fmin = 0 if fmin == -1 else fmin
    fmax = sample_rate / 2 if fmax == -1 else fmax
    # Keyword arguments work on every librosa release; positional ones are
    # rejected by librosa >= 0.10.
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=fft_size,
                                    n_mels=num_mels, fmin=fmin, fmax=fmax)
    mel = mel_basis @ spc

    if vocoder == 'pwg':
        mel = np.log10(np.maximum(eps, mel))  # (n_mel_bins, T)
    else:  # 'waveglow' (validated above)
        mel = audio.dynamic_range_compression(mel)

    # Pad the waveform and cut it to a whole number of hops so it lines up
    # with the mel frames.
    l_pad, r_pad = audio.librosa_pad_lr(wav, fft_size, hop_size, 1)
    wav = np.pad(wav, (l_pad, r_pad), mode='constant', constant_values=0.0)
    wav = wav[:mel.shape[1] * hop_size]

    if not return_linear:
        return wav, mel

    spc = audio.amp_to_db(spc)
    spc = audio.normalize(spc, {'min_level_db': min_level_db})
    return wav, mel, spc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment