Skip to content

Instantly share code, notes, and snippets.

@FarisHijazi
Created October 12, 2021 14:06
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FarisHijazi/a5ac05baae213f2f830dfa981c521270 to your computer and use it in GitHub Desktop.
Save FarisHijazi/a5ac05baae213f2f830dfa981c521270 to your computer and use it in GitHub Desktop.
"""
VoiceFixer
$ python voicefixer_cli.py -i ~/Downloads/download.mp3
# for running on multiple files
$ find path/to/folder -name "*.wav" -not -name "*denoised*" |xargs -P 20 -I{} sh -c 'python voicefixer_cli.py -i {}'
"""
"""
# installation:
sudo apt-get install libsox-fmt-all libsox-dev sox libsndfile-dev
pip install torchaudio
pip install ffmpeg-python
pip install --upgrade ddsp
pip install voicefixer==0.0.9
pip install git+https://github.com/facebookresearch/WavAugment.git
pip install note_seq
pip install tensorflow
"""
import argparse
from pathlib import Path
# Sample rate (Hz) passed to the distortion effect chain and used when
# writing the distorted audio file.
rate = 16000

# The module docstring is passed as `description`. The first positional
# parameter of ArgumentParser is `prog`, so passing `__doc__` positionally
# would replace the program name in the usage line with the whole docstring.
parser = argparse.ArgumentParser(
    description=__doc__,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
)
# `--input` is required: the script calls `.as_posix()` on it right after
# parsing, which would raise AttributeError on the implicit default of None.
parser.add_argument('-i', '--input', type=Path, required=True, help='input file path')
parser.add_argument(
    '-o', '--out',
    default='{input}{mode}{noisified}_denoised.wav',
    help='denoised output file path; may contain the placeholders '
         '{input}, {mode} and {noisified}',
)
parser.add_argument('-n', '--noisify', action='store_true', help='add noise before denoising')
parser.add_argument(
    '-m', '--modes',
    nargs='+',
    type=int,
    choices=[0, 1, 2],  # a list keeps the order shown in --help deterministic
    default=[0, 1, 2],
    help='VoiceFixer restoration mode(s) to run, one output file per mode',
)
args = parser.parse_args()
# The rest of the script works with the input path as a plain string.
args.input = args.input.as_posix()

# The third-party imports below run only after argument parsing has succeeded.
print('importing')
# import IPython.display as ipd
import augment
import librosa
import matplotlib.pyplot as plt
import numpy as np
import scipy.io
import soundfile as sf
import torch
from matplotlib import cm
from scipy.io import wavfile
from voicefixer import VoiceFixer
#@title Run this block to define some helper functions
def uniform_sample(lower, upper):
    """Draw one value uniformly at random between ``lower`` and ``upper``.

    Args:
        lower: Lower bound of the sampling interval.
        upper: Upper bound of the sampling interval.

    Returns:
        A Python ``float``. When the bounds differ by less than ``1e-5`` the
        interval is treated as a single point and ``upper`` is returned
        (converted to ``float`` so the return type is the same on both paths).
    """
    if abs(lower - upper) < 1e-5:
        return float(upper)
    # torch.rand(1) yields one sample in [0, 1); scale and shift it into range.
    return float((upper - lower) * torch.rand(1) + lower)
def show_spectrogram(file_path, sr=44100):
    """Plot the linear and mel log-magnitude spectrograms of an audio file.

    The two spectrograms are drawn side by side in one matplotlib figure,
    which is then displayed with ``plt.show()``.

    Args:
        file_path: Path of the audio file; it is loaded with ``librosa.load``.
        sr: Sample rate (Hz) the audio is loaded at and that is passed to the
            plotting calls. Defaults to 44100, the value that was previously
            hard-coded.
    """
    # `librosa.display` is a submodule; import it explicitly rather than
    # relying on `import librosa` having loaded it.
    import librosa.display

    samples, _ = librosa.load(file_path, sr=sr)

    plt.figure(figsize=(12, 4))

    plt.subplot(121)
    plt.title("Linear Spectrogram")
    librosa.display.specshow(
        # The 1e-8 offset avoids log10(0) on silent bins.
        np.log10(np.abs(librosa.stft(samples)) + 1e-8),
        sr=sr,
        x_axis='frames',
        y_axis='linear',
        cmap=cm.jet,
        vmax=2.8,
        vmin=-1.7,
    )

    plt.subplot(122)
    plt.title("Mel Spectrogram")
    librosa.display.specshow(
        # The audio is passed as keyword `y`: recent librosa releases no
        # longer accept it positionally.
        np.log10(np.abs(librosa.feature.melspectrogram(y=samples, sr=sr)) + 1e-8),
        sr=sr,
        x_axis='frames',
        y_axis='mel',
        cmap=cm.jet,
        vmax=2.5,
        vmin=-4,
    )

    plt.show()
print('loading voice filter', end=' ... ')
vf = VoiceFixer()  # Initialize a VoiceFixer instance.
print('loaded voicefixer')

# File handed to VoiceFixer. It is the original input unless --noisify is
# given, in which case it is replaced below by the distorted copy.
restore_input = args.input

if args.noisify:
    if '{mode}' not in args.out:
        # Without {mode} the distorted file and every restored file share one
        # path, so each restore pass would read the previous pass's output.
        print("Warning: --out has no {mode} placeholder; the distorted and restored files will overwrite each other")

    # The audio is only decoded here because only the distortion step needs
    # the samples; VoiceFixer reads its input from a file path.
    sr, a = scipy.io.wavfile.read(args.input)
    if a.ndim == 2:
        a = a[:, 0]  # multi-channel input: keep the first channel only
    a = librosa.util.normalize(a.astype('float32'))
    # Keyword arguments: recent librosa releases reject positional rates.
    a = librosa.resample(a, orig_sr=sr, target_sr=rate)
    signal = torch.tensor(a)

    # Generate Random Distortion Parameters
    clipping_ratio = uniform_sample(lower=0.25, upper=1.0)
    print("clipping ratio: ", clipping_ratio)  # lower>=0.1, upper<=1.0
    lowpass_frequency = uniform_sample(lower=4000, upper=8000)
    print("lowpass cutoff frequency: ", lowpass_frequency)  # lower>=1000, and upper have no limit
    reverb_level = uniform_sample(lower=10, upper=80)  # lower>=0, upper<=100
    dumping_factor = uniform_sample(lower=10, upper=50)  # lower>=0, upper<=100
    room_size = uniform_sample(lower=10, upper=50)
    print("reverberate level, dumping_factor, room_size = ", reverb_level, dumping_factor, room_size)  # lower>=0, upper<=100
    highpass = uniform_sample(lower=10, upper=1500)
    print("highpass cutoff frequency: ", highpass)  # lower>=0, upper<=2000

    # Apply Distortion Effects
    effect_chain = augment.EffectChain().clip(clipping_ratio)  # clipping ratio
    effect_chain = effect_chain.lowpass(lowpass_frequency)  # remove high frequency information
    effect_chain = effect_chain.reverb(reverb_level, dumping_factor, room_size).channels(1)  # reverberate level, dumping factor, room size
    effect_chain = effect_chain.highpass(highpass)  # remove low frequency information
    noise_generator = lambda: torch.zeros_like(signal).uniform_()
    effect_chain = effect_chain.additive_noise(noise_generator, snr=15)
    y = effect_chain.apply(signal, src_info={'rate': rate}, target_info={'rate': rate})

    distorted_path = args.out.format(input=args.input, noisified="_noisified", mode='_distorted')
    print("Distorted speech file name:", distorted_path)
    sf.write(distorted_path, y[0, ...].numpy(), rate)

    # Restore the distorted audio. Previously the untouched original was
    # restored, although the outputs are labelled "_noisified".
    restore_input = distorted_path

# Use the GPU only when one is available instead of unconditionally.
use_cuda = torch.cuda.is_available()

for mode in args.modes:
    out = args.out.format(input=args.input, noisified="_noisified" if args.noisify else "", mode=f'_mode{mode}')
    print("Restore: ", out)
    vf.restore(input=restore_input, output=out, cuda=use_cuda, mode=mode)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment