Applying WebRTC Voice Activity Detection (VAD) to an audio file and saving the result in a WAV file along with the original audio for inspection in Audacity.
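In outline: the script decodes the input with sox to 16-bit PCM, runs frame-level WebRTC VAD on each channel, discards frames whose local energy falls below a quantile-based threshold, merges nearby voiced segments with a morphological closing, and writes one two-channel WAV per input channel pairing the original audio with a scaled voice-activity mask.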
import argparse
import subprocess
import numpy as np
import scipy.io.wavfile
import scipy.ndimage
import webrtcvad

parser = argparse.ArgumentParser()
parser.add_argument('--audio-path', '-i', required = True)
parser.add_argument('--sample-rate', '-r', type = int, default = 8_000, choices = [8_000, 16_000, 32_000, 48_000], help = 'Sample rate used to load and normalize audio (in Hz)')
parser.add_argument('--mono', action = 'store_true', help = 'Downmix the input to a single channel')
parser.add_argument('--aggressiveness', type = int, choices = [0, 1, 2, 3], default = 3, help = 'WebRTC VAD aggressiveness mode (3 is the most aggressive)')
parser.add_argument('--window-size', type = float, choices = [0.01, 0.02, 0.03], default = 0.02, help = 'VAD window size (in seconds)')
parser.add_argument('--gain', type = float, default = 0.8, help = 'Amplitude of the voice-activity mask channel, relative to the peak of the audio channel')
parser.add_argument('--window-size-merge', type = float, default = 1.0, help = 'Gaps between voiced segments shorter than this (in seconds) are merged')
parser.add_argument('--window-size-max', type = float, default = 0.2, help = 'Window (in seconds) of the running-maximum filter applied to sample amplitudes')
parser.add_argument('--energy-percentile', type = float, default = 0.9, help = 'Quantile of voiced-sample amplitudes used as the energy threshold')
args = parser.parse_args()

# Query the channel count with soxi, unless --mono forces a downmix to a single channel
num_channels = int(subprocess.check_output(['soxi', '-V0', '-c', args.audio_path])) if not args.mono else 1
# Decode with sox to headerless 16-bit signed little-endian PCM at the requested rate and reshape the interleaved samples to (num_samples, num_channels)
signal = np.frombuffer(subprocess.check_output(['sox', '-V0', args.audio_path, '-b', '16', '-e', 'signed', '--endian', 'little', '-r', str(args.sample_rate), '-c', str(num_channels), '-t', 'raw', '-']), dtype = np.int16).reshape(-1, num_channels)

vad = webrtcvad.Vad(args.aggressiveness)

# Convert window sizes from seconds to samples (the merge filter operates on frames, not samples)
frame_len = int(args.window_size * args.sample_rate)
merge_filter_size = int(args.window_size_merge * args.sample_rate / frame_len)
max_filter_size = int(args.window_size_max * args.sample_rate)

# Expand per-frame VAD decisions to per-sample resolution, truncated to the channel length
inflate = lambda voice, channel: np.repeat(voice, frame_len)[:len(channel)]
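# A minimal sketch of inflate with hypothetical values:
#   frame_len = 2; inflate(np.array([True, False]), np.zeros(3)) -> array([ True,  True, False])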
for c, channel in enumerate(signal.T):
    # Frame-level VAD decisions; a trailing partial frame is marked as non-speech
    voice = np.array([vad.is_speech(channel[sample_idx : sample_idx + frame_len].tobytes(), args.sample_rate) if sample_idx + frame_len <= len(channel) else False for sample_idx in range(0, len(channel), frame_len)])
    channel_abs = np.abs(channel)
    # Energy threshold: a quantile of the amplitudes of samples the VAD marked as speech
    energy_threshold = np.quantile(channel_abs[inflate(voice, channel)], args.energy_percentile)
    # Keep only frames whose locally maximal amplitude exceeds the threshold
    voice &= (scipy.ndimage.maximum_filter1d(channel_abs, max_filter_size, mode = 'constant') > energy_threshold)[::frame_len]
    # Merge voiced segments separated by short gaps via morphological closing
    voice = scipy.ndimage.binary_closing(voice, np.ones((merge_filter_size,), dtype = bool))
    # Write a two-channel WAV: the original audio alongside the scaled voice-activity mask
    output_path = args.audio_path + f'.{c}.wav'
    scipy.io.wavfile.write(output_path, args.sample_rate, np.vstack([channel, inflate(voice, channel).astype(channel.dtype) * int(channel.max() * args.gain)]).T)
    print(output_path)
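
A minimal usage sketch, assuming the script above is saved as vad.py and that sox/soxi are on PATH (the script name and input file are illustrative):

    python3 vad.py -i recording.wav -r 16000 --aggressiveness 3

For each channel this prints the output path (e.g. recording.wav.0.wav); opening it in Audacity shows the original audio and the rectangular voice-activity mask as the two channels of one stereo track.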