@noetits, forked from johnmeade/speech_noise.py. Created December 28, 2021.
Use VAD to separate out regions without speech content, and estimate mean power of background noise
"""
Estimate background noise power level of a speech waveform.
Requires some non-speech regions in the wave.
Requirements:
pip install numpy librosa soundfile webrtcvad
MIT License John Meade 2021
"""
import struct

import librosa
import numpy as np
import soundfile as sf
import webrtcvad

int16_max = (2 ** 15) - 1

def float_to_pcm16(wav):
    "pack a wav, in [-1, 1] numpy float array format, into 16 bit PCM bytes"
    return struct.pack("%dh" % len(wav), *np.round(wav * int16_max).astype(np.int16))

def nonspeech(wav, sr, vad_aggressiveness=3, webrtc_sr=16_000, webrtc_window_ms=30, dilations=3):
    """
    Args:
        wav: the waveform, a float numpy array with range [-1, 1].
        sr: sample rate of the input wav.
        vad_aggressiveness: must be 1, 2, or 3. Higher is stricter about filtering out non-speech.
        webrtc_sr: must be 8000, 16000, 32000, or 48000. Use the value closest to your input rate.
        webrtc_window_ms: must be 10, 20, or 30 ms.
        dilations: number of windows by which to dilate the VAD results. Increase if
            phonemes are leaking into the output.
    Returns:
        wave of all regions with no speech content, concatenated together (at webrtc_sr)
    """
    # resample to a rate that webrtcvad supports
    if sr != webrtc_sr:
        wav = librosa.resample(wav, orig_sr=sr, target_sr=webrtc_sr)
    # init VAD
    vad = webrtcvad.Vad(vad_aggressiveness)
    # trim wav to an integer number of windows
    W = webrtc_window_ms * webrtc_sr // 1000
    rem = len(wav) % W
    wav = wav if rem == 0 else wav[:-rem]
    # run VAD on each window
    windows = [wav[i * W:(i + 1) * W] for i in range(len(wav) // W)]
    va = [vad.is_speech(float_to_pcm16(win), webrtc_sr) for win in windows]
    # dilate voice activity into adjacent windows, so phoneme edges clipped by
    # window boundaries are not mistaken for background noise
    va = np.array(va, dtype=bool)
    for _ in range(dilations):
        va[:-1] |= va[1:]
        va[1:] |= va[:-1]
    # collect all windows without voice activity, concatenated into a background-audio wave
    bg = [win for win, is_speech in zip(windows, va) if not is_speech]
    return np.concatenate(bg) if bg else np.array([])

def mean_noise_power(wav_fpath):
    # read wav
    wav, sr = sf.read(wav_fpath)
    # normalize to unit peak (the clip avoids dividing by zero on silent files)
    wav /= abs(wav).max().clip(1e-6)
    # obtain nonspeech samples
    bg = nonspeech(wav, sr)
    # compute mean power of the background noise
    return (bg ** 2).mean()

if __name__ == '__main__':
    # Example: write the nonspeech audio to a file to verify that the VAD performed correctly
    wav, sr = sf.read('path/to/file.wav')
    wav /= abs(wav).max().clip(1e-6)
    bg = nonspeech(wav, sr, webrtc_sr=16_000)
    sf.write('path/to/file_nonspeech.wav', bg, 16_000)
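    # A minimal follow-on sketch: mean_noise_power() bundles the same steps, so the
    # file's estimated noise floor can be printed directly. The dB figure assumes the
    # peak-normalized scale used above; the path is the same placeholder file.
    noise_power = mean_noise_power('path/to/file.wav')
    print('mean noise power: %.2e (%.1f dB)' % (noise_power, 10 * np.log10(noise_power)))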