Skip to content

Instantly share code, notes, and snippets.

@p-i-
Last active April 17, 2024 15:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save p-i-/417dab2ddccac7aaef8c798af665093a to your computer and use it in GitHub Desktop.
Save p-i-/417dab2ddccac7aaef8c798af665093a to your computer and use it in GitHub Desktop.
Realtime (Ultralow-latency) Speech Chunker using Voice Activity Detection (VAD) using WebRTC VAD
'''
# Realtime (Ultralow-latency) Voice Activity Detection (VAD) using WebRTC VAD
v3.2
π 17 Apr 2024
## Usage
```python
def on_segment(segment):  # np.array of np.int16
    print(f'☏ CALLBACK: Got segment: {len(segment)} frames')
ch = MicChunker(verbose=True, dump_wav=True, on_segment=on_segment)
```
## Notes
- pyaudio triggers a callback we supply (mic-data @ {16kHz, 16-bit PCM, 1 channel})
- each frame is 320 samples, as WebRTC's VAD requires 320 samples @ 16kHz
- We use WebRTC's Voice Activity Detection (VAD) to detect the presence of speech (bool) in each frame.
- We avoid allocations in audio thread (it's a high-priority system thread)
- P consec good frames initiate recording, Q consec bad frames conclude recording
- Upon conclude, the audio segment is written to a queue for processing
- A worker thread monitors the queue, triggering our on_segment callback for each segment
## Terminology
I'm redefining some buzzwords here:
- In common lingo, the mic-callback is returning a BUFFER of FRAMES. Each FRAME has nChannels (1=MONO).
- In this file, the mic-callback is returning a FRAME of SAMPLES. And I'm using a RING-BUFFER of FRAMES.
## Sample output
```
Recording... press Ctrl+C to stop
STARTED RECORDING: start_frame=27
STOPPED RECORDING: end_frame=68, length=41 frames
☏ CALLBACK: Got segment: 41984 frames
STARTED RECORDING: start_frame=113
STOPPED RECORDING: end_frame=149, length=36 frames
☏ CALLBACK: Got segment: 36864 frames
^C👋
```
## TODOs
There is currently no provision for the user speaking for too long (> buf_s seconds).
'''
import time
import queue as Queue
from threading import Thread
import wave
import numpy as np
import pyaudio
import webrtcvad
class DictToObject:
    """Minimal attribute-access wrapper around a dict.

    Every key of *dictionary* becomes an attribute on the instance, so
    ``DictToObject({'rate': 16000}).rate == 16000``.
    """

    def __init__(self, dictionary):
        # vars(self) is the instance __dict__; a single bulk update is
        # equivalent to calling setattr() once per key.
        vars(self).update(dictionary)
class MicChunker:
    """Realtime (low-latency) speech chunker driven by WebRTC VAD.

    pyaudio delivers 320-sample, 16 kHz, 16-bit mono PCM frames to a callback
    on its (high-priority) audio thread.  Each frame is stored into a ring
    buffer and classified as speech/non-speech by WebRTC's VAD.  A run of
    consecutive speech frames starts a recording; a run of consecutive
    non-speech frames ends it, at which point the captured segment (with a
    little pre-padding) is pushed onto a queue.  A daemon worker thread
    drains the queue, optionally dumps each segment to a WAV file, and
    invokes the user's ``on_segment`` callback.

    NOTE(review): there is no provision yet for a speaker talking longer
    than ``buf_s`` seconds — the ring buffer would wrap over the segment.
    """

    # WebRTC VAD requires 16 kHz, 16-bit mono PCM, in 320-sample frames.
    audioFormat = {
        'format': pyaudio.paInt16,
        'channels': 1,
        'rate': 16000,           # vad wants 16kHz
        'frames_per_buffer': 320 # vad wants 320 samples
    }

    def __init__(self, **kwargs):
        """Open the mic stream and start chunking immediately.

        Keyword args (all optional, see ``defaults`` below):
            verbose: print start/stop diagnostics.
            dump_wav: write each segment to ``segment-NNN.wav``.
            on_segment: callable taking one np.int16 array argument.
            buf_s: ring-buffer capacity in seconds.
            nframes_to_prepad: frames of context kept before speech onset.
            n_good_frames_to_start_recording / n_bad_frames_to_end_recording:
                consecutive-frame thresholds for the start/stop hysteresis.
        """
        defaults = {
            'verbose': False,
            'dump_wav': False,
            # BUGFIX: the no-op default must accept one argument, because the
            # worker calls on_segment(segment); the previous `lambda: None`
            # raised TypeError on the first segment when no callback was given.
            'on_segment': lambda segment: None,
            'buf_s': 300,
            'nframes_to_prepad': 10,
            'n_good_frames_to_start_recording': 10,
            'n_bad_frames_to_end_recording': 16,
        }
        self.settings = DictToObject(defaults | kwargs)
        S, A = self.settings, DictToObject(MicChunker.audioFormat)

        # We use a 2D ringbuffer (nFrames x samples-per-frame) to keep the
        # pointer-math simple: one row per mic callback.
        samps_per_frame = A.frames_per_buffer
        nFrames = S.buf_s * A.rate // samps_per_frame
        self.circular_buf = np.zeros((nFrames, samps_per_frame), dtype=np.int16)

        self.vad = webrtcvad.Vad(mode=3)  # 0=off, 1=minimal, 2=low, 3=aggressive

        # A generator encapsulates the start/stop state machine; prime it so
        # it is parked at its first `yield`, ready to receive frames.
        self.process_frame = self.frame_processor()
        next(self.process_frame)

        self.pending_segments = Queue.Queue()
        if S.dump_wav or S.on_segment:
            Thread(target=self.worker, daemon=True).start()  # consumes queue

        def mic_callback(in_data, frame_count, time_info, status):
            # Runs on pyaudio's high-priority audio thread: no allocations
            # here beyond what the state machine itself does.
            self.process_frame.send(in_data)
            return (in_data, pyaudio.paContinue)

        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            **MicChunker.audioFormat,
            input=True,
            stream_callback=mic_callback
        )
        self.stream.start_stream()

    def frame_processor(self):
        """Generator/state machine: ``.send()`` each raw PCM frame into it.

        Hysteresis: n_good_frames_to_start_recording consecutive speech
        frames flip us into `recording`; n_bad_frames_to_end_recording
        consecutive silent frames flip us back out, at which point the
        segment (plus nframes_to_prepad of leading context) is queued.
        """
        S, frame_index, nconsec, recording = self.settings, 0, 0, False
        dprint = print if S.verbose else lambda s: None
        while True:
            in_data = yield
            self.circular_buf[frame_index] = np.frombuffer(in_data, dtype=np.int16)
            # Needs to be 320 samples @ 16kHz for webrtcvad.
            is_speech = self.vad.is_speech(in_data, MicChunker.audioFormat['rate'])
            # Count frames that disagree with the current state; any frame
            # that agrees resets the run.
            nconsec = 0 if recording == is_speech else nconsec + 1
            if nconsec == (S.n_bad_frames_to_end_recording if recording else S.n_good_frames_to_start_recording):
                nconsec = 0
                recording = not recording
                if recording:
                    # Back up past the trigger frames plus some pre-padding
                    # so the segment keeps a little leading context.
                    start_frame = (frame_index - S.n_good_frames_to_start_recording - S.nframes_to_prepad) % len(self.circular_buf)
                    dprint(f'STARTED RECORDING: start_frame={start_frame}')
                else:
                    # Trim the trailing silence that triggered the stop.
                    end_frame = (frame_index - S.n_bad_frames_to_end_recording) % len(self.circular_buf)
                    # Handle ring-buffer wrap-around when extracting rows.
                    get_frames = lambda B, s, e: B[s:e] if s < e else np.concatenate([B[s:], B[:e]])
                    segment = get_frames(self.circular_buf, start_frame, end_frame).flatten()
                    self.pending_segments.put(segment)
                    dprint(f'STOPPED RECORDING: end_frame={end_frame}')
            frame_index = (frame_index + 1) % len(self.circular_buf)

    def worker(self):
        """Daemon thread: drain the segment queue, dump WAVs, fire callback."""
        S = self.settings
        file_index = 0
        while True:
            # Queue.get() blocks; the inner loop just skips any None
            # sentinels that might be pushed.
            while (segment := self.pending_segments.get()) is None:
                time.sleep(.01)
            file_index += 1
            if S.dump_wav:
                # 1 channel, 2 bytes/sample, 16 kHz — matches audioFormat.
                with wave.open(f'segment-{file_index:03}.wav', 'wb') as f:
                    f.setnchannels(1)
                    f.setsampwidth(2)
                    f.setframerate(16000)
                    f.writeframes(segment.tobytes())
            S.on_segment(segment)

    def terminate(self):
        """Stop and close the mic stream and release PyAudio."""
        self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()
if __name__ == "__main__":
    # Demo: echo segment lengths to stdout and dump each segment as a WAV.
    def on_segment(segment):
        print(f'☏ CALLBACK: Got segment: {len(segment)} frames')

    chunker = MicChunker(verbose=True, dump_wav=True, on_segment=on_segment)
    print('Recording... press Ctrl+C to stop')
    try:
        # All work happens on the audio/worker threads; just idle here.
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        print('👋')
        chunker.terminate()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment