Skip to content

Instantly share code, notes, and snippets.

@p-i-
Last active April 17, 2024 15:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save p-i-/417dab2ddccac7aaef8c798af665093a to your computer and use it in GitHub Desktop.
Save p-i-/417dab2ddccac7aaef8c798af665093a to your computer and use it in GitHub Desktop.
Realtime (Ultralow-latency) Speech Chunker using Voice Activity Detection (VAD) using WebRTC VAD
'''
# Realtime (Ultralow-latency) Voice Activity Detection (VAD) using WebRTC VAD
v3.2
π 17 Apr 2024
## Usage
```python
def on_segment(segment):  # np.array of np.int16
    print(f'☏ CALLBACK: Got segment: {len(segment)} frames')
ch = MicChunker(verbose=True, dump_wav=True, on_segment=on_segment)
```
## Notes
- pyaudio triggers a callback we supply (mic-data @ {16kHz, 16-bit PCM, 1 channel})
- each frame is 320 samples, as WebRTC's VAD requires 320 samples @ 16kHz
- We use WebRTC's Voice Activity Detection (VAD) to detect the presence of speech (bool) in each frame.
- We avoid allocations in audio thread (it's a high-priority system thread)
- P consec good frames initiate recording, Q consec bad frames conclude recording
- Upon conclude, the audio segment is written to a queue for processing
- A worker thread monitors the queue, triggering our on_segment callback for each segment
## Terminology
I'm redefining some buzzwords here:
- In common lingo, the mic-callback is returning a BUFFER of FRAMES. Each FRAME has nChannels (1=MONO).
- In this file, the mic-callback is returning a FRAME of SAMPLES. And I'm using a RING-BUFFER of FRAMES.
## Sample output
```
Recording... press Ctrl+C to stop
STARTED RECORDING: start_frame=27
STOPPED RECORDING: end_frame=68, length=41 frames
☏ CALLBACK: Got segment: 41984 frames
STARTED RECORDING: start_frame=113
STOPPED RECORDING: end_frame=149, length=36 frames
☏ CALLBACK: Got segment: 36864 frames
^C👋
```
## TODOs
There is currently no provision for the user speaking for too long (> buf_s seconds).
'''
import time
import queue as Queue
from threading import Thread
import wave
import numpy as np
import pyaudio
import webrtcvad
class DictToObject:
    """Minimal attribute-access wrapper around a dict.

    Every key of *dictionary* becomes an attribute on the instance, so
    ``DictToObject({'rate': 16000}).rate == 16000``.
    """

    def __init__(self, dictionary):
        # vars(self) is the instance __dict__; a single bulk update is
        # equivalent to calling setattr() once per key.
        vars(self).update(dictionary)
class MicChunker:
    """Realtime (low-latency) speech chunker driven by WebRTC VAD.

    pyaudio delivers 320-sample, 16 kHz, 16-bit mono PCM frames to a callback
    on its (high-priority) audio thread.  Each frame is stored into a ring
    buffer and classified as speech/non-speech by WebRTC's VAD.  A run of
    consecutive speech frames starts a recording; a run of consecutive
    non-speech frames ends it, at which point the captured segment (with a
    little pre-padding) is pushed onto a queue.  A daemon worker thread
    drains the queue, optionally dumps each segment to a WAV file, and
    invokes the user's ``on_segment`` callback.

    NOTE(review): there is no provision yet for a speaker talking longer
    than ``buf_s`` seconds — the ring buffer would wrap over the segment.
    """

    # WebRTC VAD requires 16 kHz, 16-bit mono PCM, in 320-sample frames.
    audioFormat = {
        'format': pyaudio.paInt16,
        'channels': 1,
        'rate': 16000,           # vad wants 16kHz
        'frames_per_buffer': 320 # vad wants 320 samples
    }

    def __init__(self, **kwargs):
        """Open the mic stream and start chunking immediately.

        Keyword args (all optional, see ``defaults`` below):
            verbose: print start/stop diagnostics.
            dump_wav: write each segment to ``segment-NNN.wav``.
            on_segment: callable taking one np.int16 array argument.
            buf_s: ring-buffer capacity in seconds.
            nframes_to_prepad: frames of context kept before speech onset.
            n_good_frames_to_start_recording / n_bad_frames_to_end_recording:
                consecutive-frame thresholds for the start/stop hysteresis.
        """
        defaults = {
            'verbose': False,
            'dump_wav': False,
            # BUGFIX: the no-op default must accept one argument, because the
            # worker calls on_segment(segment); the previous `lambda: None`
            # raised TypeError on the first segment when no callback was given.
            'on_segment': lambda segment: None,
            'buf_s': 300,
            'nframes_to_prepad': 10,
            'n_good_frames_to_start_recording': 10,
            'n_bad_frames_to_end_recording': 16,
        }
        self.settings = DictToObject(defaults | kwargs)
        S, A = self.settings, DictToObject(MicChunker.audioFormat)

        # We use a 2D ringbuffer (nFrames x samples-per-frame) to keep the
        # pointer-math simple: one row per mic callback.
        samps_per_frame = A.frames_per_buffer
        nFrames = S.buf_s * A.rate // samps_per_frame
        self.circular_buf = np.zeros((nFrames, samps_per_frame), dtype=np.int16)

        self.vad = webrtcvad.Vad(mode=3)  # 0=off, 1=minimal, 2=low, 3=aggressive

        # A generator encapsulates the start/stop state machine; prime it so
        # it is parked at its first `yield`, ready to receive frames.
        self.process_frame = self.frame_processor()
        next(self.process_frame)

        self.pending_segments = Queue.Queue()
        if S.dump_wav or S.on_segment:
            Thread(target=self.worker, daemon=True).start()  # consumes queue

        def mic_callback(in_data, frame_count, time_info, status):
            # Runs on pyaudio's high-priority audio thread: no allocations
            # here beyond what the state machine itself does.
            self.process_frame.send(in_data)
            return (in_data, pyaudio.paContinue)

        self.p = pyaudio.PyAudio()
        self.stream = self.p.open(
            **MicChunker.audioFormat,
            input=True,
            stream_callback=mic_callback
        )
        self.stream.start_stream()

    def frame_processor(self):
        """Generator/state machine: ``.send()`` each raw PCM frame into it.

        Hysteresis: n_good_frames_to_start_recording consecutive speech
        frames flip us into `recording`; n_bad_frames_to_end_recording
        consecutive silent frames flip us back out, at which point the
        segment (plus nframes_to_prepad of leading context) is queued.
        """
        S, frame_index, nconsec, recording = self.settings, 0, 0, False
        dprint = print if S.verbose else lambda s: None
        while True:
            in_data = yield
            self.circular_buf[frame_index] = np.frombuffer(in_data, dtype=np.int16)
            # Needs to be 320 samples @ 16kHz for webrtcvad.
            is_speech = self.vad.is_speech(in_data, MicChunker.audioFormat['rate'])
            # Count frames that disagree with the current state; any frame
            # that agrees resets the run.
            nconsec = 0 if recording == is_speech else nconsec + 1
            if nconsec == (S.n_bad_frames_to_end_recording if recording else S.n_good_frames_to_start_recording):
                nconsec = 0
                recording = not recording
                if recording:
                    # Back up past the trigger frames plus some pre-padding
                    # so the segment keeps a little leading context.
                    start_frame = (frame_index - S.n_good_frames_to_start_recording - S.nframes_to_prepad) % len(self.circular_buf)
                    dprint(f'STARTED RECORDING: start_frame={start_frame}')
                else:
                    # Trim the trailing silence that triggered the stop.
                    end_frame = (frame_index - S.n_bad_frames_to_end_recording) % len(self.circular_buf)
                    # Handle ring-buffer wrap-around when extracting rows.
                    get_frames = lambda B, s, e: B[s:e] if s < e else np.concatenate([B[s:], B[:e]])
                    segment = get_frames(self.circular_buf, start_frame, end_frame).flatten()
                    self.pending_segments.put(segment)
                    dprint(f'STOPPED RECORDING: end_frame={end_frame}')
            frame_index = (frame_index + 1) % len(self.circular_buf)

    def worker(self):
        """Daemon thread: drain the segment queue, dump WAVs, fire callback."""
        S = self.settings
        file_index = 0
        while True:
            # Queue.get() blocks; the inner loop just skips any None
            # sentinels that might be pushed.
            while (segment := self.pending_segments.get()) is None:
                time.sleep(.01)
            file_index += 1
            if S.dump_wav:
                # 1 channel, 2 bytes/sample, 16 kHz — matches audioFormat.
                with wave.open(f'segment-{file_index:03}.wav', 'wb') as f:
                    f.setnchannels(1)
                    f.setsampwidth(2)
                    f.setframerate(16000)
                    f.writeframes(segment.tobytes())
            S.on_segment(segment)

    def terminate(self):
        """Stop and close the mic stream and release PyAudio."""
        self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()
if __name__ == "__main__":
    # Demo: echo segment lengths to stdout and dump each segment as a WAV.
    def on_segment(segment):
        print(f'☏ CALLBACK: Got segment: {len(segment)} frames')

    chunker = MicChunker(verbose=True, dump_wav=True, on_segment=on_segment)
    print('Recording... press Ctrl+C to stop')
    try:
        # All work happens on the audio/worker threads; just idle here.
        while True:
            time.sleep(0.1)
    except KeyboardInterrupt:
        print('👋')
        chunker.terminate()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment