p-i-/macos_realtime_stt__whisper.py

## macos_realtime_stt__whisper.py
'''
Adapted by π 15 Apr 2024
    from https://github.com/davabase/whisper_real_time/blob/master/transcribe_demo.py
    ... which appears to have been itself pilfered from:
        https://github.com/JihyeokKim/whisper_ros/tree/master/whisper_ros/src

To run on macOS:
    - `brew install ffmpeg portaudio`
    - Create requirements.txt:
    ```
    pyaudio
    SpeechRecognition
    --extra-index-url https://download.pytorch.org/whl/cu116
    torch
    numpy
    git+https://github.com/openai/whisper.git
    ```
    ... and `pip install -r requirements.txt`
    - `python macos_realtime_stt__whisper.py`

Sample output (M2 MacBook Pro):
```
(t0+5.180s) 🟢 Model loaded.


[t0+11.385]🎙️ Received 120832 bytes of audio.
(recv+0.008s)🔹Processing 60416 samples (= 3.776s) of audio data.
(4.439s) ✅ Transcription: The rain in Spain stays mainly in the plain.

[t0+26.164]🎙️ Received 184320 bytes of audio.
(recv+0.004s)🔹Processing 92160 samples (= 5.760s) of audio data.
(4.417s) ✅ Transcription: In Horsford, Herringford and Hampshire, hurricanes hardly ever happen.
^C👋
```
fkn-A!

Only concern is that ~4s of audio takes ~4s to transcribe.
So that's something to keep an eye on.
There's other Whisper code available that promises to be several x faster:
- https://github.com/ggerganov/whisper.cpp
- https://github.com/sanchit-gandhi/whisper-jax
'''

from sys import platform
from time import time, sleep
from queue import Queue
import numpy as np

import torch
import speech_recognition as sr
import whisper

class Args:
    """Fixed settings for the demo; main() reads them from an instance."""

    # --- Whisper model selection ---
    # Base model name; main() may switch to its English-only ".en" variant.
    model = "medium"
    # When True, main() keeps the multilingual model instead of the ".en" one.
    non_english = False

    # --- Audio capture ---
    # Samples per second requested from the microphone; also used by main()
    # to convert a sample count into a duration.
    sample_rate = 16000
    # Assigned to the recognizer's `energy_threshold` in main().
    energy_threshold = 1000
    # Microphone name hint; not referenced by main() in this file.
    default_microphone = None if 'linux' not in platform else 'pulse'

    # --- Timing ---
    # Passed to listen_in_background() as `phrase_time_limit`.
    record_timeout = 30.0
    # Not referenced by main() in this file (presumably seconds, like above).
    phrase_timeout = 3.0

def main():
    """
    Transcribe microphone speech with Whisper, printing each phrase as it arrives.

    Loads the Whisper model chosen by `Args`, starts SpeechRecognition's
    background listener, then loops forever: each recorded phrase is taken
    from a queue, converted to float32 samples, transcribed and printed with
    timing information.

    Never returns normally. The loop ends only when an exception (typically
    KeyboardInterrupt from Ctrl-C) propagates out; the background listener is
    asked to stop on the way out.
    """
    # Local import: only the wait loop below needs it.
    from queue import Empty

    args = Args()
    t0 = time()

    # Thread safe Queue for passing data from the threaded recording callback.
    data_queue = Queue()
    # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = args.energy_threshold
    # Definitely use False, dynamic energy compensation lowers the energy threshold dramatically to a point where the SpeechRecognizer never stops recording.
    recorder.dynamic_energy_threshold = False

    source = sr.Microphone(sample_rate=args.sample_rate)

    print('⏱️ Loading model...')
    model = args.model
    if not args.non_english:
        # Only switch to the English-only variant when Whisper actually ships
        # one; "large" (and its versioned variants) have no ".en" model.
        english_only = model + ".en"
        if english_only in whisper.available_models():
            model = english_only
    audio_model = whisper.load_model(model)

    # Loop-invariant: decide once whether half precision can be used.
    use_fp16 = torch.cuda.is_available()

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push them into the thread safe queue, together
        # with the arrival time so the main loop can report queueing latency.
        timestamp = time()
        data: bytes = audio.get_raw_data()  # sint16 as bytes

        print(f'\n[t0+{time() - t0:.3f}]🎙️ Received {len(data)} bytes of audio.')
        data_queue.put((timestamp, data))

    # Create a background thread that will pass us raw audio bytes.
    # We could do this manually but SpeechRecognizer provides a nice helper.
    # Keep the returned stopper so the listener can be shut down on exit.
    stop_listening = recorder.listen_in_background(
        source, record_callback, phrase_time_limit=args.record_timeout
    )

    print(f'(t0+{time() - t0:.3f}s) 🟢 Model loaded.\n')

    try:
        while True:
            # Block on the queue instead of polling it. The timeout hands
            # control back to the interpreter regularly so Ctrl-C is handled
            # promptly.
            try:
                timestamp, audio_data = data_queue.get(timeout=0.5)
            except Empty:
                continue

            # Interpret the bytes as int16 samples and scale to float32 in [-1.0, 1.0).
            audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

            print(
                f'(recv+{time() - timestamp:.3f}s)🔹Processing {len(audio_np)} samples '
                f'(= {len(audio_np) / args.sample_rate:.3f}s) of audio data.'
            )

            t = time()
            result = audio_model.transcribe(audio_np, fp16=use_fp16)
            text = result['text'].strip()

            print(f'({time() - t:.3f}s) ✅ Transcription: {text}')
    finally:
        # Ask the listener thread to stop without waiting for it, so that
        # exiting (e.g. on Ctrl-C) is not delayed by a recording in progress.
        stop_listening(wait_for_stop=False)


# Script entry point: run the transcription loop until the user interrupts it.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # main() loops forever, so Ctrl-C is the expected way to quit;
        # print a farewell instead of a traceback.
        print('👋')
	'''
	Adapted by π 15 Apr 2024
	from https://github.com/davabase/whisper_real_time/blob/master/transcribe_demo.py
	... which appears to have been itself pilfered from:
	https://github.com/JihyeokKim/whisper_ros/tree/master/whisper_ros/src

	To run on macOS:
	- `brew install ffmpeg portaudio`
	- Create requirements.txt:
	```
	pyaudio
	SpeechRecognition
	--extra-index-url https://download.pytorch.org/whl/cu116
	torch
	numpy
	git+https://github.com/openai/whisper.git
	```
	... and `pip install -r requirements.txt`
	- `python macos_realtime_stt__whisper.py`

	Sample output (M2 MacBook Pro):
	```
	(t0+5.180s) 🟢 Model loaded.


	[t0+11.385]🎙️ Received 120832 bytes of audio.
	(recv+0.008s)🔹Processing 60416 samples (= 3.776s) of audio data.
	(4.439s) ✅ Transcription: The rain in Spain stays mainly in the plain.

	[t0+26.164]🎙️ Received 184320 bytes of audio.
	(recv+0.004s)🔹Processing 92160 samples (= 5.760s) of audio data.
	(4.417s) ✅ Transcription: In Horsford, Herringford and Hampshire, hurricanes hardly ever happen.
	^C👋
	```
	fkn-A!

	Only concern is that ~4s of audio takes ~4s to transcribe.
	So that's something to keep an eye on.
	There's other Whisper code available that promises to be several x faster:
	- https://github.com/ggerganov/whisper.cpp
	- https://github.com/sanchit-gandhi/whisper-jax
	'''

	from sys import platform
	from time import time, sleep
	from queue import Queue
	import numpy as np

	import torch
	import speech_recognition as sr
	import whisper

	class Args:
	    """Fixed settings for the demo; main() reads them from an instance."""

	    # Base model name; main() may switch to its English-only ".en" variant.
	    model = "medium"
	    # When True, main() keeps the multilingual model instead of the ".en" one.
	    non_english = False
	    # Assigned to the recognizer's `energy_threshold` in main().
	    energy_threshold = 1000
	    # Passed to listen_in_background() as `phrase_time_limit`.
	    record_timeout = 30.0
	    # Not referenced by main() in this file (presumably seconds, like above).
	    phrase_timeout = 3.0
	    # Microphone name hint; not referenced by main() in this file.
	    default_microphone = 'pulse' if 'linux' in platform else None
	    # Samples per second requested from the microphone; also used by main()
	    # to convert a sample count into a duration.
	    sample_rate = 16000

	def main():
	    """
	    Transcribe microphone speech with Whisper, printing each phrase as it arrives.

	    Loads the Whisper model chosen by `Args`, starts SpeechRecognition's
	    background listener, then loops forever: each recorded phrase is taken
	    from a queue, converted to float32 samples, transcribed and printed with
	    timing information.

	    Never returns normally. The loop ends only when an exception (typically
	    KeyboardInterrupt from Ctrl-C) propagates out; the background listener is
	    asked to stop on the way out.
	    """
	    # Local import: only the wait loop below needs it.
	    from queue import Empty

	    args = Args()
	    t0 = time()

	    # Thread safe Queue for passing data from the threaded recording callback.
	    data_queue = Queue()
	    # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
	    recorder = sr.Recognizer()
	    recorder.energy_threshold = args.energy_threshold
	    # Definitely use False, dynamic energy compensation lowers the energy threshold dramatically to a point where the SpeechRecognizer never stops recording.
	    recorder.dynamic_energy_threshold = False

	    source = sr.Microphone(sample_rate=args.sample_rate)

	    print('⏱️ Loading model...')
	    model = args.model
	    if not args.non_english:
	        # Only switch to the English-only variant when Whisper actually ships
	        # one; "large" (and its versioned variants) have no ".en" model.
	        english_only = model + ".en"
	        if english_only in whisper.available_models():
	            model = english_only
	    audio_model = whisper.load_model(model)

	    # Loop-invariant: decide once whether half precision can be used.
	    use_fp16 = torch.cuda.is_available()

	    with source:
	        recorder.adjust_for_ambient_noise(source)

	    def record_callback(_, audio: sr.AudioData) -> None:
	        """
	        Threaded callback function to receive audio data when recordings finish.
	        audio: An AudioData containing the recorded bytes.
	        """
	        # Grab the raw bytes and push them into the thread safe queue, together
	        # with the arrival time so the main loop can report queueing latency.
	        timestamp = time()
	        data: bytes = audio.get_raw_data()  # sint16 as bytes

	        print(f'\n[t0+{time() - t0:.3f}]🎙️ Received {len(data)} bytes of audio.')
	        data_queue.put((timestamp, data))

	    # Create a background thread that will pass us raw audio bytes.
	    # We could do this manually but SpeechRecognizer provides a nice helper.
	    # Keep the returned stopper so the listener can be shut down on exit.
	    stop_listening = recorder.listen_in_background(
	        source, record_callback, phrase_time_limit=args.record_timeout
	    )

	    print(f'(t0+{time() - t0:.3f}s) 🟢 Model loaded.\n')

	    try:
	        while True:
	            # Block on the queue instead of polling it. The timeout hands
	            # control back to the interpreter regularly so Ctrl-C is handled
	            # promptly.
	            try:
	                timestamp, audio_data = data_queue.get(timeout=0.5)
	            except Empty:
	                continue

	            # Interpret the bytes as int16 samples and scale to float32 in [-1.0, 1.0).
	            audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

	            print(
	                f'(recv+{time() - timestamp:.3f}s)🔹Processing {len(audio_np)} samples '
	                f'(= {len(audio_np) / args.sample_rate:.3f}s) of audio data.'
	            )

	            t = time()
	            result = audio_model.transcribe(audio_np, fp16=use_fp16)
	            text = result['text'].strip()

	            print(f'({time() - t:.3f}s) ✅ Transcription: {text}')
	    finally:
	        # Ask the listener thread to stop without waiting for it, so that
	        # exiting (e.g. on Ctrl-C) is not delayed by a recording in progress.
	        stop_listening(wait_for_stop=False)


	# Script entry point: run the transcription loop until the user interrupts it.
	if __name__ == "__main__":
	    try:
	        main()
	    except KeyboardInterrupt:
	        # main() loops forever, so Ctrl-C is the expected way to quit;
	        # print a farewell instead of a traceback.
	        print('👋')