max-arnold/listen.py

## listen.py
# Based on https://cloud.yandex.ru/ru/docs/speechkit/stt/api/microphone-streaming
import pyaudio
import wave
import argparse
import grpc
from datetime import datetime
import time
import queue
import sys

import yandex.cloud.ai.stt.v3.stt_pb2 as stt_pb2
import yandex.cloud.ai.stt.v3.stt_service_pb2_grpc as stt_service_pb2_grpc

# Audio capture settings shared by the microphone stream, the WAV writer and
# the recognition request.
FORMAT = pyaudio.paInt16  # sample format passed to audio.open / get_sample_size
CHANNELS = 1  # channel count used for the input stream and the WAV header
RATE = 48000  # sample rate in Hz (also sent as sample_rate_hertz)
CHUNK = 4096  # passed as frames_per_buffer when the stream is opened
RECORD_SECONDS = 60  # how long reqiter keeps feeding one streaming session


def save_frames(audio, frames, *, enabled=False):
    """Optionally dump the captured audio chunks to a timestamped WAV file.

    Saving is a debugging aid and is off by default, so a plain
    ``save_frames(audio, frames)`` call does nothing.

    Args:
        audio: Object providing ``get_sample_size(FORMAT)``; ``run`` passes
            its ``pyaudio.PyAudio`` instance.
        frames: List of raw audio byte strings, in capture order.
        enabled: Keyword-only switch; pass ``True`` to actually write
            ``<YYYYmmddHHMMSS>.wav`` into the current directory.
    """
    if not (enabled and frames):
        return
    now = datetime.now().strftime('%Y%m%d%H%M%S')
    # The context manager closes the file even when a write fails part-way.
    with wave.open(f"{now}.wav", 'wb') as wave_file:
        wave_file.setnchannels(CHANNELS)
        wave_file.setsampwidth(audio.get_sample_size(FORMAT))
        wave_file.setframerate(RATE)
        wave_file.writeframes(b''.join(frames))


def reqiter(audio_queue, frames, lang):
    """Generate the request stream for one SpeechKit streaming session.

    The first request carries the session options; every following request
    carries one chunk of audio taken from ``audio_queue``.  The generator
    ends once ``RECORD_SECONDS`` have elapsed, including when no audio
    arrives before that deadline.

    Args:
        audio_queue: ``queue.Queue`` holding raw audio byte strings.
        frames: List that every chunk sent is appended to.
        lang: ``'ru'`` or ``'en'``; any other value (including ``None``)
            selects ``'en-US'``.

    Yields:
        ``stt_pb2.StreamingRequest`` messages.
    """
    language_code = {'ru': 'ru-RU', 'en': 'en-US'}.get(lang, 'en-US')
    recognize_options = stt_pb2.StreamingOptions(
        recognition_model=stt_pb2.RecognitionModelOptions(
            audio_format=stt_pb2.AudioFormatOptions(
                raw_audio=stt_pb2.RawAudio(
                    audio_encoding=stt_pb2.RawAudio.LINEAR16_PCM,
                    sample_rate_hertz=RATE,
                    # Same constant the input stream is opened with.
                    audio_channel_count=CHANNELS,
                )
            ),
            text_normalization=stt_pb2.TextNormalizationOptions(
                text_normalization=stt_pb2.TextNormalizationOptions.TEXT_NORMALIZATION_ENABLED,
                profanity_filter=True,
                literature_text=False,
            ),
            language_restriction=stt_pb2.LanguageRestrictionOptions(
                restriction_type=stt_pb2.LanguageRestrictionOptions.WHITELIST,
                language_code=[language_code],
            ),
            audio_processing_type=stt_pb2.RecognitionModelOptions.REAL_TIME,
        )
    )

    yield stt_pb2.StreamingRequest(session_options=recognize_options)

    sys.stderr.write("Recording...\n")
    end_time = time.monotonic() + RECORD_SECONDS
    while True:
        remaining = end_time - time.monotonic()
        if remaining <= 0:
            break
        try:
            # Bounded wait: a stalled microphone must not keep the session
            # open past the deadline.
            mic_data = audio_queue.get(timeout=remaining)
        except queue.Empty:
            break
        frames.append(mic_data)
        yield stt_pb2.StreamingRequest(chunk=stt_pb2.AudioChunk(data=mic_data))


def run(token, lang):
    """Capture microphone audio and stream it to SpeechKit until interrupted.

    Recognition results are printed to stdout as one JSON object per line:
    ``{"event": <type>, "text": <first alternative>}`` for ``partial``,
    ``final`` and ``final_refinement`` events, and ``{"event": "eou"}`` for
    ``eou_update`` events.  Diagnostics go to stderr.  A new streaming
    session is opened each time the previous one ends; ``KeyboardInterrupt``
    stops the loop quietly.

    Args:
        token: Credential sent as ``Api-Key <token>`` in the
            ``authorization`` metadata of every streaming call.
        lang: Language selector forwarded to ``reqiter``.

    Raises:
        grpc.RpcError: Re-raised after the status code and details have been
            written to stderr.
    """
    import json  # function-scope import: only this function emits JSON

    audio = pyaudio.PyAudio()
    sys.stderr.write(str(audio.get_default_input_device_info()) + "\n")
    audio_queue = queue.Queue()

    def callback(input_data, frame_count, time_info, status_flag):
        # Hand each captured buffer to the request generator via the queue.
        audio_queue.put_nowait(input_data)
        return (input_data, pyaudio.paContinue)

    stream = audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
        stream_callback=callback,
    )

    # Bound before the try block so the finally clause can always use it.
    frames = []
    try:
        while True:
            frames = []
            cred = grpc.ssl_channel_credentials()
            # One channel per session; the with-block closes it again so
            # channels do not pile up across sessions.
            with grpc.secure_channel('stt.api.cloud.yandex.net:443', cred) as channel:
                stub = stt_service_pb2_grpc.RecognizerStub(channel)

                it = stub.RecognizeStreaming(reqiter(audio_queue, frames, lang), metadata=(
                    ('authorization', f'Api-Key {token}'),
                ))

                try:
                    for r in it:
                        event_type = r.WhichOneof('Event')
                        alternatives = None
                        if event_type == 'partial' and len(r.partial.alternatives) > 0:
                            alternatives = [a.text for a in r.partial.alternatives]
                        elif event_type == 'final':
                            alternatives = [a.text for a in r.final.alternatives]
                        elif event_type == 'final_refinement':
                            alternatives = [a.text for a in r.final_refinement.normalized_text.alternatives]
                        if alternatives and alternatives[0]:
                            # json.dumps escapes quotes, backslashes and
                            # newlines, keeping every record on one valid
                            # line; flush so a piped reader sees it at once.
                            print(
                                json.dumps(
                                    {"event": event_type, "text": alternatives[0]},
                                    ensure_ascii=False,
                                ),
                                flush=True,
                            )
                        if event_type == "eou_update":
                            print('{"event": "eou"}', flush=True)
                except grpc.RpcError as err:
                    # Public API instead of the private _Rendezvous/_state.
                    sys.stderr.write(f'Error code {err.code()}, message: {err.details()}' + "\n")
                    raise
            save_frames(audio, frames)
    except KeyboardInterrupt:
        pass
    finally:
        save_frames(audio, frames)
        stream.stop_stream()
        stream.close()
        audio.terminate()


if __name__ == '__main__':
    # Command-line entry point: read the credential and the optional
    # language switch, then start the capture/recognition loop.
    cli = argparse.ArgumentParser()
    cli.add_argument('--token', required=True, help='API key or IAM token')
    cli.add_argument('-l', dest="lang", help='Language')
    options = cli.parse_args()
    run(options.token, options.lang)

## run-listen.sh
#!/bin/bash
# Start listen.py with the interpreter of the speechkit-listener virtualenv.
# listen.py is resolved relative to the current working directory.
# NOTE(review): XXX looks like a placeholder for the real API key - confirm.
#
# $@ is intentionally left unquoted: speechkit.el passes the language switch
# as the single word "-l ru", and word splitting turns it into two arguments.
# exec replaces this shell with Python, so signals sent to the process reach
# the interpreter directly and its exit status is reported unchanged.
exec ~/.virtualenvs/speechkit-listener/bin/python listen.py --token XXX $@

## speechkit.el
;; Based on https://sachachua.com/blog/2023/12/live-speech-with-deepgram/

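;; macOS note: the statement below adds a microphone-access row for Emacs
;; (org.gnu.Emacs) to the per-user TCC database.  Open the database with:
;; sqlite3 ~/Library/Application\ Support/com.apple.TCC/TCC.db
;; and then run:
;; insert into access values ('kTCCServiceMicrophone', 'org.gnu.Emacs', 0, 2, 4, 1, null, null, null, 'UNUSED', null, null, 1704007344);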

;; Name of the buffer that receives the transcript by default.
(defvar my/speechkit-buffer "*Speech*")
;; Buffer the transcript is currently written to; set by `my/speechkit-start'.
(defvar my/speechkit-current-buffer nil)
;; Process object of the listener, or nil before the first start.
(defvar my/speechkit-process nil)
;; Name of the buffer the listener's stdout is collected in.
(defvar my/speechkit-stdout-buffer "*Speech JSON*")
;; Name of the buffer the listener's stderr is collected in.
(defvar my/speechkit-stderr-buffer "*Speech stderr*")
;; NOTE(review): not referenced by any function visible here - confirm use.
(defvar my/speechkit-auto-scroll t)
;; NOTE(review): not referenced by any function visible here - confirm use.
(defvar my/speechkit--change-group nil)

(defun my/speechkit-start (&optional arg)
  "Turn on live captions.
With prefix ARG, write the transcript into the current buffer;
otherwise use the buffer named by `my/speechkit-buffer'.  The
language is \"ru\" when the name of the current input method
contains \"cyrillic\" and \"en\" otherwise.  If the listener is
already running, only the target buffer is switched."
  (interactive "P")
  (let ((lang (if (and current-input-method
                       ;; `string-match-p' leaves the match data alone.
                       (string-match-p "cyrillic" current-input-method))
                  "ru"
                "en")))
    (with-current-buffer (if arg (current-buffer) (get-buffer-create my/speechkit-buffer))
      (setq my/speechkit-current-buffer (current-buffer))
      (unless (process-live-p my/speechkit-process)
        ;; `default-directory' is expected to end in a slash.
        (let ((default-directory (file-name-as-directory "~/play/speechkit")))
          (with-current-buffer (get-buffer-create my/speechkit-stdout-buffer)
            (erase-buffer))
          (setq my/speechkit-process
                (make-process
                 ;; Switch and value are separate arguments, so the command
                 ;; does not depend on the wrapper script re-splitting them.
                 :command (list "bash" "run-listen.sh" "-l" lang)
                 :name "speech"
                 :filter #'my/speechkit-json-filter
                 :sentinel #'my/speechkit-process-sentinel
                 :buffer my/speechkit-stdout-buffer
                 :stderr my/speechkit-stderr-buffer))))
      (display-buffer (current-buffer)))))

(defun my/speechkit-stop ()
  "Interrupt the listener process if it is still running."
  (interactive)
  (when (process-live-p my/speechkit-process)
    (interrupt-process my/speechkit-process)))

(defun my/speechkit-process-sentinel (proc event)
  "Report when listener PROC ends for a reason other than a clean exit.
EVENT is the status string Emacs hands to sentinels.  A
\"finished\" event needs no action because the process has
already exited by then; any other terminal event is shown in the
echo area together with the name of the stderr buffer."
  (unless (process-live-p proc)
    (unless (string-match-p "finished" event)
      (message "SpeechKit listener %s; see buffer %s"
               ;; EVENT normally ends with a newline; drop it for display.
               (replace-regexp-in-string "[\r\n]+\\'" "" event)
               my/speechkit-stderr-buffer))))

(defun my/speechkit-json-filter (proc string)
  "Append output STRING of PROC and handle each complete JSON line.
STRING is inserted at the process mark of PROC's buffer.  Every
newline-terminated line is then removed from the start of the
buffer, parsed as a JSON object and passed to
`my/speechkit-display-in-speech-buffer'; an unfinished last line
stays in the buffer until the rest of it arrives.  A line that
cannot be parsed is reported with `message' and skipped."
  (when (buffer-live-p (process-buffer proc))
    (with-current-buffer (process-buffer proc)
      (let* ((proc-mark (process-mark proc))
             (moving (= (point) proc-mark)))
        ;; Insert the output at the process mark.
        (save-excursion
          (goto-char proc-mark)
          (insert string)
          (set-marker proc-mark (point)))
        (if moving (goto-char proc-mark))
        ;; Consume all complete lines (a line is complete if it ends with \n).
        (let ((pos (point-min)))
          (while (progn (goto-char pos)
                        (end-of-line)
                        (equal (following-char) ?\n))
            (let* ((end (point))
                   (line (buffer-substring pos end))
                   ;; A bad line must not abort the filter: the remaining
                   ;; lines of this chunk still have to be processed.
                   (event (condition-case err
                              (json-parse-string line :object-type 'alist)
                            (error
                             (message "SpeechKit: ignoring unparsable line %S (%s)"
                                      line (error-message-string err))
                             nil))))
              (delete-region pos (+ end 1))
              (when event
                (my/speechkit-display-in-speech-buffer event)))))))))

(defun my/speechkit-display-in-speech-buffer (json-object)
  "Apply one recognition event JSON-OBJECT to the transcript buffer.
JSON-OBJECT is an alist with an `event' key and, for text events,
a `text' key.  \"partial\" and \"final\" replace the current line
with the text, \"final_refinement\" replaces it and starts a new
line, and \"eou\" ends the current line if it is not empty.
Nothing happens when `my/speechkit-current-buffer' is not a live
buffer."
  (when (buffer-live-p my/speechkit-current-buffer)
    (with-current-buffer my/speechkit-current-buffer
      (let-alist json-object
        (cond
         ((equal .event "eou")
          (end-of-line)
          (unless (bolp)
            (insert "\n")))
         ((member .event '("partial" "final"))
          (beginning-of-line)
          ;; `delete-region' instead of `kill-line': interim results must
          ;; not end up on the kill ring.
          (delete-region (point) (line-end-position))
          (insert .text))
         ((equal .event "final_refinement")
          (beginning-of-line)
          (delete-region (point) (line-end-position))
          (insert .text "\n")
          (end-of-line)))
        ;; Only follow point in a window that actually shows this buffer;
        ;; a nil window would make `set-window-point' act on the selected
        ;; window instead.
        (let ((window (get-buffer-window (current-buffer))))
          (when window
            (set-window-point window (point))))))))

(defun my/speechkit-toggle-listen (&optional arg)
  "Stop the listener if it is running, otherwise start it.
Prefix ARG is passed on to `my/speechkit-start'."
  (interactive "P")
  (cond
   ((process-live-p my/speechkit-process)
    (my/speechkit-stop)
    (message "SpeechKit stopped"))
   (t
    (my/speechkit-start arg)
    (message "SpeechKit started"))))

;; Announce the `speechkit' feature so that (require 'speechkit) succeeds.
(provide 'speechkit)
	# Based on https://cloud.yandex.ru/ru/docs/speechkit/stt/api/microphone-streaming
	import pyaudio
	import wave
	import argparse
	import grpc
	from datetime import datetime
	import time
	import queue
	import sys

	import yandex.cloud.ai.stt.v3.stt_pb2 as stt_pb2
	import yandex.cloud.ai.stt.v3.stt_service_pb2_grpc as stt_service_pb2_grpc

	# Audio capture settings shared by the microphone stream, the WAV writer
	# and the recognition request.
	FORMAT = pyaudio.paInt16  # sample format passed to audio.open / get_sample_size
	CHANNELS = 1  # channel count used for the input stream and the WAV header
	RATE = 48000  # sample rate in Hz (also sent as sample_rate_hertz)
	CHUNK = 4096  # passed as frames_per_buffer when the stream is opened
	RECORD_SECONDS = 60  # how long reqiter keeps feeding one streaming session


	def save_frames(audio, frames, *, enabled=False):
	    """Optionally dump the captured audio chunks to a timestamped WAV file.

	    Saving is a debugging aid and is off by default, so a plain
	    ``save_frames(audio, frames)`` call does nothing.

	    Args:
	        audio: Object providing ``get_sample_size(FORMAT)``; ``run`` passes
	            its ``pyaudio.PyAudio`` instance.
	        frames: List of raw audio byte strings, in capture order.
	        enabled: Keyword-only switch; pass ``True`` to actually write
	            ``<YYYYmmddHHMMSS>.wav`` into the current directory.
	    """
	    if not (enabled and frames):
	        return
	    now = datetime.now().strftime('%Y%m%d%H%M%S')
	    # The context manager closes the file even when a write fails part-way.
	    with wave.open(f"{now}.wav", 'wb') as wave_file:
	        wave_file.setnchannels(CHANNELS)
	        wave_file.setsampwidth(audio.get_sample_size(FORMAT))
	        wave_file.setframerate(RATE)
	        wave_file.writeframes(b''.join(frames))


	def reqiter(audio_queue, frames, lang):
	    """Generate the request stream for one SpeechKit streaming session.

	    The first request carries the session options; every following request
	    carries one chunk of audio taken from ``audio_queue``.  The generator
	    ends once ``RECORD_SECONDS`` have elapsed, including when no audio
	    arrives before that deadline.

	    Args:
	        audio_queue: ``queue.Queue`` holding raw audio byte strings.
	        frames: List that every chunk sent is appended to.
	        lang: ``'ru'`` or ``'en'``; any other value (including ``None``)
	            selects ``'en-US'``.

	    Yields:
	        ``stt_pb2.StreamingRequest`` messages.
	    """
	    language_code = {'ru': 'ru-RU', 'en': 'en-US'}.get(lang, 'en-US')
	    recognize_options = stt_pb2.StreamingOptions(
	        recognition_model=stt_pb2.RecognitionModelOptions(
	            audio_format=stt_pb2.AudioFormatOptions(
	                raw_audio=stt_pb2.RawAudio(
	                    audio_encoding=stt_pb2.RawAudio.LINEAR16_PCM,
	                    sample_rate_hertz=RATE,
	                    # Same constant the input stream is opened with.
	                    audio_channel_count=CHANNELS,
	                )
	            ),
	            text_normalization=stt_pb2.TextNormalizationOptions(
	                text_normalization=stt_pb2.TextNormalizationOptions.TEXT_NORMALIZATION_ENABLED,
	                profanity_filter=True,
	                literature_text=False,
	            ),
	            language_restriction=stt_pb2.LanguageRestrictionOptions(
	                restriction_type=stt_pb2.LanguageRestrictionOptions.WHITELIST,
	                language_code=[language_code],
	            ),
	            audio_processing_type=stt_pb2.RecognitionModelOptions.REAL_TIME,
	        )
	    )

	    yield stt_pb2.StreamingRequest(session_options=recognize_options)

	    sys.stderr.write("Recording...\n")
	    end_time = time.monotonic() + RECORD_SECONDS
	    while True:
	        remaining = end_time - time.monotonic()
	        if remaining <= 0:
	            break
	        try:
	            # Bounded wait: a stalled microphone must not keep the session
	            # open past the deadline.
	            mic_data = audio_queue.get(timeout=remaining)
	        except queue.Empty:
	            break
	        frames.append(mic_data)
	        yield stt_pb2.StreamingRequest(chunk=stt_pb2.AudioChunk(data=mic_data))


	def run(token, lang):
	    """Capture microphone audio and stream it to SpeechKit until interrupted.

	    Recognition results are printed to stdout as one JSON object per line:
	    ``{"event": <type>, "text": <first alternative>}`` for ``partial``,
	    ``final`` and ``final_refinement`` events, and ``{"event": "eou"}`` for
	    ``eou_update`` events.  Diagnostics go to stderr.  A new streaming
	    session is opened each time the previous one ends; ``KeyboardInterrupt``
	    stops the loop quietly.

	    Args:
	        token: Credential sent as ``Api-Key <token>`` in the
	            ``authorization`` metadata of every streaming call.
	        lang: Language selector forwarded to ``reqiter``.

	    Raises:
	        grpc.RpcError: Re-raised after the status code and details have been
	            written to stderr.
	    """
	    import json  # function-scope import: only this function emits JSON

	    audio = pyaudio.PyAudio()
	    sys.stderr.write(str(audio.get_default_input_device_info()) + "\n")
	    audio_queue = queue.Queue()

	    def callback(input_data, frame_count, time_info, status_flag):
	        # Hand each captured buffer to the request generator via the queue.
	        audio_queue.put_nowait(input_data)
	        return (input_data, pyaudio.paContinue)

	    stream = audio.open(
	        format=FORMAT,
	        channels=CHANNELS,
	        rate=RATE,
	        input=True,
	        frames_per_buffer=CHUNK,
	        stream_callback=callback,
	    )

	    # Bound before the try block so the finally clause can always use it.
	    frames = []
	    try:
	        while True:
	            frames = []
	            cred = grpc.ssl_channel_credentials()
	            # One channel per session; the with-block closes it again so
	            # channels do not pile up across sessions.
	            with grpc.secure_channel('stt.api.cloud.yandex.net:443', cred) as channel:
	                stub = stt_service_pb2_grpc.RecognizerStub(channel)

	                it = stub.RecognizeStreaming(reqiter(audio_queue, frames, lang), metadata=(
	                    ('authorization', f'Api-Key {token}'),
	                ))

	                try:
	                    for r in it:
	                        event_type = r.WhichOneof('Event')
	                        alternatives = None
	                        if event_type == 'partial' and len(r.partial.alternatives) > 0:
	                            alternatives = [a.text for a in r.partial.alternatives]
	                        elif event_type == 'final':
	                            alternatives = [a.text for a in r.final.alternatives]
	                        elif event_type == 'final_refinement':
	                            alternatives = [a.text for a in r.final_refinement.normalized_text.alternatives]
	                        if alternatives and alternatives[0]:
	                            # json.dumps escapes quotes, backslashes and
	                            # newlines, keeping every record on one valid
	                            # line; flush so a piped reader sees it at once.
	                            print(
	                                json.dumps(
	                                    {"event": event_type, "text": alternatives[0]},
	                                    ensure_ascii=False,
	                                ),
	                                flush=True,
	                            )
	                        if event_type == "eou_update":
	                            print('{"event": "eou"}', flush=True)
	                except grpc.RpcError as err:
	                    # Public API instead of the private _Rendezvous/_state.
	                    sys.stderr.write(f'Error code {err.code()}, message: {err.details()}' + "\n")
	                    raise
	            save_frames(audio, frames)
	    except KeyboardInterrupt:
	        pass
	    finally:
	        save_frames(audio, frames)
	        stream.stop_stream()
	        stream.close()
	        audio.terminate()


	if __name__ == '__main__':
	    # Command-line entry point: read the credential and the optional
	    # language switch, then start the capture/recognition loop.
	    parser = argparse.ArgumentParser()
	    parser.add_argument('--token', required=True, help='API key or IAM token')
	    parser.add_argument('-l', dest="lang", help='Language')
	    args = parser.parse_args()
	    run(args.token, args.lang)
	#!/bin/bash
	# Start listen.py with the interpreter of the speechkit-listener virtualenv.
	# listen.py is resolved relative to the current working directory.
	# NOTE(review): XXX looks like a placeholder for the real API key - confirm.
	#
	# $@ is intentionally left unquoted: speechkit.el passes the language switch
	# as the single word "-l ru", and word splitting turns it into two arguments.
	# exec replaces this shell with Python, so signals sent to the process reach
	# the interpreter directly and its exit status is reported unchanged.
	exec ~/.virtualenvs/speechkit-listener/bin/python listen.py --token XXX $@
	;; Based on https://sachachua.com/blog/2023/12/live-speech-with-deepgram/

	;; sqlite3 ~/Library/Application\ Support/com.apple.TCC/TCC.db
	;; insert into access values ('kTCCServiceMicrophone', 'org.gnu.Emacs', 0, 2, 4, 1, null, null, null, 'UNUSED', null, null, 1704007344);

	;; Buffer names carry the conventional surrounding asterisks again; this
	;; copy had lost them, unlike the other definitions of the same variables.

	;; Name of the buffer that receives the transcript by default.
	(defvar my/speechkit-buffer "*Speech*")
	;; Buffer the transcript is currently written to; set by `my/speechkit-start'.
	(defvar my/speechkit-current-buffer nil)
	;; Process object of the listener, or nil before the first start.
	(defvar my/speechkit-process nil)
	;; Name of the buffer the listener's stdout is collected in.
	(defvar my/speechkit-stdout-buffer "*Speech JSON*")
	;; Name of the buffer the listener's stderr is collected in.
	(defvar my/speechkit-stderr-buffer "*Speech stderr*")
	;; NOTE(review): not referenced by any function visible here - confirm use.
	(defvar my/speechkit-auto-scroll t)
	;; NOTE(review): not referenced by any function visible here - confirm use.
	(defvar my/speechkit--change-group nil)

	(defun my/speechkit-start (&optional arg)
	  "Turn on live captions.
	With prefix ARG, write the transcript into the current buffer;
	otherwise use the buffer named by `my/speechkit-buffer'.  The
	language is \"ru\" when the name of the current input method
	contains \"cyrillic\" and \"en\" otherwise.  If the listener is
	already running, only the target buffer is switched."
	  (interactive "P")
	  (let ((lang (if (and current-input-method
	                       ;; `string-match-p' leaves the match data alone.
	                       (string-match-p "cyrillic" current-input-method))
	                  "ru"
	                "en")))
	    (with-current-buffer (if arg (current-buffer) (get-buffer-create my/speechkit-buffer))
	      (setq my/speechkit-current-buffer (current-buffer))
	      (unless (process-live-p my/speechkit-process)
	        ;; `default-directory' is expected to end in a slash.
	        (let ((default-directory (file-name-as-directory "~/play/speechkit")))
	          (with-current-buffer (get-buffer-create my/speechkit-stdout-buffer)
	            (erase-buffer))
	          (setq my/speechkit-process
	                (make-process
	                 ;; Switch and value are separate arguments, so the command
	                 ;; does not depend on the wrapper script re-splitting them.
	                 :command (list "bash" "run-listen.sh" "-l" lang)
	                 :name "speech"
	                 :filter #'my/speechkit-json-filter
	                 :sentinel #'my/speechkit-process-sentinel
	                 :buffer my/speechkit-stdout-buffer
	                 :stderr my/speechkit-stderr-buffer))))
	      (display-buffer (current-buffer)))))

	(defun my/speechkit-stop ()
	  "Interrupt the listener process if it is still running."
	  (interactive)
	  (when (process-live-p my/speechkit-process)
	    (interrupt-process my/speechkit-process)))

	(defun my/speechkit-process-sentinel (proc event)
	  "Report when listener PROC ends for a reason other than a clean exit.
	EVENT is the status string Emacs hands to sentinels.  A
	\"finished\" event needs no action because the process has
	already exited by then; any other terminal event is shown in the
	echo area together with the name of the stderr buffer."
	  (unless (process-live-p proc)
	    (unless (string-match-p "finished" event)
	      (message "SpeechKit listener %s; see buffer %s"
	               ;; EVENT normally ends with a newline; drop it for display.
	               (replace-regexp-in-string "[\r\n]+\\'" "" event)
	               my/speechkit-stderr-buffer))))

	(defun my/speechkit-json-filter (proc string)
	  "Append output STRING of PROC and handle each complete JSON line.
	STRING is inserted at the process mark of PROC's buffer.  Every
	newline-terminated line is then removed from the start of the
	buffer, parsed as a JSON object and passed to
	`my/speechkit-display-in-speech-buffer'; an unfinished last line
	stays in the buffer until the rest of it arrives.  A line that
	cannot be parsed is reported with `message' and skipped."
	  (when (buffer-live-p (process-buffer proc))
	    (with-current-buffer (process-buffer proc)
	      (let* ((proc-mark (process-mark proc))
	             (moving (= (point) proc-mark)))
	        ;; Insert the output at the process mark.
	        (save-excursion
	          (goto-char proc-mark)
	          (insert string)
	          (set-marker proc-mark (point)))
	        (if moving (goto-char proc-mark))
	        ;; Consume all complete lines (a line is complete if it ends with \n).
	        (let ((pos (point-min)))
	          (while (progn (goto-char pos)
	                        (end-of-line)
	                        (equal (following-char) ?\n))
	            (let* ((end (point))
	                   (line (buffer-substring pos end))
	                   ;; A bad line must not abort the filter: the remaining
	                   ;; lines of this chunk still have to be processed.
	                   (event (condition-case err
	                              (json-parse-string line :object-type 'alist)
	                            (error
	                             (message "SpeechKit: ignoring unparsable line %S (%s)"
	                                      line (error-message-string err))
	                             nil))))
	              (delete-region pos (+ end 1))
	              (when event
	                (my/speechkit-display-in-speech-buffer event)))))))))

	(defun my/speechkit-display-in-speech-buffer (json-object)
	  "Apply one recognition event JSON-OBJECT to the transcript buffer.
	JSON-OBJECT is an alist with an `event' key and, for text events,
	a `text' key.  \"partial\" and \"final\" replace the current line
	with the text, \"final_refinement\" replaces it and starts a new
	line, and \"eou\" ends the current line if it is not empty.
	Nothing happens when `my/speechkit-current-buffer' is not a live
	buffer."
	  (when (buffer-live-p my/speechkit-current-buffer)
	    (with-current-buffer my/speechkit-current-buffer
	      (let-alist json-object
	        (cond
	         ((equal .event "eou")
	          (end-of-line)
	          (unless (bolp)
	            (insert "\n")))
	         ((member .event '("partial" "final"))
	          (beginning-of-line)
	          ;; `delete-region' instead of `kill-line': interim results must
	          ;; not end up on the kill ring.
	          (delete-region (point) (line-end-position))
	          (insert .text))
	         ((equal .event "final_refinement")
	          (beginning-of-line)
	          (delete-region (point) (line-end-position))
	          (insert .text "\n")
	          (end-of-line)))
	        ;; Only follow point in a window that actually shows this buffer;
	        ;; a nil window would make `set-window-point' act on the selected
	        ;; window instead.
	        (let ((window (get-buffer-window (current-buffer))))
	          (when window
	            (set-window-point window (point))))))))

	(defun my/speechkit-toggle-listen (&optional arg)
	  "Stop the listener if it is running, otherwise start it.
	Prefix ARG is passed on to `my/speechkit-start'."
	  (interactive "P")
	  (cond
	   ((process-live-p my/speechkit-process)
	    (my/speechkit-stop)
	    (message "SpeechKit stopped"))
	   (t
	    (my/speechkit-start arg)
	    (message "SpeechKit started"))))

	;; Announce the `speechkit' feature so that (require 'speechkit) succeeds.
	(provide 'speechkit)