@lmcarreiro · Created July 29, 2021 21:01
STT+VAR article - useSpeechToText.ts
import React from "react";
import hark from "hark"; // voice activity detection library; not used in this snippet
import * as speech from "microsoft-cognitiveservices-speech-sdk";

// Capture at 48 kHz: convertFloat32ToInt16() below keeps every 3rd sample,
// producing the 16 kHz, 16-bit mono PCM the SDK's push stream expects by default.
const AUDIO_SAMPLE_RATE = 48_000;

const azureCredentials = {
  token: "***********************",
  region: "japaneast",
};
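// A sketch of how such a token is usually obtained: your backend exchanges the
// subscription key for a short-lived (~10 minute) authorization token and
// hands it to the client. The `/api/speech-token` endpoint and the
// `fetchAzureCredentials` helper here are hypothetical, not part of this gist:
//
//   async function fetchAzureCredentials(): Promise<{ token: string; region: string }> {
//     const res = await fetch("/api/speech-token");
//     return res.json();
//   }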
export default function useSpeechToText(
  speechToTextEnabled: boolean,
  muted: boolean,
  newMessage: (message: { text: string; isFinal: boolean }) => void,
) {
  // This is the ID of the selected input device. You can list all devices
  // using `navigator.mediaDevices.enumerateDevices()` (see the sketch after
  // the getUserMedia effect below).
  const inputDevice = "default";

  const [stream, setStream] = React.useState<MediaStream>();
  const running = React.useRef<boolean>(false);

  // We read the muted flag through a ref to avoid recreating Azure's
  // SpeechRecognizer instance every time we mute/unmute.
  const mutedRef = React.useRef<boolean>(muted);
  React.useEffect(() => {
    mutedRef.current = muted;
  }, [muted]);
  // Initialize the MediaStream calling getUserMedia
  React.useEffect(() => {
    (async () => {
      const newStream = speechToTextEnabled
        ? await navigator.mediaDevices.getUserMedia({
            audio: { deviceId: inputDevice, channelCount: 1, sampleRate: { ideal: AUDIO_SAMPLE_RATE } },
            video: false,
          })
        : undefined;

      // Stop the previous stream's tracks before swapping in the new one.
      setStream(currentStream => {
        currentStream?.getTracks().forEach(t => t.stop());
        return newStream;
      });
    })();
  }, [speechToTextEnabled, inputDevice]);
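  // For reference, a sketch of listing audio input devices (note: most
  // browsers only expose device labels after a successful getUserMedia call):
  //
  //   const devices = await navigator.mediaDevices.enumerateDevices();
  //   const microphones = devices.filter(d => d.kind === "audioinput");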
  // Initialize the Azure Speech to Text instance and bind the necessary events
  React.useEffect(() => {
    if (speechToTextEnabled && stream) {
      const pushStream = speech.AudioInputStream.createPushStream();
      const bufferSize = 16384;
      const context: AudioContext = new AudioContext({ sampleRate: AUDIO_SAMPLE_RATE });
      console.log(new Date().toISOString(), "Initializing STT on:", stream.getTracks()[0].label);

      // Route the microphone through a ScriptProcessorNode so we can grab raw
      // PCM blocks. (ScriptProcessorNode is deprecated in favor of
      // AudioWorklet, but it is still widely supported and much simpler here.)
      const input = context.createMediaStreamSource(stream);
      const processor = context.createScriptProcessor(bufferSize, 1, 1);
      const output = context.destination;
      input.connect(processor);
      processor.connect(output);
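      // The push stream created above relies on the SDK's default format
      // (16 kHz, 16-bit, mono PCM). If you prefer to be explicit, you can
      // declare the format yourself, e.g.:
      //
      //   const format = speech.AudioStreamFormat.getWaveFormatPCM(16_000, 16, 1);
      //   const pushStream = speech.AudioInputStream.createPushStream(format);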
      const onAudioProcess = (ev: AudioProcessingEvent) => {
        const block = {
          duration: ev.inputBuffer.duration,
          bytes: convertFloat32ToInt16(ev.inputBuffer.getChannelData(0)),
        };
        // While muted, we simply stop feeding audio to the recognizer.
        if (!mutedRef.current) {
          pushStream.write(block.bytes);
        }
      };
      processor.addEventListener("audioprocess", onAudioProcess);
      // AudioContexts may start suspended under browser autoplay policies,
      // so resume explicitly.
      context.resume();
      // `fromAuthorizationToken` expects a short-lived token issued by the
      // Speech service (see the sketch near the top), not the subscription key.
      const speechConfig = speech.SpeechConfig.fromAuthorizationToken(azureCredentials.token, azureCredentials.region);
      const audioConfig = speech.AudioConfig.fromStreamInput(pushStream);
      speechConfig.speechRecognitionLanguage = "en-US";
      const recognizer = new speech.SpeechRecognizer(speechConfig, audioConfig);
      // `recognizing` fires with partial (interim) hypotheses while the user
      // is still speaking; `recognized` fires once per utterance with the
      // final text.
      recognizer.recognizing = (s, e) => {
        if (!running.current) return;
        console.log("RECOGNIZING", `Text= ${e.result.text}`, e);
        newMessage({ isFinal: false, text: e.result.text });
      };
      recognizer.recognized = (s, e) => {
        if (!running.current) return;
        if (e.result.reason === speech.ResultReason.RecognizedSpeech) {
          console.log("RECOGNIZED", `Text= ${e.result.text}`, e);
          newMessage({ isFinal: true, text: e.result.text });
        } else if (e.result.reason === speech.ResultReason.NoMatch) {
          console.log("NO MATCH", e);
        }
      };
      recognizer.canceled = (s, e) => {
        console.log("CANCELED", "Error=", e);
        console.log("CANCELED", `Reason=${e.reason}`);
        if (e.reason === speech.CancellationReason.Error) {
          console.log("CANCELED", `ErrorCode=${e.errorCode}`);
          console.log("CANCELED", `ErrorDetails=${e.errorDetails}`);
        }
        recognizer.stopContinuousRecognitionAsync();
        running.current = false;
      };
      recognizer.sessionStopped = (s, e) => {
        console.log("STOPPED", e);
        if (!running.current) {
          recognizer.stopContinuousRecognitionAsync();
          running.current = false;
        }
      };
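      // Tokens issued by the Speech service expire (after ~10 minutes). For
      // long-lived sessions you can refresh one in place via the
      // `recognizer.authorizationToken` setter instead of tearing down and
      // recreating the recognizer.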
console.log("############## start()");
recognizer.startContinuousRecognitionAsync();
running.current = true;
return () => {
console.log("############## stop()");
recognizer.stopContinuousRecognitionAsync();
running.current = false;
processor.removeEventListener("audioprocess", onAudioProcess);
processor.disconnect(output);
input.disconnect(processor);
context.close();
};
}
}, [speechToTextEnabled, newMessage, stream]);
}
/**
 * Converts a Float32Array buffer to 16-bit PCM, decimating 3:1 in the process
 * (48,000 Hz capture -> 16,000 Hz output). Necessary for streaming, since the
 * SDK's push stream expects 16 kHz, 16-bit, mono PCM by default.
 *
 * @param {Float32Array} float32Array Buffer being converted
 */
function convertFloat32ToInt16(float32Array: Float32Array) {
  let l = float32Array.length;
  // Keep every 3rd sample. Math.ceil avoids writing past the end of the
  // buffer when the input length isn't a multiple of 3. (Plain decimation
  // without a low-pass filter can alias, but is acceptable for speech here.)
  const buf = new Int16Array(Math.ceil(l / 3));
  while (l--) {
    if (l % 3 === 0) {
      // Clamp to [-1, 1] and scale to the signed 16-bit range to avoid
      // overflowing int16.
      const sample = Math.max(-1, Math.min(1, float32Array[l]));
      buf[l / 3] = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
    }
  }
  return buf.buffer;
}
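// Example usage (a minimal sketch; the `Captions` component and its state
// handling are hypothetical, not part of the original gist):
//
//   function Captions({ muted }: { muted: boolean }) {
//     const [lines, setLines] = React.useState<string[]>([]);
//
//     // Memoized so the recognizer effect isn't rebuilt on every render.
//     const newMessage = React.useCallback((m: { text: string; isFinal: boolean }) => {
//       if (m.isFinal) setLines(prev => [...prev, m.text]);
//     }, []);
//
//     useSpeechToText(true, muted, newMessage);
//
//     return <ul>{lines.map((l, i) => <li key={i}>{l}</li>)}</ul>;
//   }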