@lmcarreiro · Created July 29, 2021 21:28
STT+VAD article - useSpeechToText - Final

import React from "react";
import hark from "hark";
import * as speech from "microsoft-cognitiveservices-speech-sdk";

const AUDIO_SAMPLE_RATE = 48_000;
const BUFFER_SECONDS = 2;

const azureCredentials = {
  token: "***********************",
  region: "japaneast",
};
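
// The authorization token above is short-lived; it is typically fetched from the
// Azure token endpoint (a sketch, with a hypothetical subscription key placeholder):
//   const res = await fetch(
//     `https://${azureCredentials.region}.api.cognitive.microsoft.com/sts/v1.0/issueToken`,
//     { method: "POST", headers: { "Ocp-Apim-Subscription-Key": "<your-subscription-key>" } },
//   );
//   const token = await res.text();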

export default function useSpeechToText(
  speechToTextEnabled: boolean,
  muted: boolean,
  newMessage: (message: { text: string; isFinal: boolean }) => void,
) {
  // This is the ID of the selected input device.
  // You can list all devices using `navigator.mediaDevices.enumerateDevices()`,
  // as sketched below.
  const inputDevice = "default";
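  // A sketch of that device listing (illustrative, not part of the original hook):
  //   const devices = await navigator.mediaDevices.enumerateDevices();
  //   const microphones = devices.filter(d => d.kind === "audioinput");
  //   // each entry exposes a `deviceId` (usable above) and a human-readable `label`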
  const [stream, setStream] = React.useState<MediaStream>();
  const running = React.useRef<boolean>(false);
  const bufferBlocks = React.useRef<{ duration: number; bytes: ArrayBufferLike }[]>([]);
  const speakerActive = useAudioActive(stream);
  const streamingFlagRef = React.useRef<boolean>(false);

  const shouldStream = !muted && speakerActive && running.current;
  // Initialize the MediaStream by calling getUserMedia
  React.useEffect(() => {
    (async () => {
      const newStream = speechToTextEnabled
        ? await navigator.mediaDevices.getUserMedia({
            audio: { deviceId: inputDevice, channelCount: 1, sampleRate: { ideal: AUDIO_SAMPLE_RATE } },
            video: false,
          })
        : undefined;
      setStream(currentStream => {
        // Stop the previous stream's tracks (if any) before replacing it
        currentStream?.getTracks().forEach(t => t.stop());
        return newStream;
      });
    })();
  }, [speechToTextEnabled, inputDevice]);
  // Control the streaming flag based on voice activity detection (via hark) and the mute/unmute flag
  React.useEffect(() => {
    if (shouldStream) {
      if (!streamingFlagRef.current) {
        console.log("Voice activity detected, starting streaming current buffer + live streaming...");
        streamingFlagRef.current = true;
      }
    } else {
      if (!streamingFlagRef.current) return;
      console.log("No more voice activity detected, will stop streaming in 2 seconds...");
      const stopStreamingTimer = setTimeout(() => {
        console.log("Stopped streaming after 2 seconds without voice activity.");
        streamingFlagRef.current = false;
      }, 2_000);
      // If voice activity resumes before the timer fires, this cleanup cancels the stop
      return () => {
        if (streamingFlagRef.current) {
          console.log("Voice activity detected, continue streaming...");
        }
        clearTimeout(stopStreamingTimer);
      };
    }
  }, [shouldStream]);
  // Initialize the Azure Speech to Text instance and bind the necessary events
  React.useEffect(() => {
    if (speechToTextEnabled && stream) {
      const pushStream = speech.AudioInputStream.createPushStream();
      const bufferSize = 16384;
      const context: AudioContext = new AudioContext({ sampleRate: AUDIO_SAMPLE_RATE });
      console.log(new Date().toISOString(), "Initializing STT on:", stream.getTracks()[0].label);
      // Audio graph: microphone input -> script processor (grabs raw PCM) -> destination.
      // ScriptProcessorNode is deprecated in favor of AudioWorklet, but still widely supported.
      const input = context.createMediaStreamSource(stream);
      const processor = context.createScriptProcessor(bufferSize, 1, 1);
      const output = context.destination;
      input.connect(processor);
      processor.connect(output);
      const onAudioProcess = (ev: AudioProcessingEvent) => {
        const block = {
          duration: ev.inputBuffer.duration,
          bytes: convertFloat32ToInt16(ev.inputBuffer.getChannelData(0)),
        };
        // If not streaming, keep the last BUFFER_SECONDS of audio buffered, to be sent when streaming starts
        if (!streamingFlagRef.current) {
          const totalDuration = bufferBlocks.current.reduce((sum, curr) => sum + curr.duration, 0);
          if (totalDuration >= BUFFER_SECONDS) {
            bufferBlocks.current.shift();
          }
          bufferBlocks.current.push(block);
        }
        // If streaming, first flush any buffered audio, then send the current live block
        else {
          while (bufferBlocks.current.length) {
            pushStream.write(bufferBlocks.current.shift()!.bytes);
          }
          pushStream.write(block.bytes);
        }
      };
      processor.addEventListener("audioprocess", onAudioProcess);
      context.resume();
      const speechConfig = speech.SpeechConfig.fromAuthorizationToken(azureCredentials.token, azureCredentials.region);
      const audioConfig = speech.AudioConfig.fromStreamInput(pushStream);
      speechConfig.speechRecognitionLanguage = "en-US";
      const recognizer = new speech.SpeechRecognizer(speechConfig, audioConfig);
      // "recognizing" fires with partial (interim) hypotheses while the user is speaking
      recognizer.recognizing = (s, e) => {
        if (!running.current) return;
        console.log("RECOGNIZING", `Text= ${e.result.text}`, e);
        newMessage({ isFinal: false, text: e.result.text });
      };
      // "recognized" fires with the final result for an utterance
      recognizer.recognized = (s, e) => {
        if (!running.current) return;
        if (e.result.reason === speech.ResultReason.RecognizedSpeech) {
          console.log("RECOGNIZED", `Text= ${e.result.text}`, e);
          newMessage({ isFinal: true, text: e.result.text });
        } else if (e.result.reason === speech.ResultReason.NoMatch) {
          console.log("NO MATCH", e);
        }
      };
      recognizer.canceled = (s, e) => {
        console.log("CANCELED", "Error=", e);
        console.log("CANCELED", `Reason=${e.reason}`);
        if (e.reason === speech.CancellationReason.Error) {
          console.log("CANCELED", `ErrorCode=${e.errorCode}`);
          console.log("CANCELED", `ErrorDetails=${e.errorDetails}`);
        }
        recognizer.stopContinuousRecognitionAsync();
        running.current = false;
      };
      recognizer.sessionStopped = (s, e) => {
        console.log("STOPPED", e);
        if (!running.current) {
          recognizer.stopContinuousRecognitionAsync();
          running.current = false;
        }
      };
console.log("############## start()");
recognizer.startContinuousRecognitionAsync();
running.current = true;
return () => {
console.log("############## stop()");
recognizer.stopContinuousRecognitionAsync();
running.current = false;
bufferBlocks.current = [];
processor.removeEventListener("audioprocess", onAudioProcess);
processor.disconnect(output);
input.disconnect(processor);
context.close();
};
}
}, [speechToTextEnabled, newMessage, stream]);
}
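
/**
 * Example usage (a sketch, not part of the original gist): `CaptionsExample` and
 * its message-merging logic are illustrative. Note that `newMessage` is in the
 * recognizer effect's dependency array, so it should be memoized with
 * `React.useCallback`; otherwise every render tears down and recreates the recognizer.
 */
export function CaptionsExample({ muted }: { muted: boolean }) {
  const [messages, setMessages] = React.useState<{ text: string; isFinal: boolean }[]>([]);

  const newMessage = React.useCallback((message: { text: string; isFinal: boolean }) => {
    // Replace the trailing partial result; append once the previous one is final
    setMessages(prev =>
      prev.length && !prev[prev.length - 1].isFinal ? [...prev.slice(0, -1), message] : [...prev, message],
    );
  }, []);

  useSpeechToText(true, muted, newMessage);

  return React.createElement("pre", null, messages.map(m => m.text).join("\n"));
}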

/**
 * Converts a buffer from float32 to int16 and downsamples it by a factor of 3
 * (48 kHz -> 16 kHz), the default PCM input format expected by the Azure push stream.
 *
 * @param {Float32Array} float32Array Buffer being converted
 */
function convertFloat32ToInt16(float32Array: Float32Array) {
  let l = float32Array.length;
  const buf = new Int16Array(Math.ceil(float32Array.length / 3));
  while (l--) {
    // Keep every 3rd sample, scaled from [-1, 1] to the int16 range
    if (l % 3 === 0) {
      buf[l / 3] = float32Array[l] * 0x7fff;
    }
  }
  return buf.buffer;
}
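
/**
 * The hook above calls `useAudioActive`, which is not included in this gist
 * (hark is imported for it). A minimal sketch of what it likely looks like,
 * assuming it simply mirrors hark's `speaking` / `stopped_speaking` events
 * into a boolean:
 */
function useAudioActive(stream: MediaStream | undefined): boolean {
  const [active, setActive] = React.useState(false);

  React.useEffect(() => {
    if (!stream) return;
    const harker = hark(stream, { interval: 100 });
    harker.on("speaking", () => setActive(true));
    harker.on("stopped_speaking", () => setActive(false));
    return () => {
      harker.stop();
      setActive(false);
    };
  }, [stream]);

  return active;
}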