Skip to content

Instantly share code, notes, and snippets.

Created July 29, 2021 21:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lmcarreiro/bf549eea7efb5570fcd99c2dacd598d0 to your computer and use it in GitHub Desktop.
Save lmcarreiro/bf549eea7efb5570fcd99c2dacd598d0 to your computer and use it in GitHub Desktop.
STT+VAD article - useSpeechToText - Final
import React from "react";
import hark from "hark";
import * as speech from "microsoft-cognitiveservices-speech-sdk";
// Sample rate (Hz) requested as the ideal capture rate from getUserMedia and used for the AudioContext.
const AUDIO_SAMPLE_RATE = 48_000;
// Values handed to speech.SpeechConfig.fromAuthorizationToken (authorization token + service region).
// NOTE(review): the token is hard-coded in source; presumably a placeholder — confirm a real token is never committed.
const azureCredentials = {
token: "***********************",
region: "japaneast",
/**
 * React hook that requests a microphone stream, sets up an Azure Speech SDK recognizer over a
 * push audio stream, and reports interim and final transcripts through `newMessage`.
 *
 * NOTE(review): this copy of the hook appears to have lost lines (closing braces and several
 * statements), so the comments below describe only what is still visible.
 *
 * @param speechToTextEnabled When true, a microphone stream is requested and the recognizer is set up.
 * @param muted When true, `shouldStream` is always false, so voice activity never turns streaming on.
 * @param newMessage Called with `isFinal: false` from the `recognizing` handler and with
 *   `isFinal: true` from the `recognized` handler when the result reason is `RecognizedSpeech`.
 *   It is listed in an effect dependency array, so a new function identity re-runs that effect.
 */
export default function useSpeechToText(
speechToTextEnabled: boolean,
muted: boolean,
newMessage: (message: { text: string; isFinal: boolean }) => void,
) {
// This is the ID of the selected input device.
// You can list all devices using `navigator.mediaDevices.enumerateDevices()`
const inputDevice = "default";
// Microphone stream obtained below; undefined when speech to text is disabled.
const [stream, setStream] = React.useState<MediaStream>();
// Set to true when the STT effect starts and back to false in its cleanup and in the canceled handler.
const running = React.useRef<boolean>(false);
// Audio blocks (duration + converted bytes) kept while not streaming; reset to [] on cleanup.
const bufferBlocks = React.useRef<{ duration: number; bytes: ArrayBufferLike }[]>([]);
// useAudioActive is defined outside this view; presumably the hark-based voice activity flag — confirm.
const speakerActive = useAudioActive(stream);
// Kept in a ref (not state) because it is read inside the audioprocess callback.
const streamingFlagRef = React.useRef<boolean>(false);
// NOTE(review): running.current is a ref read during render; changing it does not trigger a
// re-render by itself, so this value only refreshes when something else re-renders the hook.
const shouldStream = !muted && speakerActive && running.current;
// Initialize the MediaStream calling getUserMedia
// NOTE(review): no handling of a rejected getUserMedia promise is visible here.
React.useEffect(() => {
(async () => {
const newStream = speechToTextEnabled
? await navigator.mediaDevices.getUserMedia({
audio: { deviceId: inputDevice, channelCount: 1, sampleRate: { ideal: AUDIO_SAMPLE_RATE } },
video: false,
: undefined;
// Stop every track of the previous stream before swapping in the new one.
setStream(currentStream => {
currentStream?.getTracks().forEach(t => t.stop());
return newStream;
}, [speechToTextEnabled, inputDevice]);
// Control the streaming flag, based on the voice activity detection (that uses hark) and the mute/unmute flag
// The flag is raised immediately, but only lowered by a 2 second timer once shouldStream turns false.
React.useEffect(() => {
if (shouldStream) {
if (!streamingFlagRef.current) {
console.log("Voice activity detected, starting streaming current buffer + live streaming...");
streamingFlagRef.current = true;
} else {
// Nothing to stop if streaming was never started.
if (!streamingFlagRef.current) return;
console.log("Stop detecting voice activity, will stop streaming in 2 seconds...");
const stopStreamingTimer = setTimeout(() => {
console.log("Stopped streaming after 2 seconds without voice activity.");
streamingFlagRef.current = false;
}, 2_000);
// NOTE(review): the cleanup presumably cancels stopStreamingTimer; that statement is not visible here.
return () => {
if (streamingFlagRef.current) {
console.log("Voice activity detected, continue streaming...");
}, [shouldStream]);
// Initialize the Azure Speech to Text instance and bind the necessary events
React.useEffect(() => {
if (speechToTextEnabled && stream) {
// Push stream that is later given to the recognizer as its audio input.
const pushStream = speech.AudioInputStream.createPushStream();
// Frames per audioprocess callback: 16384 frames is about 0.34 s at AUDIO_SAMPLE_RATE (48 kHz).
const bufferSize = 16384;
const context: AudioContext = new AudioContext({ sampleRate: AUDIO_SAMPLE_RATE });
console.log(new Date().toISOString(), "Initializing STT on:", stream.getTracks()[0].label);
const input = context.createMediaStreamSource(stream);
// NOTE(review): createScriptProcessor is deprecated in the Web Audio API in favour of AudioWorklet.
const processor = context.createScriptProcessor(bufferSize, 1, 1);
const output = context.destination;
// Converts each captured block of channel 0 and routes it according to the streaming flag.
const onAudioProcess = (ev: AudioProcessingEvent) => {
const block = {
duration: ev.inputBuffer.duration,
bytes: convertFloat32ToInt16(ev.inputBuffer.getChannelData(0)),
// If not streaming, keep the current audio in the buffer so it can be sent once streaming starts
if (!streamingFlagRef.current) {
// Total seconds of audio currently held in the buffer, compared against BUFFER_SECONDS (defined outside this view).
const totalDuration = bufferBlocks.current.reduce((sum, curr) => sum + curr.duration, 0);
if (totalDuration >= BUFFER_SECONDS) {
// If streaming, first send the data currently in the buffer (if there is any), then send the live audio
else {
while (bufferBlocks.current.length) {
processor.addEventListener("audioprocess", onAudioProcess);
const speechConfig = speech.SpeechConfig.fromAuthorizationToken(azureCredentials.token, azureCredentials.region);
const audioConfig = speech.AudioConfig.fromStreamInput(pushStream);
speechConfig.speechRecognitionLanguage = "en-US";
const recognizer = new speech.SpeechRecognizer(speechConfig, audioConfig);
// Interim hypotheses: forwarded as non-final messages while the hook is running.
recognizer.recognizing = (s, e) => {
if (!running.current) return;
console.log("RECOGNIZING", `Text= ${e.result.text}`, e);
newMessage({ isFinal: false, text: e.result.text });
// Final results: forwarded only when the reason is RecognizedSpeech; NoMatch is just logged.
recognizer.recognized = (s, e) => {
if (!running.current) return;
if (e.result.reason === speech.ResultReason.RecognizedSpeech) {
console.log("RECOGNIZED", `Text= ${e.result.text}`, e);
newMessage({ isFinal: true, text: e.result.text });
} else if (e.result.reason === speech.ResultReason.NoMatch) {
console.log("NO MATCH", e);
// Cancellation: logs the reason, plus error code/details when the reason is Error.
recognizer.canceled = (s, e) => {
console.log("CANCELED", "Error=", e);
console.log("CANCELED", `Reason=${e.reason}`);
if (e.reason === speech.CancellationReason.Error) {
console.log("CANCELED", `ErrorCode=${e.errorCode}`);
console.log("CANCELED", `ErrorDetails=${e.errorDetails}`);
running.current = false;
recognizer.sessionStopped = (s, e) => {
console.log("STOPPED", e);
// NOTE(review): this assigns false when the flag is already false; as visible it looks like a no-op — confirm intent.
if (!running.current) {
running.current = false;
console.log("############## start()");
running.current = true;
// Cleanup: mark as stopped, drop buffered audio and detach the audio callback.
// NOTE(review): no teardown of the AudioContext or the recognizer is visible here.
return () => {
console.log("############## stop()");
running.current = false;
bufferBlocks.current = [];
processor.removeEventListener("audioprocess", onAudioProcess);
}, [speechToTextEnabled, newMessage, stream]);
/**
 * Converts a block of float32 PCM samples to 16-bit signed PCM while keeping only
 * every third sample (e.g. 48 kHz in, 16 kHz out). Necessary for streaming.
 *
 * Each kept sample is clamped to [-1, 1] before scaling, so loud input saturates at
 * the int16 limits instead of wrapping around to the opposite sign.
 *
 * NOTE: the rate reduction simply drops samples; no low-pass filter is applied.
 *
 * @param float32Array Buffer being converted
 * @returns The buffer backing an Int16Array of `Math.floor(float32Array.length / 3)` samples
 */
function convertFloat32ToInt16(float32Array: Float32Array) {
  // Keep one sample out of every three (48_000 / 3 = 16_000).
  const decimation = 3;
  // Only whole groups of three produce an output sample.
  const outLength = Math.floor(float32Array.length / decimation);
  const buf = new Int16Array(outLength);
  for (let i = 0; i < outLength; i++) {
    const sample = Math.max(-1, Math.min(1, float32Array[i * decimation]));
    // Negative and positive halves of int16 differ by one: -32768..32767.
    buf[i] = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
  }
  return buf.buffer;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment