@lmcarreiro · Created July 29, 2021 21:01
STT+VAR article - useSpeechToText.ts
import React from "react";
import hark from "hark"; // voice activity detection library; not used in this snippet
import * as speech from "microsoft-cognitiveservices-speech-sdk";

// Capture at 48 kHz: convertFloat32ToInt16() below keeps every 3rd sample,
// producing the 16 kHz, 16-bit mono PCM the SDK's push stream expects by default.
const AUDIO_SAMPLE_RATE = 48_000;

const azureCredentials = {
  token: "***********************",
  region: "japaneast",
};
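// A sketch of how such a token is usually obtained: your backend exchanges the
// subscription key for a short-lived (~10 minute) authorization token and
// hands it to the client. The `/api/speech-token` endpoint and the
// `fetchAzureCredentials` helper here are hypothetical, not part of this gist:
//
//   async function fetchAzureCredentials(): Promise<{ token: string; region: string }> {
//     const res = await fetch("/api/speech-token");
//     return res.json();
//   }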
export default function useSpeechToText(
  speechToTextEnabled: boolean,
  muted: boolean,
  newMessage: (message: { text: string; isFinal: boolean }) => void,
) {
  // This is the ID of the selected input device. You can list all devices
  // using `navigator.mediaDevices.enumerateDevices()` (see the sketch after
  // the getUserMedia effect below).
  const inputDevice = "default";

  const [stream, setStream] = React.useState<MediaStream>();
  const running = React.useRef<boolean>(false);

  // We read the muted flag through a ref to avoid recreating Azure's
  // SpeechRecognizer instance every time we mute/unmute.
  const mutedRef = React.useRef<boolean>(muted);
  React.useEffect(() => {
    mutedRef.current = muted;
  }, [muted]);
  // Initialize the MediaStream calling getUserMedia
  React.useEffect(() => {
    (async () => {
      const newStream = speechToTextEnabled
        ? await navigator.mediaDevices.getUserMedia({
            audio: { deviceId: inputDevice, channelCount: 1, sampleRate: { ideal: AUDIO_SAMPLE_RATE } },
            video: false,
          })
        : undefined;

      // Stop the previous stream's tracks before swapping in the new one.
      setStream(currentStream => {
        currentStream?.getTracks().forEach(t => t.stop());
        return newStream;
      });
    })();
  }, [speechToTextEnabled, inputDevice]);
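  // For reference, a sketch of listing audio input devices (note: most
  // browsers only expose device labels after a successful getUserMedia call):
  //
  //   const devices = await navigator.mediaDevices.enumerateDevices();
  //   const microphones = devices.filter(d => d.kind === "audioinput");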
  // Initialize the Azure Speech to Text instance and bind the necessary events
  React.useEffect(() => {
    if (speechToTextEnabled && stream) {
      const pushStream = speech.AudioInputStream.createPushStream();
      const bufferSize = 16384;
      const context: AudioContext = new AudioContext({ sampleRate: AUDIO_SAMPLE_RATE });
      console.log(new Date().toISOString(), "Initializing STT on:", stream.getTracks()[0].label);

      // Route the microphone through a ScriptProcessorNode so we can grab raw
      // PCM blocks. (ScriptProcessorNode is deprecated in favor of
      // AudioWorklet, but it is still widely supported and much simpler here.)
      const input = context.createMediaStreamSource(stream);
      const processor = context.createScriptProcessor(bufferSize, 1, 1);
      const output = context.destination;
      input.connect(processor);
      processor.connect(output);
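      // The push stream created above relies on the SDK's default format
      // (16 kHz, 16-bit, mono PCM). If you prefer to be explicit, you can
      // declare the format yourself, e.g.:
      //
      //   const format = speech.AudioStreamFormat.getWaveFormatPCM(16_000, 16, 1);
      //   const pushStream = speech.AudioInputStream.createPushStream(format);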
      const onAudioProcess = (ev: AudioProcessingEvent) => {
        const block = {
          duration: ev.inputBuffer.duration,
          bytes: convertFloat32ToInt16(ev.inputBuffer.getChannelData(0)),
        };
        // While muted, we simply stop feeding audio to the recognizer.
        if (!mutedRef.current) {
          pushStream.write(block.bytes);
        }
      };
      processor.addEventListener("audioprocess", onAudioProcess);
      // AudioContexts may start suspended under browser autoplay policies,
      // so resume explicitly.
      context.resume();
      // `fromAuthorizationToken` expects a short-lived token issued by the
      // Speech service (see the sketch near the top), not the subscription key.
      const speechConfig = speech.SpeechConfig.fromAuthorizationToken(azureCredentials.token, azureCredentials.region);
      const audioConfig = speech.AudioConfig.fromStreamInput(pushStream);
      speechConfig.speechRecognitionLanguage = "en-US";
      const recognizer = new speech.SpeechRecognizer(speechConfig, audioConfig);
      // `recognizing` fires with partial (interim) hypotheses while the user
      // is still speaking; `recognized` fires once per utterance with the
      // final text.
      recognizer.recognizing = (s, e) => {
        if (!running.current) return;
        console.log("RECOGNIZING", `Text= ${e.result.text}`, e);
        newMessage({ isFinal: false, text: e.result.text });
      };
      recognizer.recognized = (s, e) => {
        if (!running.current) return;
        if (e.result.reason === speech.ResultReason.RecognizedSpeech) {
          console.log("RECOGNIZED", `Text= ${e.result.text}`, e);
          newMessage({ isFinal: true, text: e.result.text });
        } else if (e.result.reason === speech.ResultReason.NoMatch) {
          console.log("NO MATCH", e);
        }
      };
      recognizer.canceled = (s, e) => {
        console.log("CANCELED", "Error=", e);
        console.log("CANCELED", `Reason=${e.reason}`);
        if (e.reason === speech.CancellationReason.Error) {
          console.log("CANCELED", `ErrorCode=${e.errorCode}`);
          console.log("CANCELED", `ErrorDetails=${e.errorDetails}`);
        }
        recognizer.stopContinuousRecognitionAsync();
        running.current = false;
      };
      recognizer.sessionStopped = (s, e) => {
        console.log("STOPPED", e);
        if (!running.current) {
          recognizer.stopContinuousRecognitionAsync();
          running.current = false;
        }
      };
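      // Tokens issued by the Speech service expire (after ~10 minutes). For
      // long-lived sessions you can refresh one in place via the
      // `recognizer.authorizationToken` setter instead of tearing down and
      // recreating the recognizer.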
console.log("############## start()");
recognizer.startContinuousRecognitionAsync();
running.current = true;
return () => {
console.log("############## stop()");
recognizer.stopContinuousRecognitionAsync();
running.current = false;
processor.removeEventListener("audioprocess", onAudioProcess);
processor.disconnect(output);
input.disconnect(processor);
context.close();
};
}
}, [speechToTextEnabled, newMessage, stream]);
}
/**
 * Converts a Float32Array buffer to 16-bit PCM, decimating 3:1 in the process
 * (48,000 Hz capture -> 16,000 Hz output). Necessary for streaming, since the
 * SDK's push stream expects 16 kHz, 16-bit, mono PCM by default.
 *
 * @param {Float32Array} float32Array Buffer being converted
 */
function convertFloat32ToInt16(float32Array: Float32Array) {
  let l = float32Array.length;
  // Keep every 3rd sample. Math.ceil avoids writing past the end of the
  // buffer when the input length isn't a multiple of 3. (Plain decimation
  // without a low-pass filter can alias, but is acceptable for speech here.)
  const buf = new Int16Array(Math.ceil(l / 3));
  while (l--) {
    if (l % 3 === 0) {
      // Clamp to [-1, 1] and scale to the signed 16-bit range to avoid
      // overflowing int16.
      const sample = Math.max(-1, Math.min(1, float32Array[l]));
      buf[l / 3] = sample < 0 ? sample * 0x8000 : sample * 0x7fff;
    }
  }
  return buf.buffer;
}
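// Example usage (a minimal sketch; the `Captions` component and its state
// handling are hypothetical, not part of the original gist):
//
//   function Captions({ muted }: { muted: boolean }) {
//     const [lines, setLines] = React.useState<string[]>([]);
//
//     // Memoized so the recognizer effect isn't rebuilt on every render.
//     const newMessage = React.useCallback((m: { text: string; isFinal: boolean }) => {
//       if (m.isFinal) setLines(prev => [...prev, m.text]);
//     }, []);
//
//     useSpeechToText(true, muted, newMessage);
//
//     return <ul>{lines.map((l, i) => <li key={i}>{l}</li>)}</ul>;
//   }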