import React, { useEffect, useState, useRef } from "react";
import {
  SpeechRecognitionResult,
  createSpeechRecognition,
  createMediaRecorder,
  SpeechRecord
} from "./utils";

// Module-level handles so the stop button can reach the active instances.
let currentSpeechRecognition: any = null;
let currentMediaRecorder: any = null;

type Props = {
  onBlobUpdated: (blob: Blob, records: SpeechRecord[]) => void;
};

// Kept outside React state so results accumulate across re-renders.
let records: SpeechRecord[] = [];

export function SpeechRecognizer(props: Props) {
  const textareaRef = useRef<HTMLTextAreaElement>(null);
  const [isRecording, setRecording] = useState(false);
  const [step, setStep] = useState(0);
  const [startAt, setStartAt] = useState(0);
  const [withTimestamp, setWithTimestamp] = useState(true);
  // const [records, setRecords] = useState<SpeechRecord[]>([]);
  const [currentResults, setCurrentResults] = useState<
    SpeechRecognitionResult[]
  >([]);

  const onRecordStart = async () => {
    const mediaRecorder = await createMediaRecorder({
      onData(chunks) {
        const size = chunks.map(c => c.size).reduce((sum, i) => sum + i, 0);
        console.log(`size: ${Math.floor(size / 1000)}kb`);
      },
      onRecordEnd(blob) {
        props.onBlobUpdated(blob, records);
      }
    });
    currentMediaRecorder = mediaRecorder;
    mediaRecorder.start();
    records = [];
    setStartAt(Date.now());
    setRecording(true);
  };

  const onRecordEnd = async () => {
    if (currentSpeechRecognition) {
      currentSpeechRecognition.stop();
    }
    if (currentMediaRecorder) {
      currentMediaRecorder.stop();
    }
    setRecording(false);
    setStep(0);
  };

  // A recognition session ends after each utterance, so bumping `step`
  // in onEnd re-runs this effect and starts a fresh session.
  useEffect(
    () => {
      if (!isRecording) {
        return;
      }
      let recognition = createSpeechRecognition({
        lang: "ja-JP",
        onResult(results) {
          setCurrentResults(results);
        },
        onEnd(results, range) {
          if (results.length > 0) {
            records.push({
              start: range.start - startAt,
              end: range.end - startAt,
              results: results.map(r => {
                // keep only plain fields so the record survives JSON.stringify
                return {
                  transcript: r.transcript,
                  confidence: r.confidence
                };
              })
            });
          }
          setCurrentResults([]);
          setStep(s => s + 1);
        }
      });
      currentSpeechRecognition = recognition;
      console.log("start new recognition: step", step);
      recognition.start();
      return () => {
        // @ts-ignore
        recognition = null;
      };
    },
    [isRecording, step]
  );

  const speechText = recordsToString(records, 0, withTimestamp);

  // keep the textarea scrolled to the newest transcript
  useEffect(
    () => {
      if (textareaRef.current) {
        textareaRef.current.scrollTop = textareaRef.current.scrollHeight;
      }
    },
    [speechText]
  );

  return (
    <>
      {isRecording ? (
        <>
          <button onClick={onRecordEnd}>recording end</button>
          Recording...
        </>
      ) : (
        <button onClick={onRecordStart}>recording start</button>
      )}
      <hr />
      <h3>Output</h3>
      {currentResults.length > 0 && <div>Input...</div>}
      {currentResults.map((r, index) => {
        return (
          <div key={index}>
            {Math.floor(r.confidence * 10000) / 100}%: {r.transcript}
          </div>
        );
      })}
      <textarea
        ref={textareaRef}
        placeholder="Press start to record..."
        readOnly={isRecording}
        style={{ width: "50vw", height: "30vh" }}
        value={speechText}
        onChange={() => {}}
      />
      <div>
        {withTimestamp ? (
          <button
            onClick={() => {
              setWithTimestamp(false);
            }}
          >
            withTimestamp: on
          </button>
        ) : (
          <button
            onClick={() => {
              setWithTimestamp(true);
            }}
          >
            withTimestamp: off
          </button>
        )}
      </div>
    </>
  );
}

function recordsToString(
  records: SpeechRecord[],
  startAt: number,
  withTimestamp: boolean
) {
  return records
    .map(r => {
      const s = withTimestamp
        ? `${Math.floor((r.start - startAt) / 1000)}s: `
        : "";
      const t = r.results.map(i => i.transcript).join(". ");
      return `${s}${t}`;
    })
    .join("\n");
}
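
A minimal usage sketch (not part of the original gist; the "./SpeechRecognizer" import path and the App component are assumptions): mount the component and turn the webm blob it reports into a playable object URL.

// usage-sketch.tsx (hypothetical)
import React, { useState } from "react";
import { SpeechRecognizer } from "./SpeechRecognizer"; // assumed path
import { SpeechRecord } from "./utils";

export function App() {
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
  return (
    <>
      <SpeechRecognizer
        onBlobUpdated={(blob: Blob, records: SpeechRecord[]) => {
          // expose the recorded audio for playback; records hold the
          // per-utterance transcripts with millisecond offsets
          setAudioUrl(URL.createObjectURL(blob));
          console.log(JSON.stringify(records, null, 2));
        }}
      />
      {audioUrl && <audio controls src={audioUrl} />}
    </>
  );
}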
// utils.ts

// Speech
export type SpeechRecord = {
  results: SpeechRecognitionResult[];
  start: number;
  end: number;
};

export type SpeechRecognitionResult = {
  transcript: string;
  confidence: number;
};

// Minimal typing for the browser's (webkit-prefixed) speech recognition class.
declare class BrowserSpeechRecognition {
  lang: string;
  continuous: boolean;
  interimResults: boolean;
  stop(): void;
  abort(): void;
  onresult: (event: {
    results: Array<Array<SpeechRecognitionResult>>;
  }) => void;
  onend: (event: any) => void;
  onaudiostart: (event: any) => void;
  onaudioend: (event: any) => void;
  onspeechstart: (event: any) => void;
  onspeechend: (event: any) => void;
  onnomatch: (event: any) => void;
  onerror: (event: any) => void;
  start: () => void;
}

export const SpeechRecognition: typeof BrowserSpeechRecognition =
  (global as any).webkitSpeechRecognition || (global as any).SpeechRecognition;

// let currentRecognition: BrowserSpeechRecognition | null = null;

export function createSpeechRecognition(opts: {
  lang?: string;
  onResult: (results: SpeechRecognitionResult[]) => void;
  onEnd: (
    results: SpeechRecognitionResult[],
    range: { start: number; end: number }
  ) => void;
}) {
  const recognition = new SpeechRecognition();
  recognition.lang = opts.lang || "ja-JP";
  recognition.interimResults = true;

  let currentInputResults: SpeechRecognitionResult[] = [];
  recognition.onresult = event => {
    console.log("SpeechRecognition: onresult", event);
    // flatten the nested result list into a single array of alternatives
    const r: SpeechRecognitionResult[] = [];
    Array.from(event.results).forEach(xr => {
      r.push(...Array.from(xr));
    });
    currentInputResults = r;
    opts.onResult(r);
  };

  let defaultStart = Date.now();
  let start: number | null = null;
  let end: number | null = null;
  recognition.onspeechstart = (_event: any) => {
    // set only once, at the first detected speech
    if (start == null) {
      start = Date.now();
    }
    console.log("SpeechRecognition: onspeechstart");
  };
  recognition.onspeechend = (_event: any) => {
    end = Date.now();
    console.log("SpeechRecognition: onspeechend");
  };
  recognition.onnomatch = (_event: any) => {
    console.log("SpeechRecognition: nomatch");
  };
  recognition.onend = (_event: any) => {
    console.log("SpeechRecognition: end");
    opts.onEnd(currentInputResults, {
      start: start || defaultStart,
      end: end || defaultStart
    });
  };
  recognition.onerror = (_event: any) => {
    console.log("SpeechRecognition: onerror");
  };
  return recognition;
}

// media recorder
declare var MediaRecorder: any;

export async function createMediaRecorder(listeners: {
  onRecordEnd: (blob: Blob) => void;
  onData: (chunks: Blob[]) => void;
}): Promise<{ start: Function; stop: Function }> {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: true,
    video: false
  });
  const mimeType = "audio/webm";
  const recorder = new MediaRecorder(stream, {
    audioBitsPerSecond: 128000, // 128kbps
    mimeType
  });
  const chunks: Array<Blob> = [];
  recorder.addEventListener("dataavailable", (ev: { data: Blob }) => {
    chunks.push(ev.data);
    listeners.onData(chunks);
  });
  recorder.addEventListener("stop", async () => {
    const blob = new Blob(chunks, { type: mimeType });
    listeners.onRecordEnd(blob);
  });
  return recorder;
}
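
The two helpers can also be driven without React; a sketch of that (the five-second timeout, the en-US locale, and the function name are illustrative assumptions, not from the gist):

// standalone-sketch.ts (hypothetical)
import { createSpeechRecognition, createMediaRecorder } from "./utils";

export async function recordFiveSeconds() {
  const recorder = await createMediaRecorder({
    onData: chunks => console.log(`chunks buffered: ${chunks.length}`),
    onRecordEnd: blob => console.log(`recorded ${blob.size} bytes of audio/webm`)
  });
  const recognition = createSpeechRecognition({
    lang: "en-US",
    onResult: results => console.log("interim:", results),
    onEnd: (results, range) =>
      // without the React restart loop, onEnd fires once per session
      console.log(`utterance (${range.end - range.start}ms):`, results)
  });
  recorder.start();
  recognition.start();
  setTimeout(() => {
    recognition.stop();
    recorder.stop();
  }, 5000);
}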