// SpeechRecognizer.tsx
// Gist by @hiroshi-manabe (March 3, 2019), forked from mizchi/SpeechRecognizer.tsx
import React, { useEffect, useState, useRef } from "react";
import {
  SpeechRecognitionResult,
  createSpeechRecognition,
  createMediaRecorder,
  SpeechRecord
} from "./utils";
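
// Module-level mutable state: the active recognition/recorder handles and the
// accumulated records live outside React state so that long-lived callbacks
// always see the latest values (note the commented-out useState inside the
// component below).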
let currentSpeechRecognition: any | null = null;
let currentMediaRecorder: any | null = null;

type Props = {
  onBlobUpdated: (blob: Blob, records: SpeechRecord[]) => void;
};

let records: SpeechRecord[] = [];
export function SpeechRecognizer(props: Props) {
  const textareaRef = useRef<HTMLTextAreaElement>(null);
  const [isRecording, setRecording] = useState(false);
  const [step, setStep] = useState(0);
  const [startAt, setStartAt] = useState(0);
  const [withTimestamp, setWithTimestamp] = useState(true);
  // const [records, setRecords] = useState<SpeechRecord[]>([]);
  const [currentResults, setCurrentResults] = useState<
    SpeechRecognitionResult[]
  >([]);
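
  // Start capturing: create a MediaRecorder for the microphone track, reset
  // the transcript records, and flip the recording flag, which arms the
  // recognition effect below.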
  const onRecordStart = async () => {
    const mediaRecorder = await createMediaRecorder({
      onData(chunks) {
        // log the accumulated audio size as chunks arrive
        const size = chunks.map(c => c.size).reduce((sum, i) => sum + i, 0);
        console.log(`size: ${Math.floor(size / 1000)}kb`);
      },
      onRecordEnd(blob) {
        props.onBlobUpdated(blob, records);
      }
    });
    currentMediaRecorder = mediaRecorder;
    mediaRecorder.start();
    records = [];
    setStartAt(Date.now());
    setRecording(true);
  };
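
  // Stop both the recognition session and the recorder. Stopping the
  // MediaRecorder fires its "stop" listener, which assembles the final blob
  // and invokes props.onBlobUpdated.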
  const onRecordEnd = async () => {
    if (currentSpeechRecognition) {
      currentSpeechRecognition.stop();
    }
    if (currentMediaRecorder) {
      currentMediaRecorder.stop();
    }
    setRecording(false);
    setStep(0);
  };
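
  // The Web Speech API ends a (non-continuous) recognition session after a
  // pause in speech, so each onEnd bumps `step`, which re-runs this effect
  // and starts a fresh session until recording is switched off.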
  useEffect(
    () => {
      if (!isRecording) {
        return;
      }
      let recognition = createSpeechRecognition({
        lang: "ja-JP",
        onResult(results) {
          setCurrentResults(results);
        },
        onEnd(results, range) {
          if (results.length > 0) {
            records.push({
              start: range.start - startAt,
              end: range.end - startAt,
              results: results.map(r => {
                // keep only plain serializable fields for JSON.stringify
                return {
                  transcript: r.transcript,
                  confidence: r.confidence
                };
              })
            });
          }
          setCurrentResults([]);
          setStep(s => s + 1);
        }
      });
      currentSpeechRecognition = recognition;
      console.log("start new recognition: step", step);
      recognition.start();
      return () => {
        // drop the reference so a stale session cannot be reused
        // @ts-ignore
        recognition = null;
      };
    },
    [isRecording, step]
  );
  const speechText = recordsToString(records, 0, withTimestamp);

  // keep the textarea scrolled to the newest transcript line
  useEffect(
    () => {
      if (textareaRef.current) {
        textareaRef.current.scrollTop = textareaRef.current.scrollHeight;
      }
    },
    [speechText]
  );
  return (
    <>
      {isRecording ? (
        <>
          <button onClick={onRecordEnd}>recording end</button>
          &nbsp; Recording...
        </>
      ) : (
        <button onClick={onRecordStart}>recording start</button>
      )}
      <hr />
      <h3>Output</h3>
      {currentResults.length > 0 && <div>Input...</div>}
      {currentResults.map((r, index) => {
        return (
          <div key={index}>
            {Math.floor(r.confidence * 10000) / 100}%: {r.transcript}:
          </div>
        );
      })}
      <textarea
        ref={textareaRef}
        placeholder="Press start to record..."
        readOnly={isRecording}
        style={{ width: "50vw", height: "30vh" }}
        value={speechText}
        onChange={() => {}}
      />
      <div>
        {withTimestamp ? (
          <button
            onClick={() => {
              setWithTimestamp(false);
            }}
          >
            withTimestamp: on
          </button>
        ) : (
          <button
            onClick={() => {
              setWithTimestamp(true);
            }}
          >
            withTimestamp: off
          </button>
        )}
      </div>
    </>
  );
}
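
// Render the accumulated records as plain text, one line per utterance,
// optionally prefixed with the start offset in whole seconds ("12s: ...").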
function recordsToString(
  records: SpeechRecord[],
  startAt: number,
  withTimestamp: boolean
) {
  return records
    .map(r => {
      const s = withTimestamp
        ? `${Math.floor((r.start - startAt) / 1000)}s: `
        : "";
      const t = r.results.map(i => i.transcript).join(". ");
      return `${s}${t}`;
    })
    .join("\n");
}
// utils.ts

// Speech
export type SpeechRecord = {
  results: SpeechRecognitionResult[];
  start: number;
  end: number;
};

export type SpeechRecognitionResult = {
  transcript: string;
  confidence: number;
};
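
// Minimal ambient typing for the browser's SpeechRecognition constructor
// (webkit-prefixed in Chrome); TypeScript's bundled DOM types do not declare it.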
declare class SpeechRecognitionClass {
  lang: string;
  continuous: boolean;
  interimResults: boolean;
  stop(): void;
  abort(): void;
  onresult: (
    event: {
      results: Array<Array<SpeechRecognitionResult>>;
    }
  ) => void;
  onend: (event: any) => void;
  onaudiostart: (event: any) => void;
  onaudioend: (event: any) => void;
  onspeechstart: (event: any) => void;
  onspeechend: (event: any) => void;
  onnomatch: (event: any) => void;
  onerror: (event: any) => void;
  start: () => void;
}

export const SpeechRecognition: typeof SpeechRecognitionClass =
  (global as any).webkitSpeechRecognition || (global as any).SpeechRecognition;

// let currentRecognition: SpeechRecognitionClass | null = null;
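
// Wrap a single recognition session: forward interim results through
// onResult, track speech start/end timestamps, and hand the final results
// plus their time range to onEnd when the session closes.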
export function createSpeechRecognition(opts: {
  lang?: string;
  onResult: (results: SpeechRecognitionResult[]) => void;
  onEnd: (
    results: SpeechRecognitionResult[],
    range: { start: number; end: number }
  ) => void;
}) {
  const recognition = new SpeechRecognition();
  recognition.lang = opts.lang || "ja-JP";
  recognition.interimResults = true;

  let currentInputResults: SpeechRecognitionResult[] = [];
  recognition.onresult = event => {
    console.log("SpeechRecognition: onresult", event);
    // flatten the nested result lists into one array of alternatives
    const r: SpeechRecognitionResult[] = [];
    Array.from(event.results).forEach(xr => {
      r.push(...Array.from(xr));
    });
    currentInputResults = r;
    opts.onResult(r);
  };

  let defaultStart = Date.now();
  let start: number | null = null;
  let end: number | null = null;
  recognition.onspeechstart = (_event: any) => {
    // record only the first speech start of this session
    if (start == null) {
      start = Date.now();
    }
    console.log("SpeechRecognition: onspeechstart");
  };
  recognition.onspeechend = (_event: any) => {
    end = Date.now();
    console.log("SpeechRecognition: onspeechend");
  };
  recognition.onnomatch = (_event: any) => {
    console.log("SpeechRecognition: nomatch");
  };
  recognition.onend = (event: any) => {
    console.log("SpeechRecognition: end");
    opts.onEnd(currentInputResults, {
      start: start || defaultStart,
      end: end || defaultStart
    });
  };
  recognition.onerror = (_event: any) => {
    console.log("SpeechRecognition: onerror");
  };
  return recognition;
}
// Media recorder
// MediaRecorder is typed as `any` here; TypeScript's bundled DOM types did
// not cover it when this was written.
declare var MediaRecorder: any;
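
// Ask for microphone access and return a MediaRecorder that captures
// 128kbps webm audio; onData fires as chunks accumulate, onRecordEnd fires
// with the assembled blob once recording stops.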
export async function createMediaRecorder(listeners: {
  onRecordEnd: (blob: Blob) => void;
  onData: (chunks: Blob[]) => void;
}): Promise<{ start: Function; stop: Function }> {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: true,
    video: false
  });
  const codec = "audio/webm";
  const recorder = new MediaRecorder(stream, {
    audioBitsPerSecond: 128000, // 128kbps
    mimeType: codec
  });
  const chunks: Array<Blob> = [];
  recorder.addEventListener("dataavailable", (ev: { data: Blob }) => {
    chunks.push(ev.data);
    listeners.onData(chunks);
  });
  recorder.addEventListener("stop", async () => {
    const blob = new Blob(chunks, { type: codec });
    listeners.onRecordEnd(blob);
  });
  return recorder;
}
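
// index.tsx — usage sketch, not part of the original gist. The mounting code
// and the object-URL handling are assumptions for illustration; only
// SpeechRecognizer and its onBlobUpdated contract come from the files above.
import React from "react";
import ReactDOM from "react-dom";
import { SpeechRecognizer } from "./SpeechRecognizer";

function App() {
  return (
    <SpeechRecognizer
      onBlobUpdated={(blob, records) => {
        // e.g. expose the recorded webm audio for playback or download
        const url = URL.createObjectURL(blob);
        console.log("audio:", url, "records:", records);
      }}
    />
  );
}

ReactDOM.render(<App />, document.getElementById("root"));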