@kidGodzilla
Created February 25, 2021
Infer timestamps in real time from non-timestamped transcription progress updates
function currentTs() {
    // Elapsed ms since recording started; evaluates to 0 if _startTime is unset (NaN || 0)
    return (+ new Date()) - window._startTime || 0;
}
function cutTranscription() {
    clearTimeout(window._silenceTimr);
    window._cutoffTimr = null;

    // Record where the transcript currently ends, and when the cut happened
    // (`piece` is only used by the commented-out array approach)
    let piece = (_recognizedSpeech[0] || '').substring(_transcriptionLength);
    _transcriptionLength = (_recognizedSpeech[0] || '').length;
    _transcriptionPoints.push(_transcriptionLength);
    _transcriptionTimes.push(currentTs());
    // _transcriptionArray.push(piece);
}
function timestampedRecording() {
    // window._transcriptionArray = [];
    window._startTime = + new Date();
    window._transcriptionTimes = [];
    window._transcriptionPoints = [];
    window._transcriptionLength = 0;
    clearTimeout(window._cutoffTimr);
    window._silenceTimr = null;
    window._cutoffTimr = null;

    window.plugins.speechRecognition.startListening(function (data) {
        console.log(data);
        window._recognizedSpeech = data;

        // Cut transcription if we pause for more than 2s
        clearTimeout(window._silenceTimr);
        window._silenceTimr = setTimeout(cutTranscription, 2000);

        // Cut transcription after max 4 seconds, even without a pause
        if (!window._cutoffTimr) window._cutoffTimr = setTimeout(cutTranscription, 4000);
    }, console.warn, { language: 'en-US', showPartial: true });
}
function stopTimestampedRecording() {
    window.plugins.speechRecognition.stopListening();
}
function generateFromPoints() {
    // Slice the final transcript at each recorded cut point
    let outs = [], lastPoint = 0;
    _transcriptionPoints.forEach(point => {
        outs.push((_recognizedSpeech[0] || '').substring(lastPoint, point));
        lastPoint = point;
    });
    return outs;
}
function printTimestamped() {
    let a = generateFromPoints();
    let times = JSON.parse(JSON.stringify(_transcriptionTimes));
    times.unshift(0); // label each segment with the time it started, not ended
    times.forEach(time => {
        let s = a.shift();
        if (s) console.log(`${ Math.round(time / 1000) }: ${ s }`);
    });
}
// function generateFromLuckyPoints() {
//     let outs = [], lastPoint = 0;
//     window._luckyTimes = [0];
//     _transcriptionPoints.forEach(point => {
//         if (_recognizedSpeech[0].substring(point, point + 1) == ' ') {
//             outs.push(_recognizedSpeech[0].substring(lastPoint, point));
//             lastPoint = point;
//         }
//     });
//     return outs;
// }
// function printLuckyTimestamps() {
//     let a = generateFromLuckyPoints();
//     let times = _luckyTimes;
//     times.forEach(time => {
//         let s = a.shift();
//         if (s) console.log(`${ Math.round(time / 1000) }: ${ s }`);
//     });
// }
function generateFromNaturalPoints() {
    let outs = [], lastPoint = 0;
    window._naturalTimes = [0];
    let tpi = 0;

    // Walk through the transcription once
    for (let i = 0; i < (_recognizedSpeech[0] || '').length; i++) {
        let cTp = _transcriptionPoints[tpi];
        let cChar = (_recognizedSpeech[0] || '')[i];
        // console.log(i, cTp, cChar);

        // Cut if we're inside a small window around the recorded point and we hit a space,
        // so segments break on word boundaries even after the text has been edited
        if (i > cTp - 6 && i < cTp + 8 && cChar == ' ') {
            outs.push((_recognizedSpeech[0] || '').substring(lastPoint, i));
            window._naturalTimes.push(_transcriptionTimes[tpi]);
            lastPoint = i;
            tpi++;
        }
    }

    // One more segment for whatever text remains after the last cut
    let s = (_recognizedSpeech[0] || '').substring(lastPoint);
    if (s) {
        outs.push(s);
        // Fall back to the last recorded time if we've run past the end of the array
        window._naturalTimes.push(_transcriptionTimes[tpi] || _transcriptionTimes[_transcriptionTimes.length - 1]);
    }
    return outs;
}
function printNaturalTimestamps() {
    let a = generateFromNaturalPoints();
    let times = _naturalTimes;
    times.forEach(time => {
        let s = a.shift();
        if (s) console.log(`${ (time / 1000).toFixed(2) }: ${ s }`);
    });
}
function timestampMap() {
    // Serialize the cut points and their times, e.g. "66,95,161 5230,9279,13391"
    return _transcriptionPoints.join(',') + ' ' + _transcriptionTimes.join(',');
}
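
A minimal usage sketch, assuming the Cordova speech-recognition plugin referenced above is installed and microphone permissions have already been granted:

// Start listening; cut points accumulate as partial results stream in
timestampedRecording();

// ... speak for a while ...

stopTimestampedRecording();
printNaturalTimestamps();    // e.g. "5.23: So here's my indie maker update today, ..."
console.log(timestampMap()); // e.g. "66,95,161 5230,9279,13391"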
@kidGodzilla (Author)
You start with a big string of text:

(This is what the final output from iOS looks like after a couple of minutes of recording, once the text has been passed through a neural net to add punctuation.)

So here's my indie maker update today, I'm not doing anything really interesting, I'm just doing something for fun. I am doing a transcription point map for the transcription that comes from the iOS app when you do a new recording. Basically iOS will give you updates in real time for the completed transcription. It doesn't give you timestamps. So it's a somewhat interesting problem to create that mapping of, you know, just break that completed paragraph of text for the transcription up into smaller segments and timestamp them. That’s all. 

And while I was recording, the above code made a map of positions and timestamps:

66,95,161,198,236,291,347,389,435,471 5230,9279,13391,17590,21621,26059,30061,34399,39121,44428
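
That serialized map is easy to turn back into arrays later. A minimal sketch (parseTimestampMap is a hypothetical helper, not part of the gist above):

// Hypothetical helper: invert timestampMap() back into its two arrays
function parseTimestampMap(map) {
    let [points, times] = map.split(' ');
    return {
        points: points.split(',').map(Number), // character offsets into the transcript
        times: times.split(',').map(Number)    // elapsed ms at each cut
    };
}

parseTimestampMap('66,95,161 5230,9279,13391');
// => { points: [66, 95, 161], times: [5230, 9279, 13391] }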

So you can end up with something like this, produced on the fly, even after a user has tweaked or modified their transcription to correct mistakes:

[Screenshot: the transcript broken into timestamped segments]

Basically, since the output is updated in real time, the naive approach is to keep measuring the character length of the string as it grows and remember those positions. A little cleanup later, and you have natural breakpoints you can use to continuously break the transcribed text into timestamped segments, even after it has been modified.
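
To make that concrete, here is a self-contained sketch of the naive approach, with the plugin's progress updates simulated as a growing string:

// Simulated progress updates: the transcript only ever grows
const updates = ['Hello', 'Hello world this', 'Hello world this is a test'];
let points = [], times = [];

updates.forEach((text, i) => {
    points.push(text.length);   // how long the transcript was at this update...
    times.push((i + 1) * 2000); // ...and (pretend) when the update arrived, in ms
});

// Rebuild timestamped segments from the final string plus the recorded points,
// labeling each segment with the time it started (as printTimestamped does)
const finalText = updates[updates.length - 1];
let last = 0, startTimes = [0, ...times];
points.forEach((p, i) => {
    console.log(`${ (startTimes[i] / 1000).toFixed(2) }: ${ finalText.substring(last, p) }`);
    last = p;
});
// 0.00: Hello
// 2.00:  world this
// 4.00:  is a test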
