Skip to content

Instantly share code, notes, and snippets.

@gsuberland
Last active October 12, 2022 05:50
Show Gist options
  • Save gsuberland/cfeab284f6504ece33870195d0fa6f57 to your computer and use it in GitHub Desktop.
Save gsuberland/cfeab284f6504ece33870195d0fa6f57 to your computer and use it in GitHub Desktop.
Automatic forced alignment transcription for creating timed SRT subtitles from a script document, using speech recognition.
/*
Works in LINQPad 5 (.NET Framework), in "C# Program" mode.
Requires a reference to System.Speech.dll and the System.Speech.Recognition namespace.
*/
// The script to align against. Each non-empty line becomes one grammar and one SRT entry.
string script = @"
This is a test of automatic forced alignment transcription.
When I read these words, the program will automatically follow along and output SRT subtitle entries.
The code also handles situations where two lines in the script are the same.
For example, I can repeat the first line of this script:
This is a test of automatic forced alignment transcription.
The console output should show that a duplicate line was corrected for.
";

// Culture name passed to the recognition engine (for pronunciation matching).
string cultureID = "en-GB";

// The recogniser; created in Main and also read by the recognition event handler.
SpeechRecognitionEngine engine;

// Maps a line index to the indices of the other lines that have the same text.
readonly Dictionary<int, List<int>> duplicates = new Dictionary<int, List<int>>();

// The script split into its non-empty lines; the array index doubles as the grammar name.
string[] lines;

// Index of the script line that is expected to be spoken next.
int currentLineNumber = 0;

// Sequence number of the next SRT entry (SRT entries are numbered from 1).
int srtLine = 1;

// Set by the recognition handler once the last line has been matched, and polled by Main.
// Marked volatile because it is written and read without any other synchronisation,
// so a plain field gives the polling loop no guarantee of ever seeing the update.
volatile bool done = false;

// Accumulates the SRT output as lines are recognised.
readonly StringBuilder srt = new StringBuilder();
/// <summary>
/// Entry point. Loads one grammar per script line into the recognition engine, assigns
/// priorities to lines whose text is duplicated, listens on the default audio device until
/// the recognition handler sets <c>done</c>, and then prints the accumulated SRT text.
/// </summary>
void Main()
{
    // split the script into individual lines, to be recognised one by one
    lines = script.Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
    if (lines.Length == 0)
    {
        // with no grammars loaded, indexing engine.Grammars[0] below would throw
        Console.WriteLine("The script contains no lines; nothing to recognise.");
        return;
    }

    // one culture object shared by the engine and every grammar (for pronunciation matching)
    var culture = new System.Globalization.CultureInfo(cultureID);

    using (engine = new SpeechRecognitionEngine(culture))
    {
        // build a grammar for each line, in script order
        for (int lineID = 0; lineID < lines.Length; lineID++)
        {
            // GrammarBuilder defaults to the thread's UI culture; set it explicitly so the
            // grammar always matches the engine regardless of the machine's language.
            var grammarBuilder = new GrammarBuilder(lines[lineID]);
            grammarBuilder.Culture = culture;

            // build the grammar and give it a name that matches the line index
            var grammar = new Grammar(grammarBuilder);
            grammar.Name = lineID.ToString(System.Globalization.CultureInfo.InvariantCulture);

            // default each grammar's weight to a low value. these weights are updated on the fly.
            grammar.Weight = 0.001f;

            // load the grammar into the recognition engine
            Console.WriteLine($"Loading grammar {lineID}");
            engine.LoadGrammar(grammar);
        }

        // prioritise the first line in the script
        engine.Grammars[0].Weight = 1.0f;

        // Group lines that share the same text (ignoring case and surrounding whitespace).
        // note: this isn't perfect, since it's a textual comparison not a semantic comparison
        var duplicateGroups = lines
            .Select((text, index) => new { Text = text.Trim(), Index = index })
            .GroupBy(line => line.Text, line => line.Index, StringComparer.OrdinalIgnoreCase)
            .Select(group => group.ToList())
            .Where(indices => indices.Count > 1)
            .ToList();

        foreach (var indices in duplicateGroups)
        {
            // store these duplicates into the duplicates dictionary so we can map between them later
            foreach (int lineID in indices)
            {
                duplicates[lineID] = indices.Where(i => i != lineID).ToList();
            }

            // the earliest occurrence gets the highest priority (Grammar.Priority max is 127)
            int firstLine = indices[0];
            int otherCount = indices.Count - 1;
            int priority = Math.Min(otherCount, 127);
            engine.Grammars[firstLine].Priority = priority;

            // each later occurrence gets a priority one lower than the one before it,
            // clamped to the lowest value Grammar.Priority accepts (-128)
            Console.WriteLine($"Line {firstLine} has {otherCount} duplicates:");
            foreach (int duplicateIndex in indices.Skip(1))
            {
                priority = Math.Max(priority - 1, -128);
                Console.WriteLine($"\tSet priority {priority} on duplicate {duplicateIndex}");
                engine.Grammars[duplicateIndex].Priority = priority;
            }
        }

        // set up event handlers
        engine.SpeechHypothesized += SpeechHypothesised;
        engine.SpeechRecognized += SpeechRecognised;

        // start recognising
        engine.SetInputToDefaultAudioDevice();
        engine.RecognizeAsync(RecognizeMode.Multiple);
        Console.Write("Listening... ");

        // Wait for the recognition handler to report completion. Poll with a short sleep
        // rather than spinning on Thread.Yield(), which keeps a CPU core busy the whole time.
        while (!done)
        {
            Thread.Sleep(50);
        }
    }

    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine("SRT result:");
    Console.WriteLine();
    Console.WriteLine(srt.ToString());
}
/// <summary>
/// Handles the engine's hypothesis event by echoing the most recent guessed word to the
/// console, so the user can see the recogniser following along.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="ea">Event data carrying the current hypothesis text.</param>
void SpeechHypothesised(object sender, SpeechHypothesizedEventArgs ea)
{
    // the hypothesis text holds every word guessed so far; only the last one is new
    string latestWord = ea.Result.Text.Split(' ').Last();
    Console.Write(latestWord + " ");

    // Flush synchronously: the Task returned by FlushAsync() was being discarded, which
    // leaves the flush unordered against later writes and hides any failure.
    Console.Out.Flush();
}
/// <summary>
/// Handles a fully recognised line: works out which script line was matched (correcting
/// for lines with duplicated text), re-weights the grammars so the following line is
/// favoured, appends an SRT entry for the line, and sets <c>done</c> after the last line.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="ea">Event data carrying the recognition result.</param>
void SpeechRecognised(object sender, SpeechRecognizedEventArgs ea)
{
    // we got a fully matched line!
    // newline after the listening results
    Console.WriteLine();

    // the line index is stored in the grammar name
    int recognisedLineNumber = int.Parse(ea.Result.Grammar.Name, System.Globalization.CultureInfo.InvariantCulture);

    // did we get a line other than the one we expected?
    if (recognisedLineNumber != currentLineNumber)
    {
        // check if we're detecting a duplicate of the expected line
        List<int> sameTextLines;
        if (duplicates.TryGetValue(recognisedLineNumber, out sameTextLines) && sameTextLines.Contains(currentLineNumber))
        {
            Console.WriteLine($"Correcting for duplicate: {recognisedLineNumber} -> {currentLineNumber}.");
            recognisedLineNumber = currentLineNumber;
        }
        else
        {
            // doesn't look like a duplicate. probably skipped a line somewhere.
            Console.WriteLine($"[WARNING] Line skip! Expected {currentLineNumber}, got {recognisedLineNumber}.");
        }
    }

    Console.WriteLine($"Recognised line {recognisedLineNumber}: {ea.Result.Text}");

    // update weights based on progress
    for (int n = 0; n < engine.Grammars.Count; n++)
    {
        if (n <= recognisedLineNumber)
        {
            engine.Grammars[n].Weight = 0.0f; // previous lines are effectively disabled
        }
        else if (n == recognisedLineNumber + 1)
        {
            engine.Grammars[n].Weight = 1.0f; // next line is prioritised
        }
        else
        {
            engine.Grammars[n].Weight = (float)Math.Pow(0.1, n - recognisedLineNumber); // exponential fall-off for future lines
        }
    }

    // Build the SRT entry. SRT cue times are offsets from the start of the media, so use
    // the position of the phrase within the input audio stream rather than Audio.StartTime,
    // which is a wall-clock DateTime and would stamp cues with the time of day.
    TimeSpan start = ea.Result.Audio.AudioPosition;
    TimeSpan end = start + ea.Result.Audio.Duration;
    string srtText = $"{srtLine}" + Environment.NewLine +
        FormatSrtTimestamp(start) + " --> " + FormatSrtTimestamp(end) + Environment.NewLine +
        $"{ea.Result.Text}" + Environment.NewLine + Environment.NewLine;
    Console.Write(srtText);

    // add this to the SRT file
    srt.Append(srtText);
    srtLine++;

    // are we at the end?
    if (recognisedLineNumber + 1 >= engine.Grammars.Count)
    {
        Console.WriteLine("COMPLETED!");
        done = true;
        return;
    }

    // next line
    currentLineNumber = recognisedLineNumber + 1;
    Console.WriteLine($"Current line updated to {currentLineNumber}, weight={engine.Grammars[currentLineNumber].Weight}: {lines[currentLineNumber]}");
    Console.Write("Listening... ");
}

/// <summary>
/// Formats an offset as an SRT timestamp, <c>HH:MM:SS,mmm</c>. The millisecond field is
/// three digits, and hours are taken from the total so offsets past 24 hours do not wrap.
/// </summary>
/// <param name="offset">Offset from the start of the audio.</param>
/// <returns>The offset formatted for use in an SRT cue line.</returns>
string FormatSrtTimestamp(TimeSpan offset)
{
    return $"{(int)offset.TotalHours:d2}:{offset.Minutes:d2}:{offset.Seconds:d2},{offset.Milliseconds:d3}";
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment