Last active
October 12, 2022 05:50
-
-
Save gsuberland/cfeab284f6504ece33870195d0fa6f57 to your computer and use it in GitHub Desktop.
Automatic forced alignment transcription for creating timed SRT subtitles from a script document, using speech recognition.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Works in LINQPad 5 (.NET Framework).
Requires System.Speech.dll to be loaded.
*/
// The script to align against. It is split on line breaks in Main and each
// non-empty line becomes one grammar (and, once recognised, one SRT entry).
string script = @"
This is a test of automatic forced alignment transcription.
When I read these words, the program will automatically follow along and output SRT subtitle entries.
The code also handles situations where two lines in the script are the same.
For example, I can repeat the first line of this script:
This is a test of automatic forced alignment transcription.
The console output should show that a duplicate line was corrected for.
";

// culture name handed to the recognition engine (for pronunciation matching)
string cultureID = "en-GB";

// the recognition engine; created in Main and read by the recognition handler
SpeechRecognitionEngine engine;

// maps a line index to the indices of the other lines that have the same text
Dictionary<int, List<int>> duplicates = new Dictionary<int, List<int>>();

// the individual script lines, filled in by Main
string[] lines;

// index of the script line we expect to hear next
int currentLineNumber = 0;

// sequence number of the next SRT entry (starts at 1)
int srtLine = 1;

// set by the recognition handler once the last line has been recognised, and polled by Main.
// volatile so that the polling loop is guaranteed to observe the write if the handler
// runs on a different thread from Main.
volatile bool done = false;

// accumulates the SRT output, which Main prints at the end
StringBuilder srt = new StringBuilder();
/// <summary>
/// Entry point. Loads one grammar per script line into a speech recognition engine, ranks
/// identical lines by grammar priority, listens on the default audio device until the
/// recognition handler reports completion, then prints the accumulated SRT text.
/// </summary>
void Main()
{
    // documented range of Grammar.Priority
    const int MaxGrammarPriority = 127;
    const int MinGrammarPriority = -128;
    // how often to check whether the recognition handler has flagged completion
    const int CompletionPollIntervalMs = 50;

    // split the script into individual lines, to be recognised one by one.
    // lines are trimmed and whitespace-only lines dropped, since a line without words cannot be spoken.
    lines = script
        .Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
        .Select(line => line.Trim())
        .Where(line => line.Length > 0)
        .ToArray();

    // nothing to align: bail out instead of failing on engine.Grammars[0] below
    if (lines.Length == 0)
    {
        Console.WriteLine("The script contains no lines; nothing to recognise.");
        return;
    }

    // create the engine and set the culture info (for pronunciation matching)
    using (engine = new SpeechRecognitionEngine(new System.Globalization.CultureInfo(cultureID)))
    {
        // build a grammar for each line, in script order
        for (int lineID = 0; lineID < lines.Length; lineID++)
        {
            // create a grammar builder for this line
            var grammarBuilder = new GrammarBuilder(lines[lineID]);

            // build the grammar and give it a name that matches the line index
            var grammar = new Grammar(grammarBuilder);
            grammar.Name = lineID.ToString();

            // default each grammar's weight to a low value. these weights are updated on the fly.
            grammar.Weight = 0.001f;

            // load the grammar into the recognition engine
            Console.WriteLine($"Loading grammar {lineID}");
            engine.LoadGrammar(grammar);
        }

        // prioritise the first line in the script
        engine.Grammars[0].Weight = 1.0f;

        // add priority information for lines that are equal.
        // group line indices by their text in a single pass; GroupBy keeps indices in ascending order.
        // note: this isn't perfect, since it's a textual comparison not a semantic comparison
        var duplicateGroups = lines
            .Select((text, index) => new { Text = text, Index = index })
            .GroupBy(line => line.Text, line => line.Index, StringComparer.OrdinalIgnoreCase)
            .Select(group => group.ToList())
            .Where(indices => indices.Count > 1);

        foreach (var indices in duplicateGroups)
        {
            // store, for every line in the group, the other lines it duplicates so we can map between them later
            foreach (int index in indices)
            {
                duplicates.Add(index, indices.Where(other => other != index).ToList());
            }

            // set the priority of the first line highest (max 127)
            int firstIndex = indices[0];
            int duplicateCount = indices.Count - 1;
            int priority = Math.Min(duplicateCount, MaxGrammarPriority);
            engine.Grammars[firstIndex].Priority = priority;

            // go through the rest of the duplicates and decrement their priorities,
            // never going below the lowest priority the engine accepts
            Console.WriteLine($"Line {firstIndex} has {duplicateCount} duplicates:");
            foreach (int duplicateIndex in indices.Skip(1))
            {
                priority = Math.Max(priority - 1, MinGrammarPriority);
                Console.WriteLine($"\tSet priority {priority} on duplicate {duplicateIndex}");
                engine.Grammars[duplicateIndex].Priority = priority;
            }
        }

        // set up event handlers
        engine.SpeechHypothesized += new EventHandler<SpeechHypothesizedEventArgs>(SpeechHypothesised);
        engine.SpeechRecognized += new EventHandler<SpeechRecognizedEventArgs>(SpeechRecognised);

        // start recognising
        engine.SetInputToDefaultAudioDevice();
        engine.RecognizeAsync(RecognizeMode.Multiple);
        Console.Write("Listening... ");

        // wait for the recognition handler to flag completion; sleep between checks
        // rather than spinning so the wait does not occupy a CPU core
        while (!done)
        {
            Thread.Sleep(CompletionPollIntervalMs);
        }
    }

    Console.WriteLine();
    Console.WriteLine();
    Console.WriteLine("SRT result:");
    Console.WriteLine();
    Console.WriteLine(srt.ToString());
}
/// <summary>
/// Handles interim recognition hypotheses by echoing the last word of the current
/// hypothesis text to the console.
/// </summary>
/// <param name="sender">The object raising the event (unused).</param>
/// <param name="ea">Event data carrying the hypothesised recognition result.</param>
void SpeechHypothesised(object sender, SpeechHypothesizedEventArgs ea)
{
    // print out words as the engine guesses them.
    // Split always returns at least one element, so indexing the last one is safe.
    string[] words = ea.Result.Text.Split(' ');
    Console.Write(words[words.Length - 1] + " ");

    // flush synchronously: a discarded FlushAsync task is never awaited, so any failure
    // would go unobserved and the flush is not guaranteed to finish before the next write
    Console.Out.Flush();
}
/// <summary>
/// Handles a fully recognised script line: resolves which script line was spoken (correcting
/// for lines with identical text), re-weights the grammars so the following line is favoured,
/// appends an SRT entry for the recognised audio, and either flags completion or advances
/// the expected line.
/// </summary>
/// <param name="sender">The object raising the event (unused).</param>
/// <param name="ea">Event data carrying the recognition result, its grammar and its audio.</param>
void SpeechRecognised(object sender, SpeechRecognizedEventArgs ea)
{
    // we got a fully matched line!
    // newline after the listening results
    Console.WriteLine();

    // the line index is stored in the grammar name
    int recognisedLineNumber = int.Parse(ea.Result.Grammar.Name);

    // did we get a line other than the one we expected?
    if (recognisedLineNumber != currentLineNumber)
    {
        // check if we're detecting a duplicate (single dictionary lookup)
        List<int> sameTextLines;
        if (duplicates.TryGetValue(recognisedLineNumber, out sameTextLines) && sameTextLines.Contains(currentLineNumber))
        {
            Console.WriteLine($"Correcting for duplicate: {recognisedLineNumber} -> {currentLineNumber}.");
            recognisedLineNumber = currentLineNumber;
        }
        else
        {
            // doesn't look like a duplicate. probably skipped a line somewhere.
            Console.WriteLine($"[WARNING] Line skip! Expected {currentLineNumber}, got {recognisedLineNumber}.");
        }
    }
    Console.WriteLine($"Recognised line {recognisedLineNumber}: {ea.Result.Text}");

    // update weights based on progress
    for (int n = 0; n < engine.Grammars.Count; n++)
    {
        if (n <= recognisedLineNumber)
        {
            engine.Grammars[n].Weight = 0.0f; // previous lines are effectively disabled
        }
        else if (n == recognisedLineNumber + 1)
        {
            engine.Grammars[n].Weight = 1.0f; // next line is prioritised
        }
        else
        {
            engine.Grammars[n].Weight = (float)Math.Pow(0.1, n - recognisedLineNumber); // exponential fall-off for future lines
        }
    }

    // print it like an SRT.
    // SRT cues are offsets from the start of the media, so use the position of the recognised
    // audio within the input stream (AudioPosition) rather than the wall-clock StartTime.
    var audio = ea.Result.Audio;
    TimeSpan cueStart = audio.AudioPosition;
    TimeSpan cueEnd = cueStart + audio.Duration;
    string srtText = $"{srtLine}" + Environment.NewLine +
        FormatSrtTimestamp(cueStart) + " --> " + FormatSrtTimestamp(cueEnd) + Environment.NewLine +
        $"{ea.Result.Text}" + Environment.NewLine + Environment.NewLine;
    Console.Write(srtText);

    // add this to the SRT file
    srt.Append(srtText);
    srtLine++;

    // are we at the end?
    if (recognisedLineNumber + 1 >= engine.Grammars.Count)
    {
        Console.WriteLine("COMPLETED!");
        done = true;
        return;
    }

    // next line
    currentLineNumber = recognisedLineNumber + 1;
    Console.WriteLine($"Current line updated to {currentLineNumber}, weight={engine.Grammars[currentLineNumber].Weight}: {lines[currentLineNumber]}");
    Console.Write("Listening... ");
}

/// <summary>
/// Formats a time offset as an SRT timestamp: HH:MM:SS,mmm (milliseconds are exactly three digits).
/// </summary>
/// <param name="offset">Offset from the start of the audio.</param>
/// <returns>The formatted timestamp; hours are the total whole hours, so they do not wrap at 24.</returns>
static string FormatSrtTimestamp(TimeSpan offset)
{
    return $"{(int)offset.TotalHours:d2}:{offset.Minutes:d2}:{offset.Seconds:d2},{offset.Milliseconds:d3}";
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment