gsuberland/forced_alignment_srt.cs

## forced_alignment_srt.cs
/*
Works in LINQPad 5 (.NET Framework).
Requires System.Speech.dll to be referenced, with the System.Speech.Recognition namespace imported.
*/

// The script to align against. Main splits it on line breaks and builds one grammar per line.
string script = @"
This is a test of automatic forced alignment transcription.
When I read these words, the program will automatically follow along and output SRT subtitle entries.
The code also handles situations where two lines in the script are the same.
For example, I can repeat the first line of this script:
This is a test of automatic forced alignment transcription.
The console output should show that a duplicate line was corrected for.
";

// culture name handed to the recognition engine (for pronunciation matching)
string cultureID = "en-GB";
// recognition engine; created in Main and read by the event handlers
SpeechRecognitionEngine engine;
// maps a line index to the indices of the other lines whose text matches it
Dictionary<int, List<int>> duplicates = new Dictionary<int, List<int>>();
// the individual script lines; a line's index doubles as its grammar name
string[] lines;
// index of the line we expect to hear next
int currentLineNumber = 0;
// sequence number of the next SRT entry (SRT entries are numbered from 1)
int srtLine = 1;
// set by SpeechRecognised once the last line has been matched, and polled by Main.
// volatile because the handler runs while Main is blocked in its wait loop, i.e. on another thread.
volatile bool done = false;

// accumulates the SRT output, one entry per recognised line
StringBuilder srt = new StringBuilder();

/// <summary>
/// Entry point. Splits the script into lines, loads one grammar per line into the
/// recognition engine, assigns priorities to textually identical lines, then listens
/// on the default audio device until SpeechRecognised sets <c>done</c>. Finally prints
/// the accumulated SRT text.
/// </summary>
void Main()
{
	// documented bounds of Grammar.Priority
	const int MaxPriority = 127;
	const int MinPriority = -128;

	// split the script into individual lines, to be recognised one by one.
	// whitespace-only lines are dropped too: they can never be spoken, so a grammar
	// built from one could never be matched and the run would never complete.
	lines = script.Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
		.Where(line => !string.IsNullOrWhiteSpace(line))
		.ToArray();

	// nothing to align against; bail out before engine.Grammars[0] is indexed below
	if (lines.Length == 0)
	{
		Console.WriteLine("Script contains no lines; nothing to recognise.");
		return;
	}

	// create the engine and set the culture info (for pronunciation matching)
	using (engine = new SpeechRecognitionEngine(new System.Globalization.CultureInfo(cultureID)))
	{
		// build a grammar for each line, in script order
		for (int lineID = 0; lineID < lines.Length; lineID++)
		{
			// the grammar's culture must match the recogniser's, otherwise LoadGrammar rejects it;
			// without this the builder falls back to the executing thread's culture
			var grammarBuilder = new GrammarBuilder(lines[lineID]);
			grammarBuilder.Culture = engine.RecognizerInfo.Culture;

			// build the grammar and give it a name that matches the line index
			var grammar = new Grammar(grammarBuilder);
			grammar.Name = lineID.ToString();
			// default each grammar's weight to a low value. these weights are updated on the fly.
			grammar.Weight = 0.001f;

			// load the grammar into the recognition engine
			Console.WriteLine($"Loading grammar {lineID}");
			engine.LoadGrammar(grammar);
		}

		// prioritise the first line in the script
		engine.Grammars[0].Weight = 1.0f;

		// add priority information for lines that are equal
		var processedDuplicates = new HashSet<int>();
		for (int lineID = 0; lineID < lines.Length; lineID++)
		{
			// normalise this line once instead of once per comparison
			string normalisedText = lines[lineID].Trim().ToLower();

			// figure out which lines, if any, are duplicates
			// note: this isn't perfect, since it's a textual comparison not a semantic comparison
			// materialised once because the result is counted, stored and iterated below;
			// indices already come out in ascending order, so no extra sort is needed
			List<int> duplicateIndices = lines
				.Select((item, index) => new { Text = item, Index = index })
				.Where(line => line.Index != lineID && line.Text.Trim().ToLower() == normalisedText)
				.Select(line => line.Index)
				.ToList();

			if (duplicateIndices.Count == 0)
			{
				continue;
			}

			// store these duplicates into the duplicates dictionary so we can map between them later
			duplicates.Add(lineID, duplicateIndices);

			// don't re-apply priorities to lines that we've already processed
			if (processedDuplicates.Contains(lineID))
			{
				continue;
			}

			// set the priority of the first line highest (max 127)
			int priority = Math.Min(duplicateIndices.Count, MaxPriority);
			engine.Grammars[lineID].Priority = priority;
			processedDuplicates.Add(lineID);

			// go through the rest of the duplicates and decrement their priorities
			Console.WriteLine($"Line {lineID} has {duplicateIndices.Count} duplicates:");
			foreach (int duplicateIndex in duplicateIndices)
			{
				// never go below the lowest priority the engine accepts
				priority = Math.Max(priority - 1, MinPriority);
				Console.WriteLine($"\tSet priority {priority} on duplicate {duplicateIndex}");
				engine.Grammars[duplicateIndex].Priority = priority;
				processedDuplicates.Add(duplicateIndex);
			}
		}

		// set up event handlers
		engine.SpeechHypothesized += new EventHandler<SpeechHypothesizedEventArgs>(SpeechHypothesised);
		engine.SpeechRecognized += new EventHandler<SpeechRecognizedEventArgs>(SpeechRecognised);

		// start recognising
		engine.SetInputToDefaultAudioDevice();
		engine.RecognizeAsync(RecognizeMode.Multiple);

		Console.Write("Listening...  ");

		// wait for SpeechRecognised to flag completion. sleeping between polls keeps
		// this thread from spinning a CPU core flat out while the speaker is reading.
		while (!done)
		{
			Thread.Sleep(50);
		}
	}
	Console.WriteLine();
	Console.WriteLine();
	Console.WriteLine("SRT result:");
	Console.WriteLine();
	Console.WriteLine(srt.ToString());
}

/// <summary>
/// Handler for the engine's SpeechHypothesized event: echoes the last word of the
/// current hypothesis to the console so progress is visible while a line is being read.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="ea">Event data carrying the hypothesised text so far.</param>
void SpeechHypothesised(object sender, SpeechHypothesizedEventArgs ea)
{
	// print out words as the engine guesses them
	Console.Write(ea.Result.Text.Split(' ').Last() + " ");
	// flush synchronously: the Task from FlushAsync() was being discarded, so it was
	// never awaited and any failure in it would have gone unobserved
	Console.Out.Flush();
}

/// <summary>
/// Handler for the engine's SpeechRecognized event. Maps the matched grammar back to a
/// script line (correcting for duplicate lines), re-weights all grammars so the following
/// line is favoured, appends an SRT entry for the recognised audio, and either flags
/// completion or advances <c>currentLineNumber</c>.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="ea">Event data carrying the recognition result, its grammar and audio timing.</param>
void SpeechRecognised(object sender, SpeechRecognizedEventArgs ea)
{
	// we got a fully matched line!

	// newline after the listening results
	Console.WriteLine();

	// the line index is stored in the grammar name
	int recognisedLineNumber = int.Parse(ea.Result.Grammar.Name);

	// did we get a line other than the one we expected?
	if (recognisedLineNumber != currentLineNumber)
	{
		// check if we're detecting a duplicate (single dictionary lookup)
		List<int> knownDuplicates;
		if (duplicates.TryGetValue(recognisedLineNumber, out knownDuplicates) && knownDuplicates.Contains(currentLineNumber))
		{
			Console.WriteLine($"Correcting for duplicate: {recognisedLineNumber} -> {currentLineNumber}.");
			recognisedLineNumber = currentLineNumber;
		}
		else
		{
			// doesn't look like a duplicate. probably skipped a line somewhere.
			Console.WriteLine($"[WARNING] Line skip! Expected {currentLineNumber}, got {recognisedLineNumber}.");
		}
	}

	Console.WriteLine($"Recognised line {recognisedLineNumber}: {ea.Result.Text}");

	// update weights based on progress
	for (int n = 0; n < engine.Grammars.Count; n++)
	{
		if (n <= recognisedLineNumber)
		{
			engine.Grammars[n].Weight = 0.0f; // previous lines are effectively disabled
		}
		else if (n == recognisedLineNumber + 1)
		{
			engine.Grammars[n].Weight = 1.0f; // next line is prioritised
		}
		else
		{
			engine.Grammars[n].Weight = (float)Math.Pow(0.1, n - recognisedLineNumber); // exponential fall-off for future lines
		}
	}

	// print it like an SRT
	// NOTE(review): StartTime exposes Hour/Minute/Second, so it looks like a clock time rather
	// than an offset from the start of the audio - confirm whether that is what is wanted here.
	var st = ea.Result.Audio.StartTime;
	var et = st + ea.Result.Audio.Duration;
	// SRT timestamps are HH:MM:SS,mmm - milliseconds take exactly three digits (was d4)
	string srtText = $"{srtLine}" + Environment.NewLine +
		$"{st.Hour:d2}:{st.Minute:d2}:{st.Second:d2},{st.Millisecond:d3}" +
		" --> " +
		$"{et.Hour:d2}:{et.Minute:d2}:{et.Second:d2},{et.Millisecond:d3}" + Environment.NewLine +
		$"{ea.Result.Text}" + Environment.NewLine + Environment.NewLine;
	Console.Write(srtText);
	// add this to the SRT file
	srt.Append(srtText);

	srtLine++;

	// are we at the end?
	if (recognisedLineNumber + 1 >= engine.Grammars.Count)
	{
		Console.WriteLine("COMPLETED!");
		done = true;
		return;
	}

	// next line
	currentLineNumber = recognisedLineNumber + 1;
	Console.WriteLine($"Current line updated to {currentLineNumber}, weight={engine.Grammars[currentLineNumber].Weight}: {lines[currentLineNumber]}");

	Console.Write("Listening...  ");
}
	/*
	Works in LINQPad 5 (.NET Framework).
	Requires System.Speech.dll to be referenced, with the System.Speech.Recognition namespace imported.
	*/

	// The script to align against. Main splits it on line breaks and builds one grammar per line.
	string script = @"
	This is a test of automatic forced alignment transcription.
	When I read these words, the program will automatically follow along and output SRT subtitle entries.
	The code also handles situations where two lines in the script are the same.
	For example, I can repeat the first line of this script:
	This is a test of automatic forced alignment transcription.
	The console output should show that a duplicate line was corrected for.
	";

	// culture name handed to the recognition engine (for pronunciation matching)
	string cultureID = "en-GB";
	// recognition engine; created in Main and read by the event handlers
	SpeechRecognitionEngine engine;
	// maps a line index to the indices of the other lines whose text matches it
	Dictionary<int, List<int>> duplicates = new Dictionary<int, List<int>>();
	// the individual script lines; a line's index doubles as its grammar name
	string[] lines;
	// index of the line we expect to hear next
	int currentLineNumber = 0;
	// sequence number of the next SRT entry (SRT entries are numbered from 1)
	int srtLine = 1;
	// set by SpeechRecognised once the last line has been matched, and polled by Main.
	// volatile because the handler runs while Main is blocked in its wait loop, i.e. on another thread.
	volatile bool done = false;

	// accumulates the SRT output, one entry per recognised line
	StringBuilder srt = new StringBuilder();

	/// <summary>
	/// Entry point. Splits the script into lines, loads one grammar per line into the
	/// recognition engine, assigns priorities to textually identical lines, then listens
	/// on the default audio device until SpeechRecognised sets <c>done</c>. Finally prints
	/// the accumulated SRT text.
	/// </summary>
	void Main()
	{
		// documented bounds of Grammar.Priority
		const int MaxPriority = 127;
		const int MinPriority = -128;

		// split the script into individual lines, to be recognised one by one.
		// whitespace-only lines are dropped too: they can never be spoken, so a grammar
		// built from one could never be matched and the run would never complete.
		lines = script.Split(new char[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
			.Where(line => !string.IsNullOrWhiteSpace(line))
			.ToArray();

		// nothing to align against; bail out before engine.Grammars[0] is indexed below
		if (lines.Length == 0)
		{
			Console.WriteLine("Script contains no lines; nothing to recognise.");
			return;
		}

		// create the engine and set the culture info (for pronunciation matching)
		using (engine = new SpeechRecognitionEngine(new System.Globalization.CultureInfo(cultureID)))
		{
			// build a grammar for each line, in script order
			for (int lineID = 0; lineID < lines.Length; lineID++)
			{
				// the grammar's culture must match the recogniser's, otherwise LoadGrammar rejects it;
				// without this the builder falls back to the executing thread's culture
				var grammarBuilder = new GrammarBuilder(lines[lineID]);
				grammarBuilder.Culture = engine.RecognizerInfo.Culture;

				// build the grammar and give it a name that matches the line index
				var grammar = new Grammar(grammarBuilder);
				grammar.Name = lineID.ToString();
				// default each grammar's weight to a low value. these weights are updated on the fly.
				grammar.Weight = 0.001f;

				// load the grammar into the recognition engine
				Console.WriteLine($"Loading grammar {lineID}");
				engine.LoadGrammar(grammar);
			}

			// prioritise the first line in the script
			engine.Grammars[0].Weight = 1.0f;

			// add priority information for lines that are equal
			var processedDuplicates = new HashSet<int>();
			for (int lineID = 0; lineID < lines.Length; lineID++)
			{
				// normalise this line once instead of once per comparison
				string normalisedText = lines[lineID].Trim().ToLower();

				// figure out which lines, if any, are duplicates
				// note: this isn't perfect, since it's a textual comparison not a semantic comparison
				// materialised once because the result is counted, stored and iterated below;
				// indices already come out in ascending order, so no extra sort is needed
				List<int> duplicateIndices = lines
					.Select((item, index) => new { Text = item, Index = index })
					.Where(line => line.Index != lineID && line.Text.Trim().ToLower() == normalisedText)
					.Select(line => line.Index)
					.ToList();

				if (duplicateIndices.Count == 0)
				{
					continue;
				}

				// store these duplicates into the duplicates dictionary so we can map between them later
				duplicates.Add(lineID, duplicateIndices);

				// don't re-apply priorities to lines that we've already processed
				if (processedDuplicates.Contains(lineID))
				{
					continue;
				}

				// set the priority of the first line highest (max 127)
				int priority = Math.Min(duplicateIndices.Count, MaxPriority);
				engine.Grammars[lineID].Priority = priority;
				processedDuplicates.Add(lineID);

				// go through the rest of the duplicates and decrement their priorities
				Console.WriteLine($"Line {lineID} has {duplicateIndices.Count} duplicates:");
				foreach (int duplicateIndex in duplicateIndices)
				{
					// never go below the lowest priority the engine accepts
					priority = Math.Max(priority - 1, MinPriority);
					Console.WriteLine($"\tSet priority {priority} on duplicate {duplicateIndex}");
					engine.Grammars[duplicateIndex].Priority = priority;
					processedDuplicates.Add(duplicateIndex);
				}
			}

			// set up event handlers
			engine.SpeechHypothesized += new EventHandler<SpeechHypothesizedEventArgs>(SpeechHypothesised);
			engine.SpeechRecognized += new EventHandler<SpeechRecognizedEventArgs>(SpeechRecognised);

			// start recognising
			engine.SetInputToDefaultAudioDevice();
			engine.RecognizeAsync(RecognizeMode.Multiple);

			Console.Write("Listening... ");

			// wait for SpeechRecognised to flag completion. sleeping between polls keeps
			// this thread from spinning a CPU core flat out while the speaker is reading.
			while (!done)
			{
				Thread.Sleep(50);
			}
		}
		Console.WriteLine();
		Console.WriteLine();
		Console.WriteLine("SRT result:");
		Console.WriteLine();
		Console.WriteLine(srt.ToString());
	}

	/// <summary>
	/// Handler for the engine's SpeechHypothesized event: echoes the last word of the
	/// current hypothesis to the console so progress is visible while a line is being read.
	/// </summary>
	/// <param name="sender">Event source (unused).</param>
	/// <param name="ea">Event data carrying the hypothesised text so far.</param>
	void SpeechHypothesised(object sender, SpeechHypothesizedEventArgs ea)
	{
		// print out words as the engine guesses them
		Console.Write(ea.Result.Text.Split(' ').Last() + " ");
		// flush synchronously: the Task from FlushAsync() was being discarded, so it was
		// never awaited and any failure in it would have gone unobserved
		Console.Out.Flush();
	}

	/// <summary>
	/// Handler for the engine's SpeechRecognized event. Maps the matched grammar back to a
	/// script line (correcting for duplicate lines), re-weights all grammars so the following
	/// line is favoured, appends an SRT entry for the recognised audio, and either flags
	/// completion or advances <c>currentLineNumber</c>.
	/// </summary>
	/// <param name="sender">Event source (unused).</param>
	/// <param name="ea">Event data carrying the recognition result, its grammar and audio timing.</param>
	void SpeechRecognised(object sender, SpeechRecognizedEventArgs ea)
	{
		// we got a fully matched line!

		// newline after the listening results
		Console.WriteLine();

		// the line index is stored in the grammar name
		int recognisedLineNumber = int.Parse(ea.Result.Grammar.Name);

		// did we get a line other than the one we expected?
		if (recognisedLineNumber != currentLineNumber)
		{
			// check if we're detecting a duplicate (single dictionary lookup)
			List<int> knownDuplicates;
			if (duplicates.TryGetValue(recognisedLineNumber, out knownDuplicates) && knownDuplicates.Contains(currentLineNumber))
			{
				Console.WriteLine($"Correcting for duplicate: {recognisedLineNumber} -> {currentLineNumber}.");
				recognisedLineNumber = currentLineNumber;
			}
			else
			{
				// doesn't look like a duplicate. probably skipped a line somewhere.
				Console.WriteLine($"[WARNING] Line skip! Expected {currentLineNumber}, got {recognisedLineNumber}.");
			}
		}

		Console.WriteLine($"Recognised line {recognisedLineNumber}: {ea.Result.Text}");

		// update weights based on progress
		for (int n = 0; n < engine.Grammars.Count; n++)
		{
			if (n <= recognisedLineNumber)
			{
				engine.Grammars[n].Weight = 0.0f; // previous lines are effectively disabled
			}
			else if (n == recognisedLineNumber + 1)
			{
				engine.Grammars[n].Weight = 1.0f; // next line is prioritised
			}
			else
			{
				engine.Grammars[n].Weight = (float)Math.Pow(0.1, n - recognisedLineNumber); // exponential fall-off for future lines
			}
		}

		// print it like an SRT
		// NOTE(review): StartTime exposes Hour/Minute/Second, so it looks like a clock time rather
		// than an offset from the start of the audio - confirm whether that is what is wanted here.
		var st = ea.Result.Audio.StartTime;
		var et = st + ea.Result.Audio.Duration;
		// SRT timestamps are HH:MM:SS,mmm - milliseconds take exactly three digits (was d4)
		string srtText = $"{srtLine}" + Environment.NewLine +
			$"{st.Hour:d2}:{st.Minute:d2}:{st.Second:d2},{st.Millisecond:d3}" +
			" --> " +
			$"{et.Hour:d2}:{et.Minute:d2}:{et.Second:d2},{et.Millisecond:d3}" + Environment.NewLine +
			$"{ea.Result.Text}" + Environment.NewLine + Environment.NewLine;
		Console.Write(srtText);
		// add this to the SRT file
		srt.Append(srtText);

		srtLine++;

		// are we at the end?
		if (recognisedLineNumber + 1 >= engine.Grammars.Count)
		{
			Console.WriteLine("COMPLETED!");
			done = true;
			return;
		}

		// next line
		currentLineNumber = recognisedLineNumber + 1;
		Console.WriteLine($"Current line updated to {currentLineNumber}, weight={engine.Grammars[currentLineNumber].Weight}: {lines[currentLineNumber]}");

		Console.Write("Listening... ");
	}