Skip to content

Instantly share code, notes, and snippets.

@CodingOctocat
Last active May 8, 2022 03:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save CodingOctocat/3d80252fd59d3895cae7ab370afd442e to your computer and use it in GitHub Desktop.
Save CodingOctocat/3d80252fd59d3895cae7ab370afd442e to your computer and use it in GitHub Desktop.
Speech-to-Subtitle (word-level timestamp) code snippet based on MS Azure Speech.
/// <summary>
/// Represents a subtitle entry carrying the basic elements: timing, order and text.
/// </summary>
public interface ISubtitle
{
    /// <summary>
    /// Gets or sets the time at which the subtitle starts.
    /// </summary>
    TimeSpan Begin { get; set; }

    /// <summary>
    /// Gets or sets the time at which the subtitle ends.
    /// </summary>
    TimeSpan End { get; set; }

    /// <summary>
    /// Gets or sets the sequence number of the subtitle, starting at 1.
    /// </summary>
    [Range(1, int.MaxValue)]
    int Order { get; set; }

    /// <summary>
    /// Gets or sets the subtitle text.
    /// </summary>
    string Text { get; set; }
}
/// <summary>
/// Represents a progress value paired with a textual description.
/// </summary>
public class ProgressInfo
{
    /// <summary>
    /// Gets or sets the description of the current progress.
    /// </summary>
    public string Description { get; set; }

    /// <summary>
    /// Gets or sets the progress value (0 to 1).
    /// </summary>
    [Range(0.0, 1.0)]
    public double Progress { get; set; }

    /// <summary>
    /// Creates a progress report from a value and its description.
    /// </summary>
    /// <param name="progress">The progress value.</param>
    /// <param name="description">The text describing the progress.</param>
    public ProgressInfo(double progress, string description)
        => (Progress, Description) = (progress, description);
}
/// <summary>
/// Subtitle service that builds an <see cref="Srt"/> from speech continuous recognition,
/// splitting every recognized subtitle into timed <see cref="SubtitleUnit"/>s.
/// </summary>
public class SrtService
{
    /// <summary>
    /// Gets the speech configuration used to create the recognizer.
    /// </summary>
    public SpeechConfig SpeechConfig { get; }

    public SrtService(SpeechConfig speechConfig)
    {
        SpeechConfig = speechConfig;
    }

    /// <summary>
    /// Runs continuous recognition over <paramref name="audioConfig"/> and collects the results as subtitles.
    /// </summary>
    /// <param name="audioConfig">
    /// The audio input. It must carry a "Duration" property parseable by <see cref="TimeSpan.Parse(string)"/>
    /// (set via <c>audioConfig.SetProperty("Duration", ...)</c>); it is used to compute progress and the trailing silence.
    /// </param>
    /// <param name="progress">Optional progress sink; may be null.</param>
    /// <param name="cancellationToken">
    /// When cancelled, recognition is stopped and the subtitles recognized so far are returned
    /// (no <see cref="OperationCanceledException"/> is thrown).
    /// </param>
    /// <returns>The recognized subtitles, including silence units before, between and after them.</returns>
    public async Task<Srt> BuildSrtAsync(AudioConfig audioConfig, IProgress<ProgressInfo> progress = default, CancellationToken cancellationToken = default)
    {
        // One completion source per call so the service can be reused. Continuations run asynchronously
        // so that the code after the await never executes on the recognizer's event thread.
        var completion = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);

        using var recognizer = new SpeechRecognizer(SpeechConfig, audioConfig);

        var srt = new Srt() {
            // The audioConfig passed in should set the duration to support progress.
            // audioConfig.SetProperty("Duration", "Duration of speech in string format")
            Duration = TimeSpan.Parse(audioConfig.GetProperty("Duration"))
        };

        int order = 1;
        var recognizingSubtitles = new Stack<Subtitle>();

        recognizer.SessionStarted += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.SessionStarted.");
        };

        recognizer.SpeechStartDetected += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.SpeechStartDetected.");
        };

        // Recognition is incremental: every Recognizing event carries a longer hypothesis of the current phrase.
        recognizer.Recognizing += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.Recognizing.");

            if (cancellationToken.IsCancellationRequested)
            {
                Debug.WriteLine($">>> BuildSrtAsync.Recognizing.Cancelled.");
                return;
            }

            var begin = TimeSpan.FromTicks(e.Result.OffsetInTicks);
            var hypothesis = new Subtitle(order, e.Result.Text, begin, begin + e.Result.Duration);
            recognizingSubtitles.Push(hypothesis);

            progress?.Report(new ProgressInfo(hypothesis.End / srt.Duration, $"正在识别: {hypothesis}"));
            Debug.WriteLine($">>> [{hypothesis.Begin} -> {hypothesis.End}] | ({hypothesis.Duration}) {hypothesis.Text}");
        };

        recognizer.Recognized += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.Recognized.");

            if (cancellationToken.IsCancellationRequested)
            {
                Debug.WriteLine($">>> BuildSrtAsync.Recognized.Cancelled.");
                return;
            }

            try
            {
                var begin = TimeSpan.FromTicks(e.Result.OffsetInTicks);
                var subtitle = new Subtitle(order, e.Result.Text, begin, begin + e.Result.Duration);
                var hypotheses = FilterRecognizingSubtitles(recognizingSubtitles);

                if (hypotheses.Count == 0 && String.IsNullOrWhiteSpace(subtitle.Text))
                {
                    // Nothing was recognized for this segment (no hypotheses, no text): emit no subtitle
                    // and do not consume an order number.
                    return;
                }

                PopulateUnits(subtitle, hypotheses);
                srt.Add(subtitle);
                order++;
                recognizingSubtitles.Clear();

                progress?.Report(new ProgressInfo(subtitle.End / srt.Duration, $"已识别: {subtitle}"));
                Debug.WriteLine($"{subtitle}\n");
            }
            catch (Exception ex)
            {
                // Surface the failure to the awaiting caller instead of losing it on the event thread.
                completion.TrySetException(ex);
            }
        };

        // Any of the three end-of-recognition signals completes the call, so it cannot wait forever
        // when the session stops or is cancelled without a SpeechEndDetected event.
        recognizer.SessionStopped += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.SessionStopped.");
            completion.TrySetResult(true);
        };

        recognizer.Canceled += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.Canceled.");
            completion.TrySetResult(true);
        };

        recognizer.SpeechEndDetected += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.SpeechEndDetected.");
            progress?.Report(new ProgressInfo(1, $"识别结束: {e.Offset}"));
            completion.TrySetResult(true);
        };

        // Caller cancellation also ends the wait, even if no further recognizer event arrives.
        using var cancellationRegistration = cancellationToken.Register(() => completion.TrySetResult(true));

        await recognizer.StartContinuousRecognitionAsync();

        try
        {
            await completion.Task;
        }
        finally
        {
            // Stop exactly once, outside of the recognizer's own event handlers.
            await recognizer.StopContinuousRecognitionAsync();
        }

        AddSilenceUnits(srt);

        return srt;
    }

    /// <summary>
    /// Fills <paramref name="subtitle"/>.Units from the chain of intermediate hypotheses: the time between
    /// two consecutive hypotheses is assigned to the words that the later one added.
    /// </summary>
    /// <param name="subtitle">The final recognized subtitle that receives the units.</param>
    /// <param name="hypotheses">Filtered hypotheses, shortest on top; drained by this call.</param>
    private static void PopulateUnits(Subtitle subtitle, Stack<Subtitle> hypotheses)
    {
        if (hypotheses.Count == 0)
        {
            // No intermediate timing information: split the whole subtitle by character share.
            AppendUnits(subtitle, new SubtitleUnit(subtitle.Text, subtitle.Begin, subtitle.End, subtitle).Split());
            return;
        }

        string[] recognizedUnitTexts = subtitle.Text.Split(' ');

        // The first (shortest) hypothesis seeds the split.
        // NOTE(review): this first chunk uses the hypothesis text, while later chunks use the final
        // recognized text — confirm whether that difference is intended.
        var previous = hypotheses.Pop();
        string[] previousUnitTexts = previous.Text.Split(' ');
        AppendUnits(subtitle, new SubtitleUnit(previous.Text, previous.Begin, previous.End, subtitle).Split());

        while (hypotheses.Count > 0)
        {
            var current = hypotheses.Pop();
            string[] currentUnitTexts = current.Text.Split(' ');

            // Skip hypotheses with more words than the final text (e.g. when the final text joins words with a hyphen).
            if (currentUnitTexts.Length > recognizedUnitTexts.Length)
            {
                continue;
            }

            // The final text is the normalized form (phone numbers, numbers, abbreviations such as
            // "doctor smith" -> "dr smith", and other transformations), so the words are always taken from it.
            string incUnitText = String.Join(' ', recognizedUnitTexts[previousUnitTexts.Length..currentUnitTexts.Length]);
            AppendUnits(subtitle, new SubtitleUnit(incUnitText, previous.End, current.End, subtitle).Split());

            previous = current;
            previousUnitTexts = currentUnitTexts;
        }
    }

    /// <summary>
    /// Adds <paramref name="units"/> to the subtitle, enumerating the (possibly deferred) sequence only once.
    /// </summary>
    private static void AppendUnits(Subtitle subtitle, IEnumerable<SubtitleUnit> units)
    {
        foreach (var unit in units)
        {
            subtitle.Units.Add(unit);
            Debug.WriteLine($">>> [{unit.Begin} -> {unit.End}] | ({unit.Duration}) {unit.Text}");
        }
    }

    /// <summary>
    /// Inserts the silence units: before the first subtitle, after the last one, and between consecutive subtitles.
    /// </summary>
    private static void AddSilenceUnits(Srt srt)
    {
        if (srt.Count == 0)
        {
            return;
        }

        var first = srt[0];
        var last = srt[srt.Count - 1];

        first.Units.Insert(0, new SubtitleUnit($"<入场静音/{FormatSeconds(first.Begin)}s>", TimeSpan.Zero, first.Begin, first));
        last.Units.Add(new SubtitleUnit($"<退场静音/{FormatSeconds(srt.Duration - last.End)}s>", last.End, srt.Duration, last));

        for (int i = 0; i < srt.Count - 1; i++)
        {
            var subtitle = srt[i];
            var next = srt[i + 1];

            // Fall back to the subtitle's own end when it has no units.
            var speechEnd = subtitle.Units.LastOrDefault()?.End ?? subtitle.End;

            subtitle.Units.Add(new SubtitleUnit($"<转场静音/{FormatSeconds(next.Begin - speechEnd)}s>", speechEnd, next.Begin, subtitle));
        }
    }

    /// <summary>
    /// Formats the magnitude of <paramref name="gap"/> as total seconds with three millisecond digits ("s.fff").
    /// Unlike the "s" TimeSpan specifier (seconds component, 0-59), gaps of one minute or more are not wrapped.
    /// </summary>
    private static string FormatSeconds(TimeSpan gap)
    {
        var magnitude = gap.Duration();

        return $"{magnitude.Ticks / TimeSpan.TicksPerSecond}.{magnitude.Milliseconds:000}";
    }

    /// <summary>
    /// Keeps only the hypotheses that form a chain of strict word prefixes ending at the most recent hypothesis.
    /// </summary>
    /// <param name="recognizingSubtitles">Hypotheses with the most recent on top; drained when not empty.</param>
    /// <returns>
    /// A stack that pops the shortest hypothesis first and the most recent one last;
    /// the input stack itself when it is empty.
    /// </returns>
    private static Stack<Subtitle> FilterRecognizingSubtitles(Stack<Subtitle> recognizingSubtitles)
    {
        if (recognizingSubtitles.Count == 0)
        {
            return recognizingSubtitles;
        }

        var filtereds = new Stack<Subtitle>();
        var top = recognizingSubtitles.Pop();
        string[] topUnitTexts = top.Text.Split(' ');
        filtereds.Push(top);

        while (recognizingSubtitles.Count > 0)
        {
            var current = recognizingSubtitles.Pop();
            string[] currentUnitTexts = current.Text.Split(' ');

            // Accept only a strictly shorter hypothesis whose words are a prefix of the last accepted one.
            if (currentUnitTexts.Length < topUnitTexts.Length && currentUnitTexts.SequenceEqual(topUnitTexts.Take(currentUnitTexts.Length)))
            {
                filtereds.Push(current);
                topUnitTexts = currentUnitTexts;
            }
        }

        return filtereds;
    }
}
/// <summary>
/// Represents a single subtitle.
/// </summary>
[NotifyPropertyChanged]
public class Subtitle : SubtitleBase
{
    /// <summary>
    /// Gets the units (words or informal phrases) that this subtitle has been split into.
    /// </summary>
    public ObservableCollection<SubtitleUnit> Units { get; private set; } = new();

    public Subtitle(int order, string text, TimeSpan begin, TimeSpan end) : base(order, text, begin, end)
    { }
}
/// <summary>
/// Base class for subtitle-like entries: holds order, text and timing, and renders them as text.
/// </summary>
[NotifyPropertyChanged]
public abstract class SubtitleBase : ISubtitle
{
    public TimeSpan Begin { get; set; }

    /// <summary>
    /// Gets how long the subtitle lasts, i.e. <see cref="End"/> minus <see cref="Begin"/>.
    /// </summary>
    public TimeSpan Duration => End - Begin;

    public TimeSpan End { get; set; }

    public int Order { get; set; }

    public string Text { get; set; }

    protected SubtitleBase(int order, string text, TimeSpan begin, TimeSpan end)
        => (Order, Text, Begin, End) = (order, text, begin, end);

    /// <summary>
    /// Renders the entry as three lines: the order, "begin --> end" (hh:mm:ss,fff), and the text.
    /// </summary>
    public override string ToString()
    {
        const string timestampFormat = @"hh\:mm\:ss\,fff";

        return $"{Order}\n{Begin.ToString(timestampFormat)} --> {End.ToString(timestampFormat)}\n{Text}";
    }
}
/// <summary>
/// Represents a unit that a <seealso cref="Subtitle"/> is split into by the speech recognition service;
/// a unit may be a single word or an (informal) phrase.
/// </summary>
[NotifyPropertyChanged]
public class SubtitleUnit : SubtitleBase
{
    /// <summary>
    /// Gets the order of the parent subtitle (hides the base property).
    /// </summary>
    public new int Order => Parent.Order;

    /// <summary>
    /// Gets or sets the subtitle this unit belongs to.
    /// </summary>
    public Subtitle Parent { get; set; }

    /// <summary>
    /// Gets the first run of word characters, digits and hyphens (-) found in the unit's text.
    /// <para><seealso cref="SubtitleBase.Text"/> may additionally contain punctuation.</para>
    /// </summary>
    [SafeForDependencyAnalysis]
    public string Word => Regex.Match(Text, @"[\w\d-]+").Value;

    public SubtitleUnit(string text, TimeSpan begin, TimeSpan end, Subtitle parent) : base(parent.Order, text, begin, end)
    {
        Parent = parent;
    }

    /// <summary>
    /// Splits this unit into smaller units, giving each piece a share of the time proportional to its character count.
    /// <para>TODO: mixed Chinese/English content is not supported.</para>
    /// </summary>
    /// <param name="sep">The separator to split on; defaults to a single space.</param>
    /// <returns>
    /// The pieces in order; each piece begins where the previous one ends and the last piece ends exactly at <see cref="SubtitleBase.End"/>.
    /// The sequence is produced lazily.
    /// </returns>
    public IEnumerable<SubtitleUnit> Split(string sep = " ")
    {
        string[] pieces = Text.Split(sep);
        int totalLength = pieces.Sum(x => x.Length);
        var begin = Begin;
        var finalEnd = End;
        var duration = Duration;

        for (int i = 0; i < pieces.Length; i++)
        {
            string piece = pieces[i];
            TimeSpan end;

            if (i == pieces.Length - 1)
            {
                // Pin the last piece to the unit's end so rounding never leaves a gap or an overlap.
                end = finalEnd;
            }
            else if (totalLength > 0)
            {
                end = begin + (duration * piece.Length / totalLength);
            }
            else
            {
                // Every piece is empty (empty or separator-only text): share the time evenly
                // instead of dividing by a zero character count.
                end = begin + (duration / pieces.Length);
            }

            yield return new SubtitleUnit(piece, begin, end, Parent);
            begin = end;
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment