Last active
May 8, 2022 03:38
-
-
Save CodingOctocat/3d80252fd59d3895cae7ab370afd442e to your computer and use it in GitHub Desktop.
Speech-to-Subtitle (word-level timestamp) code snippet based on MS Azure Speech.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Represents a subtitle entry made up of its basic elements.
/// </summary>
public interface ISubtitle
{
    /// <summary>
    /// Gets or sets the time at which the subtitle starts.
    /// </summary>
    TimeSpan Begin { get; set; }

    /// <summary>
    /// Gets or sets the time at which the subtitle ends.
    /// </summary>
    TimeSpan End { get; set; }

    /// <summary>
    /// Gets or sets the sequence number of the subtitle, starting at 1.
    /// </summary>
    [Range(1, int.MaxValue)]
    int Order { get; set; }

    /// <summary>
    /// Gets or sets the text content of the subtitle.
    /// </summary>
    string Text { get; set; }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Represents a progress value together with a human-readable description.
/// </summary>
public class ProgressInfo
{
    /// <summary>
    /// Gets or sets the description of the current progress.
    /// </summary>
    public string Description { get; set; }

    /// <summary>
    /// Gets or sets the progress as a fraction (0 to 1).
    /// </summary>
    [Range(0d, 1d)]
    public double Progress { get; set; }

    /// <summary>
    /// Initializes a new instance with the given progress fraction and description.
    /// </summary>
    /// <param name="progress">The progress fraction (0 to 1).</param>
    /// <param name="description">The text describing the progress.</param>
    public ProgressInfo(double progress, string description)
        => (Progress, Description) = (progress, description);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Represents a subtitle service that turns speech into subtitles with word-level timestamps.
/// </summary>
public class SrtService
{
    /// <summary>
    /// Gets the speech configuration used to create recognizers.
    /// </summary>
    public SpeechConfig SpeechConfig { get; }

    /// <summary>
    /// Initializes a new instance that recognizes speech with the given configuration.
    /// </summary>
    /// <param name="speechConfig">The speech configuration passed to every recognizer.</param>
    public SrtService(SpeechConfig speechConfig)
    {
        SpeechConfig = speechConfig;
    }

    /// <summary>
    /// Runs continuous recognition over <paramref name="audioConfig"/> and builds the subtitles,
    /// splitting every recognized phrase into timed units and inserting silence units between them.
    /// </summary>
    /// <param name="audioConfig">
    /// The audio input. Its "Duration" property must hold the audio length as a parsable
    /// <see cref="TimeSpan"/> string: audioConfig.SetProperty("Duration", "...").
    /// </param>
    /// <param name="progress">Optional progress sink; may be null.</param>
    /// <param name="cancellationToken">
    /// When cancelled, recognition is stopped and the subtitles collected so far are returned.
    /// </param>
    /// <returns>The subtitles recognized from the audio.</returns>
    public async Task<Srt> BuildSrtAsync(AudioConfig audioConfig, IProgress<ProgressInfo> progress = default, CancellationToken cancellationToken = default)
    {
        // One completion source per call: a shared instance field would already be completed
        // on the second call and make it return immediately.
        var completion = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);

        using var recognizer = new SpeechRecognizer(SpeechConfig, audioConfig);

        var srt = new Srt() {
            // The audioConfig passed in should set the duration to support progress.
            // audioConfig.SetProperty("Duration", "Duration of speech in string format")
            Duration = TimeSpan.Parse(audioConfig.GetProperty("Duration"))
        };

        int order = 1;
        var recognizingSubtitles = new Stack<Subtitle>();

        // The SpeechRecognizer events are subscribed below in the order they are expected to fire.
        recognizer.SessionStarted += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.SessionStarted.");
        };

        recognizer.SpeechStartDetected += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.SpeechStartDetected.");
        };

        // Azure Speech recognition is incremental: each Recognizing event carries a longer hypothesis.
        recognizer.Recognizing += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.Recognizing.");

            if (cancellationToken.IsCancellationRequested)
            {
                // The token registration below ends the run; just ignore late results.
                Debug.WriteLine($">>> BuildSrtAsync.Recognizing.Cancelled.");
                return;
            }

            var subtitle = new Subtitle(order, e.Result.Text, TimeSpan.FromTicks(e.Result.OffsetInTicks), TimeSpan.FromTicks(e.Result.OffsetInTicks + e.Result.Duration.Ticks));
            recognizingSubtitles.Push(subtitle);

            progress?.Report(new ProgressInfo(subtitle.End / srt.Duration, $"正在识别: {subtitle}"));
            Debug.WriteLine($">>> [{subtitle.Begin} -> {subtitle.End}] | ({subtitle.Duration}) {subtitle.Text}");
        };

        recognizer.Recognized += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.Recognized.");

            if (cancellationToken.IsCancellationRequested)
            {
                Debug.WriteLine($">>> BuildSrtAsync.Recognized.Cancelled.");
                return;
            }

            try
            {
                // A final result without text has nothing to subtitle; drop its hypotheses
                // without consuming an order number.
                if (String.IsNullOrWhiteSpace(e.Result.Text))
                {
                    recognizingSubtitles.Clear();
                    return;
                }

                var begin = TimeSpan.FromTicks(e.Result.OffsetInTicks);
                var end = begin + e.Result.Duration;
                var subtitle = new Subtitle(order++, e.Result.Text, begin, end);

                BuildUnits(subtitle, FilterRecognizingSubtitles(recognizingSubtitles));

                srt.Add(subtitle);
                recognizingSubtitles.Clear();

                progress?.Report(new ProgressInfo(subtitle.End / srt.Duration, $"已识别: {subtitle}"));
                Debug.WriteLine($"{subtitle}\n");
            }
            catch (Exception ex)
            {
                // Surface the failure to the awaiting caller instead of losing it inside the callback.
                completion.TrySetException(ex);
            }
        };

        recognizer.SessionStopped += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.SessionStopped.");
            completion.TrySetResult(true);
        };

        recognizer.Canceled += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.Canceled.");
            completion.TrySetResult(true);
        };

        recognizer.SpeechEndDetected += (sender, e) => {
            Debug.WriteLine(">>> BuildSrtAsync.SpeechEndDetected.");
            progress?.Report(new ProgressInfo(1, $"识别结束: {e.Offset}"));
            completion.TrySetResult(true);
        };

        // Cancelling the token ends the wait so recognition can be stopped from this method
        // rather than from inside a recognizer callback.
        using var cancellationRegistration = cancellationToken.Register(() => completion.TrySetResult(true));

        await recognizer.StartContinuousRecognitionAsync();

        try
        {
            await completion.Task;
        }
        finally
        {
            await recognizer.StopContinuousRecognitionAsync();
        }

        if (srt.Count > 0)
        {
            var first = srt.First();
            var last = srt.Last();

            // Lead-in silence: from the start of the audio to the first subtitle.
            first.Units.Insert(0, new SubtitleUnit($"<入场静音/{first.Begin:s\\.fff}s>", TimeSpan.Zero, first.Begin, first));

            // Lead-out silence: from the last subtitle to the end of the audio.
            // Custom TimeSpan formats never print a sign, so the operand order only affects readability.
            last.Units.Add(new SubtitleUnit($"<退场静音/{srt.Duration - last.End:s\\.fff}s>", last.End, srt.Duration, last));
        }

        if (srt.Count > 1)
        {
            // Silence between consecutive subtitles, appended to the earlier one.
            for (int i = 0; i < srt.Count - 1; i++)
            {
                var subtitle = srt[i];
                var nextBegin = srt[i + 1].Begin;
                var gapBegin = subtitle.Units.Count > 0 ? subtitle.Units.Last().End : subtitle.End;

                subtitle.Units.Add(new SubtitleUnit($"<转场静音/{nextBegin - gapBegin:s\\.fff}s>", gapBegin, nextBegin, subtitle));
            }
        }

        return srt;
    }

    /// <summary>
    /// Populates <paramref name="subtitle"/>.Units from the filtered hypotheses, whose stack yields
    /// the shortest hypothesis first. Each hypothesis contributes the words it added over the previous one.
    /// </summary>
    private static void BuildUnits(Subtitle subtitle, Stack<Subtitle> hypotheses)
    {
        // No intermediate hypotheses: distribute the whole phrase over the subtitle's own interval.
        if (hypotheses.Count == 0)
        {
            AppendUnits(subtitle, new SubtitleUnit(subtitle.Text, subtitle.Begin, subtitle.End, subtitle).Split());
            return;
        }

        string[] recognizedUnitTexts = subtitle.Text.Split(' ');

        // The first (shortest) hypothesis seeds the split.
        var previous = hypotheses.Pop();
        string[] previousUnitTexts = previous.Text.Split(' ');
        AppendUnits(subtitle, new SubtitleUnit(previous.Text, previous.Begin, previous.End, subtitle).Split());

        // Every remaining hypothesis contributes the words it added.
        // NOTE(review): words of the final text beyond the longest hypothesis get no unit — confirm this is intended.
        while (hypotheses.Count > 0)
        {
            var current = hypotheses.Pop();
            string[] currentUnitTexts = current.Text.Split(' ');

            // Skip hypotheses with more tokens than the final text (e.g. content containing hyphens).
            if (currentUnitTexts.Length > recognizedUnitTexts.Length)
            {
                continue;
            }

            // The final text is the normalized form (phone numbers, numbers, abbreviations such as
            // "doctor smith" -> "dr smith", and other transformations), so phrases are always taken from it.
            // The filtered hypotheses grow strictly in token count, which keeps the range valid.
            string incUnitText = String.Join(' ', recognizedUnitTexts[previousUnitTexts.Length..currentUnitTexts.Length]);
            AppendUnits(subtitle, new SubtitleUnit(incUnitText, previous.End, current.End, subtitle).Split());

            previous = current;
            previousUnitTexts = currentUnitTexts;
        }
    }

    /// <summary>
    /// Adds the units to the subtitle and traces each one; the lazy sequence is enumerated only once.
    /// </summary>
    private static void AppendUnits(Subtitle subtitle, IEnumerable<SubtitleUnit> units)
    {
        foreach (var unit in units)
        {
            subtitle.Units.Add(unit);
            Debug.WriteLine($">>> [{unit.Begin} -> {unit.End}] | ({unit.Duration}) {unit.Text}");
        }
    }

    /// <summary>
    /// Keeps only hypotheses that form a chain of word-prefixes of the most recent hypothesis.
    /// The input stack is drained; the returned stack yields the shortest kept hypothesis first.
    /// </summary>
    /// <param name="recognizingSubtitles">Hypotheses with the most recent one on top.</param>
    /// <returns>The filtered hypotheses, or the input stack itself when it is empty.</returns>
    private static Stack<Subtitle> FilterRecognizingSubtitles(Stack<Subtitle> recognizingSubtitles)
    {
        if (recognizingSubtitles.Count == 0)
        {
            return recognizingSubtitles;
        }

        var filtereds = new Stack<Subtitle>();

        var top = recognizingSubtitles.Pop();
        string[] topUnitTexts = top.Text.Split(' ');
        filtereds.Push(top);

        while (recognizingSubtitles.Count > 0)
        {
            var current = recognizingSubtitles.Pop();
            string[] currentUnitTexts = current.Text.Split(' ');

            // Keep an older hypothesis only if it is strictly shorter than, and a word-prefix of,
            // the last hypothesis that was kept.
            if (currentUnitTexts.Length < topUnitTexts.Length && currentUnitTexts.SequenceEqual(topUnitTexts.Take(currentUnitTexts.Length)))
            {
                filtereds.Push(current);
                topUnitTexts = currentUnitTexts;
            }
        }

        return filtereds;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Represents a single subtitle entry.
/// </summary>
[NotifyPropertyChanged]
public class Subtitle : SubtitleBase
{
    /// <summary>
    /// Gets the units (words or informal phrases) that this subtitle has been split into.
    /// </summary>
    public ObservableCollection<SubtitleUnit> Units { get; private set; } = new();

    /// <summary>
    /// Initializes a new subtitle with no units.
    /// </summary>
    /// <param name="order">The sequence number of the subtitle.</param>
    /// <param name="text">The text content of the subtitle.</param>
    /// <param name="begin">The time at which the subtitle starts.</param>
    /// <param name="end">The time at which the subtitle ends.</param>
    public Subtitle(int order, string text, TimeSpan begin, TimeSpan end) : base(order, text, begin, end)
    { }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Represents an abstract subtitle: an ordered, timed piece of text.
/// </summary>
[NotifyPropertyChanged]
public abstract class SubtitleBase : ISubtitle
{
    /// <summary>
    /// Gets or sets the time at which the subtitle starts.
    /// </summary>
    public TimeSpan Begin { get; set; }

    /// <summary>
    /// Gets the length of time the subtitle is shown (<see cref="End"/> minus <see cref="Begin"/>).
    /// </summary>
    public TimeSpan Duration => End - Begin;

    /// <summary>
    /// Gets or sets the time at which the subtitle ends.
    /// </summary>
    public TimeSpan End { get; set; }

    /// <summary>
    /// Gets or sets the sequence number of the subtitle.
    /// </summary>
    public int Order { get; set; }

    /// <summary>
    /// Gets or sets the text content of the subtitle.
    /// </summary>
    public string Text { get; set; }

    /// <summary>
    /// Initializes the subtitle with its sequence number, text and time interval.
    /// </summary>
    protected SubtitleBase(int order, string text, TimeSpan begin, TimeSpan end)
        => (Order, Text, Begin, End) = (order, text, begin, end);

    /// <summary>
    /// Formats the subtitle as three lines: the order, "begin --> end" with
    /// hh:mm:ss,fff timestamps, and the text.
    /// </summary>
    public override string ToString()
    {
        const string timestampFormat = @"hh\:mm\:ss\,fff";

        return $"{Order}\n{Begin.ToString(timestampFormat)} --> {End.ToString(timestampFormat)}\n{Text}";
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// <summary>
/// Represents a unit that a <seealso cref="Subtitle"/> was split into by the speech recognition
/// service; a unit may be a single word or an (informal) phrase.
/// </summary>
[NotifyPropertyChanged]
public class SubtitleUnit : SubtitleBase
{
    /// <summary>
    /// Gets the sequence number of the parent subtitle.
    /// </summary>
    public new int Order => Parent.Order;

    /// <summary>
    /// Gets or sets the subtitle this unit belongs to.
    /// </summary>
    public Subtitle Parent { get; set; }

    /// <summary>
    /// Gets the first run of word characters, digits and hyphens (-) in the unit's text.
    /// <para><seealso cref="SubtitleBase.Text"/> may contain punctuation.</para>
    /// </summary>
    [SafeForDependencyAnalysis]
    public string Word => Regex.Match(Text, @"[\w\d-]+").Value;

    /// <summary>
    /// Initializes a unit of <paramref name="parent"/> covering the given interval.
    /// </summary>
    /// <param name="text">The text of the unit.</param>
    /// <param name="begin">The time at which the unit starts.</param>
    /// <param name="end">The time at which the unit ends.</param>
    /// <param name="parent">The subtitle the unit belongs to.</param>
    /// <exception cref="ArgumentNullException"><paramref name="parent"/> is null.</exception>
    public SubtitleUnit(string text, TimeSpan begin, TimeSpan end, Subtitle parent)
        : base((parent ?? throw new ArgumentNullException(nameof(parent))).Order, text, begin, end)
    {
        Parent = parent;
    }

    /// <summary>
    /// Splits this unit into smaller consecutive units, giving each a share of the duration
    /// proportional to its character count.
    /// <para>TODO: mixed Chinese/English content is not supported.</para>
    /// </summary>
    /// <param name="sep">The separator to split on; defaults to a single space.</param>
    /// <returns>The units in order, each starting where the previous one ends.</returns>
    public IEnumerable<SubtitleUnit> Split(string sep = " ")
    {
        string[] units = (Text ?? String.Empty).Split(sep);
        int totalLength = units.Sum(x => x.Length);

        // Empty or separator-only text has no characters to weight by; dividing by zero would
        // throw, so share the duration equally instead (Split always yields at least one token).
        bool weightByLength = totalLength > 0;
        double totalWeight = weightByLength ? totalLength : units.Length;

        var begin = Begin;
        var duration = Duration;

        foreach (string unit in units)
        {
            double weight = weightByLength ? unit.Length : 1;
            var end = begin + (duration * weight / totalWeight);

            yield return new SubtitleUnit(unit, begin, end, Parent);

            begin = end;
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment