Skip to content

Instantly share code, notes, and snippets.

@jakkaj
Last active March 14, 2017 12:02
Show Gist options
  • Save jakkaj/f69b64d5217be3192c84f398741190a9 to your computer and use it in GitHub Desktop.
Save jakkaj/f69b64d5217be3192c84f398741190a9 to your computer and use it in GitHub Desktop.
Parse WebVTT to POCO ready for indexing using Azure Search.
/// <summary>
/// Parses WebVTT caption text into one <see cref="IndexText"/> per cue.
/// </summary>
/// <remarks>
/// Only cue payload is returned. The "WEBVTT" header (including any trailing
/// metadata lines), NOTE/STYLE blocks and cue identifiers are skipped, because
/// none of them appear between a timing line and the blank line that ends the cue.
/// The lines of a multi-line cue are joined with a single space so that each cue
/// produces exactly one entry; <c>IndexId</c> is built from the video id and the
/// cue timings, so emitting one entry per line would produce duplicate ids.
/// </remarks>
/// <param name="vtt">Full WebVTT document text.</param>
/// <param name="videoId">Identifier copied to every entry and used as the prefix of <c>IndexId</c>.</param>
/// <returns>The cues in document order; empty when <paramref name="vtt"/> contains no cues.</returns>
/// <exception cref="ArgumentNullException"><paramref name="vtt"/> is null.</exception>
/// <exception cref="FormatException">A timing line contains a timestamp that cannot be parsed.</exception>
static List<IndexText> Parse(string vtt, string videoId)
{
    if (vtt == null)
    {
        throw new ArgumentNullException("vtt");
    }

    // "\r\n" must come first so a CRLF pair is consumed as one terminator;
    // a lone "\r" is also a legal WebVTT line terminator.
    string[] lines = vtt.Split(new string[] { "\r\n", "\n", "\r" }, StringSplitOptions.None);
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var tcStart = default(TimeSpan);
    var tcEnd = default(TimeSpan);
    var indexList = new List<IndexText>();

    // True between a timing line and the blank line that terminates the cue.
    var inCue = false;
    // Entry for the cue currently being read; null until its first payload line.
    IndexText current = null;

    foreach (var l in lines)
    {
        if (string.IsNullOrWhiteSpace(l))
        {
            // A blank line ends the current cue (or header/NOTE block).
            inCue = false;
            current = null;
            continue;
        }

        var arrowAt = l.IndexOf("-->", StringComparison.Ordinal);
        if (arrowAt != -1)
        {
            // Timing line: "<start> --> <end> [cue settings...]".
            var startToken = l.Substring(0, arrowAt).Trim();
            var afterArrow = l.Substring(arrowAt + 3)
                .Split(new char[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);
            // Only the first token is the end timestamp; anything after it is
            // cue settings (e.g. "align:start position:0%") and must not be parsed.
            var endToken = afterArrow.Length > 0 ? afterArrow[0] : string.Empty;

            tcStart = ParseVttTimestamp(startToken);
            tcEnd = ParseVttTimestamp(endToken);
            inCue = true;
            current = null;
            continue;
        }

        if (!inCue)
        {
            // Header text, NOTE/STYLE content or a cue identifier - not caption text.
            continue;
        }

        if (current != null)
        {
            // Further payload line of the same cue: extend the existing entry
            // instead of adding a second one with an identical IndexId.
            current.Text = current.Text + " " + l;
            continue;
        }

        // Invariant formatting keeps the id stable regardless of the host culture.
        var s = tcStart.TotalMilliseconds.ToString(invariant);
        var e = tcEnd.TotalMilliseconds.ToString(invariant);
        current = new IndexText
        {
            Text = l,
            Start = s,
            End = e,
            IndexId = videoId + s + e,
            VideoId = videoId
        };
        indexList.Add(current);
    }

    return indexList;
}

/// <summary>
/// Parses a single WebVTT timestamp using the invariant culture.
/// </summary>
/// <param name="token">Timestamp text such as "00:01:02.500" or "01:02.500".</param>
/// <returns>The timestamp as a <see cref="TimeSpan"/>.</returns>
static TimeSpan ParseVttTimestamp(string token)
{
    var firstColon = token.IndexOf(':');
    // WebVTT allows the hours component to be omitted ("mm:ss.ttt"), which
    // TimeSpan.Parse does not accept. Requiring the '.' keeps a plain "hh:mm"
    // value on the interpretation TimeSpan.Parse has always given it.
    if (firstColon != -1 && firstColon == token.LastIndexOf(':') && token.IndexOf('.') != -1)
    {
        token = "00:" + token;
    }

    return TimeSpan.Parse(token, System.Globalization.CultureInfo.InvariantCulture);
}
/// <summary>
/// One searchable piece of caption text together with the timing and video it belongs to.
/// </summary>
public class IndexText
{
    /// <summary>Document key; Parse builds it as the video id followed by the start and end values.</summary>
    [Key, IsFilterable]
    public string IndexId { get; set; }

    /// <summary>Cue start; Parse stores the total milliseconds rendered as a string.</summary>
    public string Start { get; set; }

    /// <summary>Cue end; Parse stores the total milliseconds rendered as a string.</summary>
    public string End { get; set; }

    /// <summary>Caption text, analyzed with the English Lucene analyzer.</summary>
    [IsSearchable, Analyzer(AnalyzerName.AsString.EnLucene)]
    public string Text { get; set; }

    /// <summary>Identifier of the video the caption came from.</summary>
    [IsSortable, IsFilterable, IsSearchable]
    public string VideoId { get; set; }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment