Created
October 23, 2011 09:53
-
-
Save pvdz/1307192 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Not intended to be fast. | |
var WebVTTCueTextParser = (function(){ // this will return just the WebVTTCueTextParser function and not leak anything else | |
// local scope: | |
// Shared patterns used by the parsers below.
// Line terminators the file is split on: CRLF, a lone CR, or a lone LF.
var NEWLINE = /\r\n|\r|\n/
// One whitespace character as recognised on a timing line: U+0020, tab or form feed.
var SPACE = /[\u0020\t\f]/
// One character that is not U+0020, tab or form feed.
var NOSPACE = /[^\u0020\t\f]/
/**
 * Line-oriented parser for a complete WebVTT file.
 *
 * Usage: `new WebVTTParser().parse(text)`. Each timing line is handed to
 * WebVTTCueTimingsAndSettingsParser and each cue payload to
 * WebVTTCueTextParser; problems are collected in `errors` rather than thrown.
 */
var WebVTTParser = function() {
  this.linePos = 0
  this.errors = []
}
WebVTTParser.prototype = {
  // Zero-based index of the line currently being examined.
  linePos: null,
  // List of {message, line, col} records.
  errors: null, // instantiate in constructor!

  /**
   * Record a problem at the current line.
   * @param {string} message Description of the problem.
   * @param {number} [col] One-based column, when the caller knows it.
   */
  err: function err(message, col) {
    this.errors.push({message:message, line:this.linePos+1, col:col})
  },

  /**
   * Parse the text of a WebVTT file.
   * @param {string} input Complete file contents.
   * @returns {{cues: Array, errors: Array, time: number}} The parsed cues,
   *   the collected error records, and the elapsed wall-clock time in
   *   milliseconds.
   */
  parse: function(input) {
    //XXX need global search and replace for \0
    var self = this,
        startTime = (new Date).getTime(),
        lines = input.split(NEWLINE),
        cues = [],
        // The sub-parsers call their handler as a plain function, so hand
        // them a closure that keeps the call bound to this instance.
        report = function(message, col) {
          self.err(message, col)
        },
        signature,
        cue,
        timings,
        cuetextparser

    // Start from a clean state so one instance can parse several inputs.
    this.linePos = 0
    this.errors = []

    /* SIGNATURE */
    signature = lines[this.linePos]
    if(
      signature.length < 6 ||
      signature.indexOf("WEBVTT") != 0 ||
      signature.length > 6 &&
        signature[6] != " " &&
        signature[6] != "\t"
    ) {
      this.err("No valid signature. (File needs to start with \"WEBVTT\".)")
    }
    this.linePos++

    /* HEADER */
    while(lines[this.linePos] != "" && lines[this.linePos] != undefined) {
      // XXX not called out in the specification
      this.err("No blank line after the signature. Line ignored.")
      this.linePos++
    }

    /* CUE LOOP */
    while(lines[this.linePos] != undefined) {
      // Skip the blank lines separating cues.
      while(lines[this.linePos] == "") {
        this.linePos++
      }
      if(lines[this.linePos] == undefined)
        continue

      cue = {
        start:0,
        end:0,
        identifier:"",
        pauseOnExit:false,
        direction:"horizontal",
        snapToLines:true,
        linePosition:"auto",
        textPosition:50,
        size:100,
        alignment:"middle",
        text:"",
        tree:null
      }

      // A first line without "-->" is the cue identifier.
      if(lines[this.linePos].indexOf("-->") == -1) {
        cue.identifier = lines[this.linePos]
        this.linePos++
        if(lines[this.linePos] == "" || lines[this.linePos] == undefined) {
          this.err("Cue identifier cannot be standalone.")
          continue
        }
      }

      /* TIMINGS */
      timings = new WebVTTCueTimingsAndSettingsParser(lines[this.linePos], report)
      if(!timings.parse(cue)) {
        /* BAD CUE */
        cue = null
        /* BAD CUE LOOP */
        while(lines[this.linePos] != "" && lines[this.linePos] != undefined) {
          this.linePos++
        }
        continue
      }
      // Step past the timing line so it does not end up in the cue text.
      this.linePos++

      /* CUE TEXT LOOP */
      while(lines[this.linePos] != "" && lines[this.linePos] != undefined) {
        if(cue.text != "")
          cue.text += "\n"
        cue.text += lines[this.linePos]
        this.linePos++
      }
      cuetextparser = new WebVTTCueTextParser(cue.text, report)
      cue.tree = cuetextparser.parse()
      cues.push(cue)
      this.linePos++
    }

    /* END */
    return {cues:cues, errors:this.errors, time:(new Date).getTime()-startTime}
  }
}
/**
 * Parser for a single cue timing line ("start --> end settings") or for a
 * lone timestamp.
 *
 * @param {string} line Text to parse.
 * @param {function(string, number)} [errorHandler] Called with a message and
 *   the one-based column for every problem found. Optional.
 */
var WebVTTCueTimingsAndSettingsParser = function(line, errorHandler) {
  var pos = 0,            // index of the next unread character in line
      parseError = false, // set as soon as any problem is reported
      err = function(message) {
        parseError = true
        if(typeof errorHandler == "function") {
          errorHandler(message, pos+1)
        }
      }

  // Advance past every consecutive character that matches pattern.
  function skip(pattern) {
    while(line[pos] != undefined && pattern.test(line[pos])) {
      pos++
    }
  }

  // Consume and return every consecutive character that matches pattern.
  function collect(pattern) {
    var str = ""
    while(line[pos] != undefined && pattern.test(line[pos])) {
      str += line[pos]
      pos++
    }
    return str
  }

  /*
  http://www.whatwg.org/specs/web-apps/current-work/multipage/the-video-element.html#collect-a-webvtt-timestamp

  Reads "[hh:]mm:ss.ttt" at pos and returns it in seconds, or reports an
  error and returns undefined. The numbered comments follow the steps of
  the algorithm linked above.
  */
  function timestamp() {
    var units = "minutes",
        val1,
        val2,
        val3,
        val4,
        hours
    // 3
    if(line[pos] == undefined) {
      err("No timestamp found.")
      return
    }
    // 4
    if(!/\d/.test(line[pos])) {
      err("Timestamp must start with a character in the range 0-9.")
      return
    }
    // 5-7
    val1 = collect(/\d/)
    if(val1.length > 2 || parseInt(val1, 10) > 59) {
      units = "hours"
    }
    // 8
    if(line[pos] != ":") {
      err("No time unit separator found.")
      return
    }
    pos++
    // 9-11
    val2 = collect(/\d/)
    if(val2.length != 2) {
      err("Must be exactly two digits.")
      return
    }
    // 12
    if(units == "hours" || line[pos] == ":") {
      if(line[pos] != ":") {
        err("No seconds found or minutes is greater than 59.")
        return
      }
      pos++
      val3 = collect(/\d/)
      if(val3.length != 2) {
        err("Must be exactly two digits.")
        return
      }
    } else {
      // Only "mm:ss" was given: shift the fields down and leave hours empty.
      val3 = val2
      val2 = val1
      val1 = ""
    }
    // 13
    if(line[pos] != ".") {
      err("No decimal separator (\".\") found.")
      return
    }
    pos++
    // 14-16
    val4 = collect(/\d/)
    if(val4.length != 3) {
      err("Milliseconds must be given in three digits.")
      return
    }
    // 17
    if(parseInt(val2, 10) > 59) {
      err("You cannot have more than 59 minutes.")
      return
    }
    if(parseInt(val3, 10) > 59) {
      err("You cannot have more than 59 seconds.")
      return
    }
    // An absent hours field counts as zero; parseInt("") would give NaN.
    hours = val1 == "" ? 0 : parseInt(val1, 10)
    return hours * 60 * 60 + parseInt(val2, 10) * 60 + parseInt(val3, 10) + parseInt(val4, 10) / 1000
  }

  /*
  http://www.whatwg.org/specs/web-apps/current-work/multipage/the-video-element.html#parse-the-webvtt-settings

  Reads the "X:value" settings that follow the timestamps and stores the
  valid ones on cue (direction, linePosition/snapToLines, textPosition,
  size, alignment). Invalid settings are reported and skipped.
  */
  function settings(cue) {
    var seen = [],
        setting = "",
        value = ""

    // Reports and skips trailing junk glued to a value; true when it did so.
    function otherwise() {
      if(line[pos] != undefined && NOSPACE.test(line[pos])) {
        err("Invalid setting.")
        skip(NOSPACE)
        return true
      }
      return
    }

    while(line[pos] != undefined) {
      // XXX specification needs update for this
      skip(SPACE)
      if(line[pos] == undefined) {
        return
      }
      setting = line[pos]
      pos++
      if(seen.indexOf(setting) != -1) {
        err("Duplicate setting.")
      }
      seen.push(setting)
      // 5
      if(line[pos] != ":") {
        setting = ""
      }
      // 6 XXX this also skips spaces is that really intentional?
      pos++
      // 7
      if(line[pos] == undefined) {
        // XXX specification needs update for this
        err("No value for setting defined.")
        return
      }
      // 8
      if(setting == "D") { // writing direction
        value = collect(NOSPACE)
        if(value != "vertical" && value != "vertical-lr") {
          err("Writing direction can only be set to 'vertical' or 'vertical-lr'.")
          continue
        }
        cue.direction = value
      } else if(setting == "L") { // line position
        value = collect(/[-%0-9]/)
        // 2
        if(otherwise()) {
          continue
        }
        if(!/\d/.test(value)) {
          err("Line position takes a number or percentage.")
          continue
        }
        // 4
        if(value.indexOf("-", 1) != -1) {
          err("Line position can only have '-' at the start.")
          continue
        }
        // 5
        if(value.indexOf("%") != -1 && value.indexOf("%") != value.length-1) {
          err("Line position can only have '%' at the end.")
          continue
        }
        // 6
        if(value[0] == "-" && value[value.length-1] == "%") {
          err("Line position cannot be a negative percentage.")
          continue
        }
        // 8
        if(value[value.length-1] == "%") {
          if(parseInt(value, 10) > 100) {
            err("Line position cannot be >100%.")
            continue
          }
          cue.snapToLines = false
        }
        cue.linePosition = parseInt(value, 10)
      } else if(setting == "T") { // text position
        value = collect(/\d/)
        // 3
        if(line[pos] != "%") {
          err("Text position must be a percentage.")
          skip(NOSPACE)
          continue
        }
        // 4-6
        pos++
        if(otherwise() || value == "") {
          continue
        }
        // 7-8
        if(parseInt(value, 10) > 100) {
          err("Text position cannot be >100%.")
          continue
        }
        cue.textPosition = parseInt(value, 10)
      } else if(setting == "S") { // size
        value = collect(/\d/)
        // 3
        if(line[pos] != "%") {
          err("Size must be a percentage.")
          skip(NOSPACE)
          continue
        }
        // 4-6
        pos++
        if(otherwise() || value == "") {
          continue
        }
        // 7-8
        if(parseInt(value, 10) > 100) {
          err("Size cannot be >100%.")
          continue
        }
        cue.size = parseInt(value, 10)
      } else if(setting == "A") { // alignment
        value = collect(NOSPACE)
        if(value != "start" && value != "middle" && value != "end") {
          err("Alignment can only be set to 'start', 'middle', or 'end'.")
          continue
        }
        cue.alignment = value
      } else {
        err("Invalid setting.")
        skip(NOSPACE)
      }
    }
  }

  /**
   * Parse a full timing line into cue (start, end and any settings).
   * @param {Object} cue Object that receives the parsed values.
   * @returns {boolean|undefined} true when the whole line parsed without any
   *   reported problem, undefined otherwise.
   */
  this.parse = function(cue) {
    skip(SPACE)
    cue.start = timestamp()
    if(cue.start == undefined) {
      return
    }
    skip(SPACE)
    // 6-8
    if(line[pos] != "-") {
      err("No valid timestamp separator found.")
      return
    }
    pos++
    if(line[pos] != "-") {
      err("No valid timestamp separator found.")
      return
    }
    pos++
    if(line[pos] != ">") {
      err("No valid timestamp separator found.")
      return
    }
    pos++
    skip(SPACE)
    cue.end = timestamp()
    if(cue.end == undefined) {
      return
    }
    skip(SPACE)
    settings(cue)
    if(parseError)
      return
    return true
  }

  /**
   * Parse line as exactly one timestamp with nothing after it.
   * @returns {number|undefined} Seconds, or undefined after reporting a
   *   problem.
   */
  this.parseTimestamp = function() {
    // Must not be named "timestamp": a hoisted local of that name would
    // shadow the inner function before it could be called.
    var seconds = timestamp()
    if(line[pos] != undefined) {
      err("Timestamp must not have trailing characters.")
      return
    }
    return seconds
  }
}
/**
 * Parser that turns the text of one cue into a tree of nodes.
 *
 * @param {string} line Cue text (may contain newlines).
 * @param {function(string, number)} [errorHandler] Called with a message and
 *   the one-based position for every problem found. Optional.
 */
var WebVTTCueTextParser = function(line, errorHandler) {
  var pos = 0, // index of the next unread character in line
      // Named escapes, keyed by the text between "&" and ";".
      ESCAPES = {amp:"&", lt:"<", gt:">"},
      err = function(message) {
        if(typeof errorHandler == "function") {
          errorHandler(message, pos+1)
        }
      }

  /**
   * Build the node tree for the cue text.
   *
   * Node shapes produced:
   *   {type:"text", value, parent}
   *   {type:"object", name, classes, children, parent} (plus `value` holding
   *     the annotation on <v> nodes)
   *   {type:"timestamp", value, parent} where value is the raw tag text
   *
   * @returns {{children: Array}} Root node; it has no name and no parent.
   */
  this.parse = function() {
    var result = {children:[]},
        current = result,
        token,
        name,
        timings,
        timestamp

    // Append a new object node for a start tag and descend into it.
    function attach(token) {
      current.children.push({type:"object", name:token[1], classes:token[2], children:[], parent:current})
      current = current.children[current.children.length-1]
    }

    // True when the current node or one of its ancestors is called name.
    function inScope(name) {
      var node = current
      while(node) {
        if(node.name == name)
          return true
        node = node.parent
      }
      return
    }

    while(line[pos] != undefined) {
      token = nextToken()
      if(token[0] == "text") {
        current.children.push({type:"text", value:token[1], parent:current})
      } else if(token[0] == "start tag") {
        name = token[1]
        if(
          name == "c" ||
          name == "i" ||
          name == "b" ||
          name == "u" ||
          name == "ruby"
        ) {
          attach(token)
        } else if(name == "rt" && current.name == "ruby") {
          attach(token)
        } else if(name == "v") {
          if(inScope("v"))
            err("<v> cannot be nested inside itself.")
          attach(token)
          // Keep the annotation on the node that was just attached.
          current.value = token[3] // annotation
        } else {
          err("Incorrect start tag.")
        }
      } else if(token[0] == "end tag") {
        if(token[1] == current.name) {
          current = current.parent
        } else if(token[1] == "ruby" && current.name == "rt") {
          // </ruby> also closes an <rt> that is still open.
          current = current.parent.parent
        } else {
          err("Incorrect end tag.")
        }
      } else if(token[0] == "timestamp") {
        timings = new WebVTTCueTimingsAndSettingsParser(token[1], err)
        timestamp = timings.parseTimestamp()
        if(timestamp != undefined) {
          current.children.push({type:"timestamp", value:token[1], parent:current})
        }
      }
    }
    return result
  }

  /*
  Tokenizer: a small state machine that reads from pos and returns one of
    ["text", string]
    ["start tag", name, classes, annotation]
    ["end tag", name]
    ["timestamp", string]
  A "text" token stops in front of the "<" that ends it, so the following
  call starts on that "<".
  */
  function nextToken() {
    var state = "data",
        result = "",
        buffer = "",
        classes = [],
        c,
        entity
    while(line[pos-1] != undefined || pos == 0) {
      c = line[pos]
      if(state == "data") {
        if(c == "&") {
          buffer = c
          state = "escape"
        } else if(c == "<" && result == "") {
          state = "tag"
        } else if(c == "<" || c == undefined) {
          return ["text", result]
        } else {
          result += c
        }
      } else if(state == "escape") {
        if(c == undefined) {
          err("Incorrect escape.")
          result += buffer
          return ["text", result]
        } else if(c == ";") {
          // buffer always starts with "&"; look the rest up by name.
          entity = buffer.slice(1)
          if(ESCAPES.hasOwnProperty(entity)) {
            result += ESCAPES[entity]
          } else {
            err("Incorrect escape.")
            result += buffer + ";"
          }
          state = "data"
        } else if(/[ampltg]/.test(c)) {
          buffer += c
        } else {
          // XXX spec does not append c
          err("Incorrect escape.")
          result += buffer + c
          state = "data"
        }
      } else if(state == "tag") {
        if(c == " " || c == "\t") {
          state = "start tag annotation"
        } else if(c == ".") {
          state = "start tag class"
        } else if(c == "/") {
          state = "end tag"
        } else if(c != undefined && /\d/.test(c)) {
          result = c
          state = "timestamp tag"
        } else if(c == ">" || c == undefined) {
          if(c == ">")
            pos++
          return ["start tag", "", [], ""]
        } else {
          result = c
          state = "start tag"
        }
      } else if(state == "start tag") {
        if(c == " " || c == "\t") {
          state = "start tag annotation"
        } else if(c == ".") {
          state = "start tag class"
        } else if(c == ">" || c == undefined) {
          if(c == ">")
            pos++
          return ["start tag", result, [], ""]
        } else {
          result += c
        }
      } else if(state == "start tag class") {
        if(c == " " || c == "\t") {
          classes.push(buffer)
          buffer = ""
          state = "start tag annotation"
        } else if(c == ".") {
          classes.push(buffer)
          buffer = ""
        } else if(c == ">" || c == undefined) {
          if(c == ">")
            pos++
          classes.push(buffer)
          return ["start tag", result, classes, ""]
        } else {
          buffer += c
        }
      } else if(state == "start tag annotation") {
        if(c == ">" || c == undefined) {
          if(c == ">")
            pos++
          // XXX normalize buffer
          return ["start tag", result, classes, buffer]
        } else {
          buffer += c
        }
      } else if(state == "end tag") {
        if(c == ">" || c == undefined) {
          if(c == ">")
            pos++
          return ["end tag", result]
        } else {
          result += c
        }
      } else if(state == "timestamp tag") {
        if(c == ">" || c == undefined) {
          if(c == ">")
            pos++
          return ["timestamp", result]
        } else {
          result += c
        }
      } else {
        err("Never happens.") // The joke is it might.
      }
      // 8
      pos++
    }
  }
}
// end of local scope
return WebVTTCueTextParser
})()
/* | |
function serializeChildren(children) { | |
// lousy serialize function | |
var result = "" | |
for (var i = 0; i < children.length; i++) { | |
var child = children[i] | |
if(child.type == "text") { | |
result += child.value | |
} else if(child.type == "object") { | |
result += "<" + child.name + ">" | |
if(child.children) | |
result += serializeChildren(child.children) | |
result += "</" + child.name + ">" | |
} else { | |
result += "XXX" | |
} | |
} | |
return result | |
} | |
var tralla = new WebVTTCueTextParser("&<i>c") | |
trollo = tralla.parse().children | |
alert(serializeChildren(trollo)) | |
*/
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment