|
<!DOCTYPE html> |
|
<html> |
|
<head> |
|
<meta http-equiv="Content-Type" content="text/html;charset=utf-8"> |
|
<title>Lexing and Parsing in JavaScript</title> |
|
<script type="text/javascript" src="http://d3js.org/d3.v2.js" ></script> |
|
<script type="text/javascript" src="http://gerhobbelt.github.com/bl.ocks.org-hack/fixit.js" ></script> |
|
<style> |
|
label |
|
{ |
|
display: block; |
|
} |
|
textarea |
|
{ |
|
height: 500px; |
|
width: 900px; |
|
} |
|
table |
|
{ |
|
border: 2px solid black; |
|
} |
|
td |
|
{ |
|
border: 1px solid grey; |
|
} |
|
th |
|
{ |
|
border: 1px solid grey; |
|
} |
|
</style> |
|
</head> |
|
<body> |
|
<form> |
|
<label for="enter_text_here">Enter your text here</label> |
|
<textarea id="enter_text_here"> |
|
[blurb [headline Astronomy Shoppe products] |
|
|
|
[paragraph In addition to carrying the finest amateur astronomy |
|
equipment, we also manufacture a unique and delightful telescope |
|
accessories. These accessories enhance your observing time with |
|
telescope and binoculars. |
|
] |
|
|
|
[menu_button [button_title Astronomy Shoppe] |
|
[button_item [url /astro_products]ScoutScope II] |
|
[button_item [url /astro_products]Bino-Brac I-IV] |
|
[button_item [url /astro_products]Dob Upgrades] |
|
[button_item [url /astro_products]Unitron Drives] |
|
[button_item [url /astro_products]Scope-Totes] |
|
[button_item [url /astro_products]Scope-Gripps] |
|
[button_item [url /astro_products]Astro-Glow] ] |
|
] |
|
|
|
|
|
[blurb [headline Astronomy Shoppe [reveal catagory]] |
|
|
|
[paragraph In addition to carrying the finest amateur astronomy |
|
equipment, we also manufacture a unique and delightful telescope |
|
accessories. These accessories enhance your observing time with |
|
telescope and binoculars. |
|
] |
|
|
|
[menu_button [button_title Astronomy Shoppe] |
|
[multiple buttons |
|
[url /astro_products]ScoutScope II] |
|
[url /astro_products]Bino-Brac I-IV] |
|
[url /astro_products]Dob Upgrades] |
|
[url /astro_products]Unitron Drives] |
|
[url /astro_products]Scope-Totes] |
|
[url /astro_products]Scope-Gripps] |
|
[url /astro_products]Astro-Glow] ] |
|
] |
|
</textarea> |
|
</form> |
|
<div id="chart"> |
|
</div> |
|
<div id="output"> |
|
<table id="events_statistics"> |
|
<thead> |
|
<tr> |
|
<th colspan='4'>Events fired</th> |
|
</tr> |
|
<tr> |
|
<th>event</th> |
|
<th><html></th> |
|
<th><body></th> |
|
<th><textarea></th> |
|
</tr> |
|
</thead> |
|
<tbody> |
|
</tbody> |
|
</table> |
|
</div> |
|
<script type="text/javascript"> |
|
/* |
|
The approach to coding followed here is largely a Object Oriented one, where objects |
|
are used to store data and define the actions which can be performed on that data. |
|
"The data knows how to act." |
|
|
|
We use a JavaScript-specific idiom where instances of a 'class' are created by calling |
|
a function -- JavaScript functions are so-called first class citizens -- rather than the |
|
special 'new Class()' constructor invocation known from classical languages such as C++. |
|
|
|
We do NOT employ the fact that JavaScript is a language offering 'prototypical inheritance', |
|
rather than class-based inheritance: the code can be transposed to a class-based OOP language |
|
almost 1:1. |
|
|
|
We also do NOT employ the 'closure' feature of the JavaScript language: any variables |
|
which are not declared in the local scope are instance members, so these behave exactly |
|
like class members in a classic class-based OOP language. |
|
|
|
--- |
|
|
|
First of all, we need a way to fetch content entered by the user into the textarea, i.e. |
|
the content we're going to lex/parse/transform/'compile' into an output format: in this case |
|
we are going to produce HTML. |
|
Because we need to watch the activity in the textarea, we're going to register a callback, |
|
which in turn will fire up the compiler to produce a matching HTML output for the current input. |
|
|
|
The compiler is required by design to NEVER FAIL, so any input will produce AN output. We |
|
do accept that invalid input may produce gibberish output ('garbage in = garbage out') but the |
|
compiler should at least show a token effort to recover from minor flaws and faults in the |
|
input. |
|
*/ |
|
|
|
/* |
|
Doesn't matter what you use for HTML DOM node selection. jQuery, mootools, D3... |
|
I use D3 here, because I might just get fancy and construct a graphical representation |
|
of the AST as I go -- something which would be done most easily with D3. |
|
*/ |
|
|
|
var events = [ |
|
"DOMActivate", |
|
"DOMAttrModified", |
|
"DOMAttributeNameChanged", |
|
"DOMCharacterDataModified", |
|
"DOMContentLoaded", |
|
"DOMElementNameChanged", |
|
"DOMFocusIn", |
|
"DOMFocusOut", |
|
"DOMMouseScroll", |
|
"DOMNodeInserted", |
|
"DOMNodeInsertedIntoDocument", |
|
"DOMNodeRemoved", |
|
"DOMNodeRemovedFromDocument", |
|
"DOMSubtreeModified", |
|
"MozMousePixelScroll", |
|
"abort", |
|
"afterprint", |
|
"animationend", |
|
"animationiteration", |
|
"animationstart", |
|
"beforecopy", |
|
"beforecut", |
|
"beforepaste", |
|
"beforeprint", |
|
"beforeunload", |
|
"blur", |
|
"callschanged", |
|
"canplay", |
|
"canplaythrough", |
|
"change", |
|
"click", |
|
"compositionend", |
|
"compositionstart", |
|
"compositionupdate", |
|
"contextmenu", |
|
"copy", |
|
"cuechange", |
|
"cut", |
|
"dblclick", |
|
"delivered", |
|
"devicelight", |
|
"deviceproximity", |
|
"drag", |
|
"dragend", |
|
"dragenter", |
|
"dragleave", |
|
"dragover", |
|
"dragstart", |
|
"drop", |
|
"durationchange", |
|
"emptied", |
|
"ended", |
|
"error", |
|
"focus", |
|
"focusin", |
|
"focusout", |
|
"hashchange", |
|
"incoming", |
|
"input", |
|
"invalid", |
|
"keydown", |
|
"keypress", |
|
"keyup", |
|
"load", |
|
"loadeddata", |
|
"loadedmetadata", |
|
"loadend", |
|
"loadstart", |
|
"message", |
|
"mousedown", |
|
"mouseenter", |
|
"mouseleave", |
|
"mousemove", |
|
"mouseout", |
|
"mouseover", |
|
"mouseup", |
|
"mousewheel", |
|
"mozfullscreenchange", |
|
"offline", |
|
"online", |
|
"pagehide", |
|
"pageshow", |
|
"paste", |
|
"pause", |
|
"play", |
|
"playing", |
|
"popstate", |
|
"progress", |
|
"ratechange", |
|
"readystatechange", |
|
"received", |
|
"reset", |
|
"resize", |
|
"scroll", |
|
"search", |
|
"seeked", |
|
"seeking", |
|
"select", |
|
"selectstart", |
|
"sent", |
|
"show", |
|
"stalled", |
|
"storage", |
|
"submit", |
|
"suspend", |
|
"textInput", |
|
"textinput", |
|
"timeout", |
|
"timeupdate", |
|
"touchcancel", |
|
"touchend", |
|
"touchmove", |
|
"touchstart", |
|
"unload", |
|
"userproximity", |
|
"volumechange", |
|
"waiting", |
|
"webkitfullscreenchange", |
|
"wheel", |
|
]; |
|
var event_statistics = {}; |
|
var ready = false; |
|
var nested_event = 1; // DOMNodeInserted etc. fire while inside d3.selection.append(), etc. |
|
var saw_nested_event = false; |
|
|
|
var ta = d3.select("#enter_text_here"); |
|
for (var evt = 0; evt < events.length; evt++) { |
|
var evt_name = events[evt]; |
|
event_statistics[evt_name] = evt; |
|
events[evt] = { |
|
id: evt, |
|
name: evt_name, |
|
source: [ 0, 0, 0 ] // html, body, textarea |
|
}; |
|
ta.on(evt_name, process_changed_input(2)); |
|
d3.select("html").on(evt_name, process_changed_input(0)); |
|
d3.select("body").on(evt_name, process_changed_input(1)); |
|
} |
|
event_statistics['unknown'] = events.length; |
|
events.push({ |
|
id: -1, |
|
name: '???', |
|
source: [ 0, 0, 0 ] // html, body, textarea |
|
}); |
|
|
|
var evt_table = d3.select("#events_statistics tbody"); |
|
evt_table.selectAll("tr") |
|
.data(events, function(d, i) { |
|
return d.id; |
|
}) |
|
.enter() |
|
.append("tr") |
|
.attr("id", function(d, i) { |
|
return "evt_stats_" + i; |
|
}) |
|
.selectAll("td") |
|
.data(function(d, i) { |
|
return [d, d, d, d]; |
|
}) |
|
.enter() |
|
.append("td") |
|
.html(function(d, i, j) { |
|
return i > 0 ? d.source[i - 1] ? d.source[i - 1] : " " : d.name; |
|
}); |
|
ready = true; |
|
|
|
// the entire previous code section was marked: |
|
// any events that happened while running through there were marked as 'nested' |
|
// and will consequently force the entire event table to update on the next |
|
// event: |
|
nested_event--; |
|
|
|
var cc = compiler(); |
|
|
|
function process_changed_input(i) { |
|
// closure for 'i' |
|
|
|
return function() { |
|
nested_event++; |
|
var evt_name = (d3.event ? d3.event.type : "unknown"); |
|
var evt_idx = event_statistics[evt_name]; |
|
events[evt_idx].source[i]++; |
|
|
|
// do not execute any d3 selection update logic when inside a nested event |
|
// such as DomNodeInserted, or otherwise you'll end up with a locked-up page! |
|
if (nested_event > 1 || (ready && /* catch MSIE DOM events */ evt_name.indexOf("DOM") >= 0)) { |
|
saw_nested_event = true; |
|
} else if (ready && nested_event == 1) { |
|
var input_text = ta.text(); |
|
|
|
//console.log((input_text ? input_text : "").length, evt_name, i, evt_idx, d3.event, this); |
|
|
|
// and update the table content: |
|
if (saw_nested_event) { |
|
evt_table.selectAll("tr") |
|
.selectAll("td") |
|
.html(function(d, k, j) { |
|
return k > 0 ? d.source[k - 1] ? d.source[k - 1] : "-" : d.name; |
|
}); |
|
} else { |
|
evt_table.select("tr#evt_stats_" + evt_idx) |
|
.selectAll("td") |
|
.html(function(d, k, j) { |
|
return k > 0 ? d.source[k - 1] ? d.source[k - 1] : "-" : d.name; |
|
}); |
|
} |
|
|
|
var ast = cc.run(input_text); |
|
show_ast(ast); |
|
} |
|
nested_event--; |
|
}; |
|
} |
|
|
|
/* |
|
lexer 'class' |
|
|
|
old skool think: pccts/lex: a state-machine based scanner. |
|
No regexes are used, as they are serve other purposes better. |
|
*/ |
|
function lexer() { |
|
var state = { |
|
input: '', |
|
input_length: 0, |
|
scan_pos: 0, |
|
//expect_keyword: false, |
|
token_stream: [], |
|
processed_tokens: 0 |
|
}; |
|
|
|
// return FALSE when input didn't change |
|
state.set_input = function(input_text) { |
|
if (this.input === input_text) |
|
return false; |
|
|
|
this.input = input_text; |
|
this.input.length = input_text.length; |
|
return true; |
|
}; |
|
|
|
state.eof = function() { |
|
return this.scan_pos >= this.input_length; |
|
}; |
|
|
|
state.lookahead = function(n) { |
|
// fetch from cache when token was already scanned before: |
|
if (this.processed_tokens < this.token_stream.length) { |
|
return this.token_stream[this.processed_tokens + n - 1]; |
|
} |
|
|
|
if (!this.eof()) { |
|
while (this.processed_tokens + n > this.token_stream.length) { |
|
this.scan_one_token(); |
|
} |
|
|
|
if (this.processed_tokens < this.token_stream.length) { |
|
return this.token_stream[this.processed_tokens + n - 1]; |
|
} |
|
} |
|
return false; |
|
}; |
|
|
|
state.consume = function(n) { |
|
this.processed_tokens += n; |
|
}; |
|
|
|
//state.expect_keyword = function(state) { |
|
// this.expect_keyword = state; |
|
//}; |
|
|
|
state.clone_state = function(id) { |
|
return { |
|
id: id, |
|
scan_pos: this.scan_pos, |
|
token_stream: this.token_stream |
|
}; |
|
}; |
|
|
|
state.scan_one_token = function() { |
|
var start = this.scan_pos; |
|
var end; |
|
var leading_ws; |
|
var code_word = false; |
|
|
|
// skip leading whitespace: |
|
for ( ; start < this.input_length; start++) { |
|
if (" \t\r\n".indexOf(this.input[start]) < 0) |
|
break; |
|
} |
|
leading_ws = this.input.substring(this.scan_pos, start); |
|
switch (this.input[start]) { |
|
case '[': |
|
if ("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".indexOf(this.input[start + 1]) >= 0) { |
|
this.push_token({ |
|
token_id: 1, |
|
token: '[', |
|
pos: start, |
|
leading_ws: leading_ws |
|
}); |
|
} else { |
|
this.push_token({ |
|
token_id: 33, |
|
token: '[', |
|
pos: start, |
|
leading_ws: leading_ws |
|
}); |
|
} |
|
this.scan_pos = start + 1; |
|
break; |
|
|
|
case ']': |
|
this.push_token({ |
|
token_id: 2, |
|
token: ']', |
|
pos: start, |
|
leading_ws: leading_ws |
|
}); |
|
this.scan_pos = start + 1; |
|
break; |
|
|
|
default: |
|
end = start; |
|
while ("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789".indexOf(this.input[end]) >= 0) { |
|
end++; |
|
} |
|
while ("_-".indexOf(this.input[end]) >= 0 |
|
&& "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".indexOf(this.input[end + 1]) >= 0) { |
|
code_word = true; |
|
while ("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789".indexOf(this.input[end]) >= 0) { |
|
end++; |
|
} |
|
} |
|
if (start < end) { |
|
this.push_token({ |
|
token_id: 65, |
|
token: this.input.substring(start, end), |
|
pos: start, |
|
leading_ws: leading_ws |
|
}); |
|
this.scan_pos = end; |
|
break; |
|
} |
|
|
|
// punctuation series? |
|
end = start; |
|
while ("?!#$%^&*-+~=:<.>/|\\".indexOf(this.input[end]) >= 0) { |
|
end++; |
|
} |
|
if (end == start) { |
|
end = start + 1; |
|
} |
|
this.push_token({ |
|
token_id: 66, |
|
token: this.input.substring(start, end), |
|
pos: start, |
|
leading_ws: leading_ws |
|
}); |
|
this.scan_pos = end; |
|
break; |
|
} |
|
} |
|
|
|
state.push_token = function(token) { |
|
this.token_stream.push(token); |
|
}; |
|
|
|
return state; |
|
} |
|
|
|
/* |
|
parser 'class' |
|
|
|
old skool think: pccts/antlr, NOT yacc, as this is a top-down LL(k) parser, |
|
NOT a LALR(1) parser a la yacc/bison. |
|
*/ |
|
function parser() { |
|
var state = { |
|
lexer: 0, |
|
ast: null, |
|
ast_parent: null |
|
}; |
|
|
|
state.register_lexer = function(lexer) { |
|
this.lexer = lexer; |
|
return this; |
|
}; |
|
|
|
state.parse = function(input_text) { |
|
// speed up: does the new run actually change anything? |
|
if (!this.lexer.set_input(input_text)) { |
|
return this.ast; |
|
} |
|
|
|
// (re)parse input |
|
|
|
this.ast = []; |
|
this.ast_parent = []; |
|
|
|
// match the root grammar rule: |
|
// input := blurb* |
|
try { |
|
while (!this.lexer.eof()) { |
|
this.match_blurb(); |
|
} |
|
} catch(e) { |
|
this.push_AST_error_node(this.new_error_node(e), 0); |
|
} |
|
return this.ast; |
|
}; |
|
|
|
// match a single 'blurb': |
|
state.match_blurb = function() { |
|
/* |
|
A 'blurb' consists of a series of 'chunks', where the first chunk MUST be a 'headline': |
|
|
|
blurb := headline chunk* |
|
*/ |
|
var s = this.clone_state('blurb'); |
|
this.match_token('blurb'); |
|
this.enter_AST_node(); |
|
|
|
// match one headline |
|
if (this.match_headline()) { |
|
// followed by an arbitrary number of chunks: |
|
while (this.match_chunk()) |
|
; |
|
this.leave_AST_node(); |
|
} else { |
|
throw { state: s, msg: "blurb doesn't start with a headline" }; |
|
} |
|
}; |
|
|
|
// match a single 'headline': |
|
state.match_headline = function() { |
|
/* |
|
A 'headline' consists of a series of words and inline-markup: |
|
|
|
headline := (word | inline_markup)+ |
|
*/ |
|
var s = this.clone_state('headline'); |
|
this.match_token('headline'); |
|
this.enter_AST_node(); |
|
|
|
// followed by an arbitrary number of words and inline-markup: |
|
var info = { |
|
word_count: 0 |
|
}; |
|
for (;;) { |
|
if (!this.match_inline_markup(info, true)) { |
|
break; |
|
} |
|
} |
|
if (info.word_count < 1) { |
|
throw { state: s, msg: "headline must contain at least one word as headline text" }; |
|
} |
|
}; |
|
|
|
// match a inline-markup sequence from start to end: |
|
state.match_inline_markup = function(info, test) { |
|
/* |
|
'inline-markup' consists of a '*' or '_' to indicate bold/italics, followed by one of more words and terminated by the same '*' or '_' marker. |
|
|
|
inline-markup := ('*' | '_') (word | inline_markup)+ ('*' | '_') |
|
| word* |
|
|
|
where the alternative grammar rule indicates that, when no inline-markup tokens were found, we expect a simple series of words and punctuation. |
|
(Yes, punctuation is considered to be a special type of 'word'.) |
|
*/ |
|
var s = this.clone_state('inline-markup'); |
|
var m = this.match_markup_start_token(); |
|
if (m) { |
|
this.enter_AST_node(); |
|
this.match_inline_markup(info, test); |
|
if (info.word_count < 1) { |
|
throw { state: s, msg: "headline must contain at least one word as headline text" }; |
|
} |
|
this.match_markup_end_token(m); |
|
this.leave_AST_node(); |
|
} else { |
|
this.match_words(info, test); |
|
} |
|
}; |
|
|
|
state.match_markup_start_token = function() { |
|
return false; |
|
}; |
|
|
|
state.match_markup_end_token = function(start_token) { |
|
}; |
|
|
|
state.match_words = function(info, test) { |
|
var t = this.lexer.lookahead(1); |
|
while (t.token_id >= 32) { |
|
this.lexer.consume(1); |
|
this.push_AST_node(t); |
|
t = this.lexer.lookahead(1); |
|
} |
|
}; |
|
|
|
state.match_token = function(required_token_id) { |
|
// tell the lexer that the token in the stream must be a keyword: |
|
//this.lexer.expect_keyword(true); |
|
|
|
var t = this.lexer.lookahead(1); |
|
if (t.token_id != 1 /* '[' */ ) { |
|
throw { state: this.state, msg: "unexpected word '" + t.token + "' where we expected the special keyword marker '['" }; |
|
} |
|
t = this.lexer.lookahead(2); |
|
if (t.token != required_token_id) { |
|
throw { state: this.state, msg: "unexpected token '" + t.token + "' where we expected the token '" + required_token_id + "'." }; |
|
} |
|
this.lexer.consume(2); |
|
//this.lexer.expect_keyword(false); |
|
return this.push_AST_node(t); |
|
}; |
|
|
|
state.push_AST_node = function(node) { |
|
this.ast.push(node); |
|
return this; |
|
}; |
|
|
|
state.enter_AST_node = function() { |
|
assert(this.ast.length > 0); |
|
var node = this.ast[this.ast.length - 1]; |
|
node.ast = []; |
|
this.ast_parent.push({ ast: this.ast, node: node }); |
|
this.ast = node.ast; |
|
}; |
|
|
|
state.leave_AST_node = function() { |
|
assert(this.ast_parent.length > 0); |
|
var p = this.ast_parent.pop(); |
|
this.ast = p.ast; |
|
}; |
|
|
|
state.clone_state = function(id) { |
|
return { |
|
id: id, |
|
ast: this.ast, |
|
ast_parent: this.ast_parent, |
|
lexer_state: this.lexer.clone_state(id) |
|
}; |
|
}; |
|
|
|
state.push_AST_error_node = function(err_node, parent_depth) { |
|
while (this.ast_parent.length > parent_depth) { |
|
this.push_AST_node(err_node); |
|
this.leave_AST_node(); |
|
parent_depth--; |
|
} |
|
this.push_AST_node(err_node); |
|
}; |
|
|
|
state.new_error_node = function(e) { |
|
return { |
|
error: true, |
|
value: e |
|
}; |
|
}; |
|
|
|
return state; |
|
} |
|
|
|
/* |
|
compiler 'class' |
|
|
|
combines the lexer and parser to produce an AST (abstract syntax tree). |
|
The compiler also includes a 'backend' a.k.a. 'code generator', producing |
|
augmented HTML from the scanned input. |
|
*/ |
|
function compiler() { |
|
var state = { |
|
state: 0, |
|
lexer: null, |
|
parser: null, |
|
ast: [] |
|
}; |
|
|
|
state.compile = function(input_text) { |
|
// create a lexer and/or parser instance when these do not exist yet: |
|
if (!this.parser) { |
|
this.parser = parser(); |
|
} |
|
if (!this.lexer) { |
|
this.lexer = lexer(); |
|
this.parser.register_lexer(this.lexer); |
|
} |
|
var ast = this.parser.parse(input_text); |
|
return ast; |
|
}; |
|
|
|
state.run = function(input_text) { |
|
return this.compile(input_text || ""); |
|
}; |
|
|
|
return state; |
|
} |
|
|
|
|
|
|
|
function show_ast(ast) { |
|
} |
|
|
|
|
|
</script> |
|
</body> |
|
</html> |