Blog 2020/6/21
<- previous | index | next ->
To write a parser generator, we need to have a clear picture of what the output should be. We can start to do that by first manually writing some recursive descent parsing functions.
Parsing a single terminal:
# grammar:
# foo = BAZ
def parse_foo(tokens, index):
failure = (None, index)
token = tokens[index]
if not is_BAZ(token):
return failure
subnodes = [token]
node = {
"type": "nonterminal",
"nonterminal_type": "foo",
"subnodes": subnodes
}
return (node, index+1)
Here, we assume the existence of an is_BAZ()
function to detect a token of type BAZ
.
Parsing a non-terminal:
# grammar:
# foo = bar
# bar = QUX
def parse_foo(tokens, index):
failure = (None, index)
token = tokens[index]
(node, index2) = parse_bar(tokens, index)
if node is None:
return failure
subnodes = [node]
node = {
"type": "nonterminal",
"nonterminal_type": "foo",
"subnodes": subnodes
}
return (node, index2)
def parse_bar(tokens, index):
failure = (None, index)
token = tokens[index]
if not is_QUX(token):
return failure
subnodes = [token]
node = {
"type": "nonterminal",
"nonterminal_type": "bar",
"subnodes": subnodes
}
return (node, index+1)
Parsing a sequence of two terminals:
# grammar:
# foo = BAZ QUX
def parse_foo(tokens, index):
failure = (None, index)
token = tokens[index]
subnodes = []
if not is_BAZ(token):
return failure
subnodes.append(token)
index += 1
token = tokens[index]
if not is_QUX(token):
return failure
subnodes.append(token)
index += 1
node = {
"type": "nonterminal",
"nonterminal_type": "foo",
"subnodes": subnodes
}
return (node, index)
Parsing a choice of two terminals:
# grammar:
# foo = BAZ | QUX
def parse_foo(tokens, index):
failure = (None, index)
token = tokens[index]
subnodes = []
while True:
if is_BAZ(token):
break
if is_QUX(token):
break
return failure
subnodes.append(token)
index += 1
node = {
"type": "nonterminal",
"nonterminal_type": "foo",
"subnodes": subnodes
}
return (node, index)
Here, we are sort-of abusing the while
statement, as we have no intention of looping, and are effectively just using break
as a form of goto
. But, it is effective, and will keep our code generation straight-forward.
Parsing an optional terminal, followed by a non-optional terminal:
# grammar:
# foo = BAZ? QUX
def parse_foo(tokens, index):
failure = (None, index)
token = tokens[index]
subnodes = []
if is_BAZ(token):
subnodes.append(token)
index += 1
token = tokens[index]
if not is_QUX(token):
return failure
subnodes.append(token)
index += 1
node = {
"type": "nonterminal",
"nonterminal_type": "foo",
"subnodes": subnodes
}
return (node, index)
Parsing zero more instances of a terminal, followed by a non-optional terminal:
# grammar:
# foo = BAZ* QUX
def parse_foo(tokens, index):
failure = (None, index)
token = tokens[index]
subnodes = []
while True:
if is_BAZ(token):
subnodes.append(token)
index += 1
token = tokens[index]
continue
else:
break
if not is_QUX(token):
return failure
subnodes.append(token)
index += 1
node = {
"type": "nonterminal",
"nonterminal_type": "foo",
"subnodes": subnodes
}
return (node, index)
Parsing one or more of a terminal, followed by a non-optional terminal:
# grammar:
# foo = BAZ+ QUX
def parse_foo(tokens, index):
failure = (None, index)
token = tokens[index]
subnodes = []
if not is_BAZ(token):
return failure
subnodes.append(token)
index += 1
while True:
if is_BAZ(token):
subnodes.append(token)
index += 1
token = tokens[index]
continue
else:
break
if not is_QUX(token):
return failure
subnodes.append(token)
index += 1
node = {
"type": "nonterminal",
"nonterminal_type": "foo",
"subnodes": subnodes
}
return (node, index)
Parsing a choice of one grouping or another grouping. We handle groupings by defining nested functions:
# grammar:
# foo = ( BAZ QUX ) | ( QUX BAZ )
def parse_foo(tokens, index):
# ( BAZ QUX )
def _parse_group1(tokens, index):
failure = (None, index)
token = tokens[index]
subnodes = []
if not is_BAZ(token):
return failure
subnodes.append(token)
index += 1
token = tokens[index]
if not is_QUX(token):
return failure
subnodes.append(token)
index += 1
return (subnodes, index)
# ( QUX BAZ )
def _parse_group2(tokens, index):
failure = (None, index)
token = tokens[index]
subnodes = []
if not is_QUX(token):
return failure
subnodes.append(token)
index += 1
token = tokens[index]
if not is_BAZ(token):
return failure
subnodes.append(token)
index += 1
return (subnodes, index)
failure = (None, index)
token = tokens[index]
subnodes = []
while True:
(subnodes2, index2) = _parse_group1(tokens, index)
if subnodes2 is not None:
index = index2
break
(subnodes2, index2) = _parse_group2(tokens, index)
if subnodes2 is not None:
index = index2
break
return failure
subnodes += subnodes2
node = {
"type": "nonterminal",
"nonterminal_type": "foo",
"subnodes": subnodes
}
return (node, index)
Parsing a two-levels-deep grouping:
# grammar:
# aa = ( (BB CC) | (DD EE) ) | (FF GG)
def parse_aa(tokens, index):
# ( (BB CC) | (DD EE) )
def _parse_group1(tokens, index):
# (BB CC)
def _parse__group3(tokens, index):
failure = (None, index)
token = tokens[index]
subnodes = []
if not is_BB(token):
return failure
subnodes.append(token)
index += 1
token = tokens[index]
if not is_CC(token):
return failure
subnodes.append(token)
index += 1
return (subnodes, index)
# (DD EE)
def _parse__group4(tokens, index):
failure = (None, index)
token = tokens[index]
subnodes = []
if not is_BB(token):
return failure
subnodes.append(token)
index += 1
token = tokens[index]
if not is_CC(token):
return failure
subnodes.append(token)
index += 1
return (subnodes, index)
failure = (None, index)
token = tokens[index]
subnodes = []
while True:
(subnodes2, index2) = _parse__group3(tokens, index)
if subnodes2 is not None:
break
(subnodes2, index2) = _parse__group4(tokens, index)
if subnodes2 is not None:
break
return failure
subnodes += subnodes2
index = index2
return (subnodes, index)
# (FF GG)
def _parse_group2(tokens, index):
failure = (None, index)
token = tokens[index]
subnodes = []
if not is_FF(token):
return failure
subnodes.append(token)
index += 1
token = tokens[index]
if not is_GG(token):
return failure
subnodes.append(token)
index += 1
return (subnodes, index)
failure = (None, index)
token = tokens[index]
subnodes = []
while True:
(subnodes2, index2) = _parse_group1(tokens, index)
if subnodes2 is not None:
index = index2
break
(subnodes2, index2) = _parse_group2(tokens, index)
if subnodes2 is not None:
index = index2
break
return failure
index = index2
subnodes += subnodes2
node = {
"type": "nonterminal",
"nonterminal_type": "aa",
"subnodes": subnodes
}
return (node, index)