Skip to content

Instantly share code, notes, and snippets.

@cellularmitosis
Last active June 23, 2020 21:57
Show Gist options
  • Save cellularmitosis/f74711032056e07d2e9baf1fbbc8faf2 to your computer and use it in GitHub Desktop.
Save cellularmitosis/f74711032056e07d2e9baf1fbbc8faf2 to your computer and use it in GitHub Desktop.
Parser generator insights from manually writing parsing functions

Blog 2020/6/21

<- previous | index | next ->

Parser generator insights from manually writing parsing functions

To write a parser generator, we need to have a clear picture of what the output should be. We can start to do that by first manually writing some recursive descent parsing functions.

Terminals

Parsing a single terminal:

# grammar:
# foo = BAZ
def parse_foo(tokens, index):
    failure = (None, index)
    token = tokens[index]
    if not is_BAZ(token):
        return failure
    subnodes = [token]
    node = {
        "type": "nonterminal",
        "nonterminal_type": "foo",
        "subnodes": subnodes
    }
    return (node, index+1)

Here, we assume the existence of an is_BAZ() function to detect a token of type BAZ.

Non-terminals

Parsing a non-terminal:

# grammar:
# foo = bar
# bar = QUX
def parse_foo(tokens, index):
    failure = (None, index)
    token = tokens[index]
    (node, index2) = parse_bar(tokens, index)
    if node is None:
        return failure
    subnodes = [node]
    node = {
        "type": "nonterminal",
        "nonterminal_type": "foo",
        "subnodes": subnodes
    }
    return (node, index2)

def parse_bar(tokens, index):
    failure = (None, index)
    token = tokens[index]
    if not is_QUX(token):
        return failure
    subnodes = [token]
    node = {
        "type": "nonterminal",
        "nonterminal_type": "bar",
        "subnodes": subnodes
    }
    return (node, index+1)

Sequence

Parsing a sequence of two terminals:

# grammar:
# foo = BAZ QUX
def parse_foo(tokens, index):
    failure = (None, index)
    token = tokens[index]
    subnodes = []

    if not is_BAZ(token):
        return failure
    subnodes.append(token)
    index += 1
    token = tokens[index]

    if not is_QUX(token):
        return failure
    subnodes.append(token)
    index += 1

    node = {
        "type": "nonterminal",
        "nonterminal_type": "foo",
        "subnodes": subnodes
    }
    return (node, index)

Choice / Alternation

Parsing a choice of two terminals:

# grammar:
# foo = BAZ | QUX
def parse_foo(tokens, index):
    failure = (None, index)
    token = tokens[index]
    subnodes = []
    while True:

        if is_BAZ(token):
            break

        if is_QUX(token):
            break

        return failure
    subnodes.append(token)
    index += 1
    node = {
        "type": "nonterminal",
        "nonterminal_type": "foo",
        "subnodes": subnodes
    }
    return (node, index)

Here, we are sort-of abusing the while statement, as we have no intention of looping, and are effectively just using break as a form of goto. But, it is effective, and will keep our code generation straight-forward.

Optionality (zero or one)

Parsing an optional terminal, followed by a non-optional terminal:

# grammar:
# foo = BAZ? QUX
def parse_foo(tokens, index):
    failure = (None, index)
    token = tokens[index]
    subnodes = []

    if is_BAZ(token):
        subnodes.append(token)
        index += 1
        token = tokens[index]

    if not is_QUX(token):
        return failure
    subnodes.append(token)
    index += 1

    node = {
        "type": "nonterminal",
        "nonterminal_type": "foo",
        "subnodes": subnodes
    }
    return (node, index)

Repetition (zero or more)

Parsing zero more instances of a terminal, followed by a non-optional terminal:

# grammar:
# foo = BAZ* QUX
def parse_foo(tokens, index):
    failure = (None, index)
    token = tokens[index]
    subnodes = []

    while True:
        if is_BAZ(token):
            subnodes.append(token)
            index += 1
            token = tokens[index]
            continue
        else:
            break

    if not is_QUX(token):
        return failure
    subnodes.append(token)
    index += 1

    node = {
        "type": "nonterminal",
        "nonterminal_type": "foo",
        "subnodes": subnodes
    }
    return (node, index)

Repetition (one or more)

Parsing one or more of a terminal, followed by a non-optional terminal:

# grammar:
# foo = BAZ+ QUX
def parse_foo(tokens, index):
    failure = (None, index)
    token = tokens[index]
    subnodes = []

    if not is_BAZ(token):
        return failure
    subnodes.append(token)
    index += 1

    while True:
        if is_BAZ(token):
            subnodes.append(token)
            index += 1
            token = tokens[index]
            continue
        else:
            break

    if not is_QUX(token):
        return failure
    subnodes.append(token)
    index += 1

    node = {
        "type": "nonterminal",
        "nonterminal_type": "foo",
        "subnodes": subnodes
    }
    return (node, index)

Grouping

Parsing a choice of one grouping or another grouping. We handle groupings by defining nested functions:

# grammar:
# foo = ( BAZ QUX ) | ( QUX BAZ )
def parse_foo(tokens, index):

    # ( BAZ QUX )
    def _parse_group1(tokens, index):
        failure = (None, index)
        token = tokens[index]
        subnodes = []
    
        if not is_BAZ(token):
            return failure
        subnodes.append(token)
        index += 1
        token = tokens[index]
    
        if not is_QUX(token):
            return failure
        subnodes.append(token)
        index += 1
    
        return (subnodes, index)

    # ( QUX BAZ )
    def _parse_group2(tokens, index):
        failure = (None, index)
        token = tokens[index]
        subnodes = []
    
        if not is_QUX(token):
            return failure
        subnodes.append(token)
        index += 1
        token = tokens[index]
    
        if not is_BAZ(token):
            return failure
        subnodes.append(token)
        index += 1
    
        return (subnodes, index)
    
    failure = (None, index)
    token = tokens[index]
    subnodes = []

    while True:
        (subnodes2, index2) = _parse_group1(tokens, index)
        if subnodes2 is not None:
            index = index2
            break
        
        (subnodes2, index2) = _parse_group2(tokens, index)
        if subnodes2 is not None:
            index = index2
            break
            
        return failure
    subnodes += subnodes2

    node = {
        "type": "nonterminal",
        "nonterminal_type": "foo",
        "subnodes": subnodes
    }
    return (node, index)

Nested groupings

Parsing a two-levels-deep grouping:

# grammar:
# aa = ( (BB CC) | (DD EE) ) | (FF GG)
def parse_aa(tokens, index):

    # ( (BB CC) | (DD EE) )
    def _parse_group1(tokens, index):

        # (BB CC)
        def _parse__group3(tokens, index):
            failure = (None, index)
            token = tokens[index]
            subnodes = []
        
            if not is_BB(token):
                return failure
            subnodes.append(token)
            index += 1
            token = tokens[index]
        
            if not is_CC(token):
                return failure
            subnodes.append(token)
            index += 1
        
            return (subnodes, index)

        # (DD EE)
        def _parse__group4(tokens, index):
            failure = (None, index)
            token = tokens[index]
            subnodes = []
        
            if not is_BB(token):
                return failure
            subnodes.append(token)
            index += 1
            token = tokens[index]
        
            if not is_CC(token):
                return failure
            subnodes.append(token)
            index += 1
        
            return (subnodes, index)

        failure = (None, index)
        token = tokens[index]
        subnodes = []
    
        while True:
            (subnodes2, index2) = _parse__group3(tokens, index)
            if subnodes2 is not None:
                break

            (subnodes2, index2) = _parse__group4(tokens, index)
            if subnodes2 is not None:
                break
            
            return failure
        subnodes += subnodes2
        index = index2
        return (subnodes, index)
    
    # (FF GG)
    def _parse_group2(tokens, index):
        failure = (None, index)
        token = tokens[index]
        subnodes = []
    
        if not is_FF(token):
            return failure
        subnodes.append(token)
        index += 1
        token = tokens[index]
    
        if not is_GG(token):
            return failure
        subnodes.append(token)
        index += 1
    
        return (subnodes, index)
    
    failure = (None, index)
    token = tokens[index]
    subnodes = []

    while True:
        (subnodes2, index2) = _parse_group1(tokens, index)
        if subnodes2 is not None:
            index = index2
            break
        
        (subnodes2, index2) = _parse_group2(tokens, index)
        if subnodes2 is not None:
            index = index2
            break
            
        return failure
    index = index2
    subnodes += subnodes2

    node = {
        "type": "nonterminal",
        "nonterminal_type": "aa",
        "subnodes": subnodes
    }
    return (node, index)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment