Last active
August 15, 2017 12:19
-
-
Save mgd020/33d8f5d9e13f1a53ad4af4e656ff683e to your computer and use it in GitHub Desktop.
python lexer with state stack
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from __future__ import (absolute_import, division, print_function, | |
unicode_literals) | |
import re | |
import unittest | |
from collections import namedtuple | |
class TokenError(Exception): | |
pass | |
class Token(object): | |
__slots__ = ['pattern', 'transition'] | |
__unnamed_count = 0 | |
def __init__(self, pattern, name=None, push=None, pop=False): | |
''' | |
A single pattern object with associated name and action. | |
If name is None, matches are not emitted by lexer. | |
If push is given, state is pushed onto lexer state stack after | |
matching. | |
If pop is True, current state is popped after matching. | |
''' | |
if push and pop: | |
raise TokenError('can only push or pop, not both') | |
if not name: | |
name = '__%d__' % Token.__unnamed_count | |
Token.__unnamed_count += 1 | |
self.pattern = r'(?P<%s>%s)' % (name, pattern) | |
if push: | |
self.transition = name, push | |
elif pop: | |
self.transition = name, None | |
else: | |
self.transition = None | |
class Lexer(object): | |
INITIAL_STATE = 'initial' | |
State = namedtuple('State', ['name', 'pattern', 'transitions']) | |
def __init__(self, tokens): | |
''' | |
Initialise lexer with tokens. | |
Tokens can be either a list/tuple of Token objects or a dict of | |
state name to list/tuple of Token objects. | |
''' | |
if not isinstance(tokens, dict): | |
tokens = { | |
self.INITIAL_STATE: tokens, | |
} | |
flags = re.MULTILINE | re.DOTALL | |
self.states = {} | |
for name, state_tokens in tokens.iteritems(): | |
pop_disallowed = name == self.INITIAL_STATE | |
for token in state_tokens: | |
if token.transition: | |
if token.transition[1]: | |
if token.transition[1] not in tokens: | |
raise TokenError('invalid push state: %s' % token.transition[1]) | |
elif pop_disallowed: | |
raise TokenError('invalid state to pop from: %s' % self.INITIAL_STATE) | |
self.states[name] = self.State( | |
name=name, | |
pattern=re.compile('|'.join(t.pattern for t in state_tokens) + '|.', flags), | |
transitions=dict(t.transition for t in state_tokens if t.transition), | |
) | |
self.state = [self.states[self.INITIAL_STATE]] | |
def __call__(self, data): | |
pos = 0 | |
while True: | |
state = self.state[0] | |
for match in state.pattern.finditer(data, pos): | |
name = match.lastgroup | |
if not name or not name.startswith('__'): | |
# yield named tokens and error tokens | |
yield match | |
if not name: | |
continue | |
# transision state (if any) | |
try: | |
transition = state.transitions[name] | |
except KeyError: | |
# no transition, continue with current state | |
continue | |
if transition: | |
# push state | |
self.state.insert(0, self.states[transition]) | |
else: | |
# pop state | |
self.state.pop(0) | |
# update match start position for new state | |
pos = match.end() | |
break | |
else: | |
# no more matches found | |
break | |
class LexerTestCase(unittest.TestCase): | |
def test_tokens(self): | |
lexer = Lexer([ | |
Token(r'a', name='a'), | |
Token(r'b'), | |
]) | |
data = 'abcba' | |
tokens = [(m.lastgroup, m.group()) for m in lexer(data)] | |
self.assertEqual(tokens, [ | |
('a', 'a'), | |
(None, 'c'), | |
('a', 'a'), | |
]) | |
def test_states(self): | |
lexer = Lexer({ | |
'initial': [ | |
Token(r'a', push='a'), | |
Token(r'b', name='b1'), | |
], | |
'a': [ | |
Token(r'a', pop=True), | |
Token(r'b', name='b2'), | |
], | |
}) | |
data = 'babab' | |
tokens = [(m.lastgroup, m.group()) for m in lexer(data)] | |
self.assertEqual(tokens, [ | |
('b1', 'b'), | |
('b2', 'b'), | |
('b1', 'b'), | |
]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment