# -*- coding: utf-8 -*-
'''Python lexer with state stack.'''

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import re
import unittest
from collections import namedtuple


class TokenError(Exception):
    pass


class Token(object):
    __slots__ = ['pattern', 'transition']

    __unnamed_count = 0

    def __init__(self, pattern, name=None, push=None, pop=False):
        '''
        A single pattern with an associated name and state action (see the
        example sketch after this class).

        If name is None, matches are not emitted by the lexer.
        If push is given, that state is pushed onto the lexer's state stack
        after matching.
        If pop is True, the current state is popped after matching.
        '''
        if push and pop:
            raise TokenError('can only push or pop, not both')
        if not name:
            # generate a private group name; the lexer suppresses these
            name = '__%d__' % Token.__unnamed_count
            Token.__unnamed_count += 1
        self.pattern = r'(?P<%s>%s)' % (name, pattern)
        if push:
            self.transition = name, push
        elif pop:
            self.transition = name, None
        else:
            self.transition = None
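

# A hypothetical token set illustrating the three kinds of Token above
# (the patterns and names here are illustrative, not part of the gist):
#
#   Token(r'"', name='quote', push='string')  # emit 'quote', then push 'string'
#   Token(r'"', name='quote', pop=True)       # emit 'quote', then pop the state
#   Token(r'\s+')                             # unnamed: matched but never emitted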


class Lexer(object):
    INITIAL_STATE = 'initial'

    State = namedtuple('State', ['name', 'pattern', 'transitions'])

    def __init__(self, tokens):
        '''
        Initialise the lexer with tokens (see the usage sketch after this
        class).

        Tokens can be either a list/tuple of Token objects or a dict mapping
        state name to a list/tuple of Token objects.
        '''
        if not isinstance(tokens, dict):
            tokens = {
                self.INITIAL_STATE: tokens,
            }
        flags = re.MULTILINE | re.DOTALL
        self.states = {}
        for name, state_tokens in tokens.items():
            pop_disallowed = name == self.INITIAL_STATE
            for token in state_tokens:
                if token.transition:
                    if token.transition[1]:
                        if token.transition[1] not in tokens:
                            raise TokenError('invalid push state: %s' % token.transition[1])
                    elif pop_disallowed:
                        raise TokenError('invalid state to pop from: %s' % self.INITIAL_STATE)
            self.states[name] = self.State(
                name=name,
                # the trailing '|.' catches otherwise-unmatched characters so
                # they surface as error matches with no group name
                pattern=re.compile('|'.join(t.pattern for t in state_tokens) + '|.', flags),
                transitions=dict(t.transition for t in state_tokens if t.transition),
            )
        self.state = [self.states[self.INITIAL_STATE]]

    def __call__(self, data):
        pos = 0
        while True:
            state = self.state[0]
            for match in state.pattern.finditer(data, pos):
                name = match.lastgroup
                if not name or not name.startswith('__'):
                    # yield named tokens and error tokens
                    yield match
                if not name:
                    continue
                # transition state (if any)
                try:
                    transition = state.transitions[name]
                except KeyError:
                    # no transition, continue with current state
                    continue
                if transition:
                    # push state
                    self.state.insert(0, self.states[transition])
                else:
                    # pop state
                    self.state.pop(0)
                # restart matching from the new state at the end of this match
                pos = match.end()
                break
            else:
                # no more matches found
                break
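

# A minimal usage sketch (an addition for illustration, not part of the
# original gist): tokenise integers and double-quoted strings using a pushed
# 'string' state. All token and state names below are made up for the example.
def _demo():
    lexer = Lexer({
        Lexer.INITIAL_STATE: [
            Token(r'"', name='quote', push='string'),
            Token(r'\d+', name='int'),
            Token(r'\s+'),  # whitespace: matched but not emitted
        ],
        'string': [
            Token(r'"', name='quote', pop=True),
            Token(r'[^"]+', name='chars'),
        ],
    })
    # expected token sequence: int, quote, chars, quote, int
    for match in lexer('12 "ab" 3'):
        print(match.lastgroup, repr(match.group()))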


class LexerTestCase(unittest.TestCase):
    def test_tokens(self):
        lexer = Lexer([
            Token(r'a', name='a'),
            Token(r'b'),
        ])
        data = 'abcba'
        tokens = [(m.lastgroup, m.group()) for m in lexer(data)]
        self.assertEqual(tokens, [
            ('a', 'a'),
            (None, 'c'),
            ('a', 'a'),
        ])

    def test_states(self):
        lexer = Lexer({
            'initial': [
                Token(r'a', push='a'),
                Token(r'b', name='b1'),
            ],
            'a': [
                Token(r'a', pop=True),
                Token(r'b', name='b2'),
            ],
        })
        data = 'babab'
        tokens = [(m.lastgroup, m.group()) for m in lexer(data)]
        self.assertEqual(tokens, [
            ('b1', 'b'),
            ('b2', 'b'),
            ('b1', 'b'),
        ])