@kergoth
Created May 10, 2011 00:38
Experiments with splitting the lexer out of the parser for BitBake's file format

TODO

  • Verify that IndentTokenizer works with a non-LINE-based tokenizer specification, rather than the default
  • Resurrect the 'NEWLINE' token, as we need it to be as picky about the file format as the current parser is
  • Implement a parser which leverages this to properly change the lexer states for both ordinary functions and "def" syntax functions (a rough sketch of the state-switching idea follows this list). Determine whether that parser should be hand-written, PLY, codetalker, or something else, by first checking whether those libraries can drive a lexer like ours
  • Do performance testing comparing the new parser against the old, and against the pyparsing implementation
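
As a rough sketch of the parser idea in the third item above (hypothetical, not part of the gist code below): a parser driving this lexer could push a raw-line state when it enters a brace-delimited function body and pop it at the closing brace. The BODY_SPEC specification and the parse() helper are illustrative assumptions, written against the Tokenizer defined at the end of this gist.

# Hypothetical spec for capturing function bodies verbatim: inside a body we
# only care about raw lines, not assignments or keywords.
BODY_SPEC = [
    ('NEWLINE', r'\n'),
    ('BODYLINE', r'.*(?=\n)'),
]

def parse(tokenizer):
    """Collect top-level tokens, capturing brace-delimited bodies as raw lines."""
    statements = []
    for token in tokenizer:  # Python 2 iteration; Tokenizer implements next()
        if token.typ == 'LBRACE':
            # Entering a function body: switch the lexer to the raw-line state
            # until the closing brace is seen.
            tokenizer.push_state(BODY_SPEC)
            body = []
            for bodytoken in tokenizer:
                if bodytoken.value.strip() == '}':
                    tokenizer.pop_state()
                    break
                body.append(bodytoken.value)
            statements.append(('function_body', body))
        else:
            statements.append((token.typ, token.value))
    return statements

Such a parse() would be fed a Tokenizer built with the BitBake token specification used by get_tokens below; handling "def" syntax functions would instead hand off to something like the IndentTokenizer-based parse_function that follows.
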
import lexer
import re


class IndentTokenizer(lexer.Tokenizer):
    """Tokenizer which tracks indentation, for parsing python-like strings"""

    # _white captures the leading whitespace of a non-blank line; _line grabs
    # everything up to (but not including) the newline.
    _white = r'(\s+)(?=\S)'
    _line = r'.*(?=\n)'
    _spec = [
        ('NEWLINE', r'\n'),
        ('LINE', _line),
    ]

    def __init__(self, string, tok_spec=None):
        self.current_indent = 0
        self.indents = []
        self.pending = []
        self.after_newline = True
        if tok_spec is None:
            tok_spec = self._spec
        lexer.Tokenizer.__init__(self, string, tok_spec)

    def next_token(self):
        # Emit any queued INDENT/DEDENT tokens before lexing further.
        if self.pending:
            return self.pending.pop(0)

        token = lexer.Tokenizer.next_token(self)
        if not token and self.indents:
            # End of input: close any indentation levels still open.
            return lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)
        return token
    def process_match(self, match, old_position):
        typ = match.lastgroup
        if typ == 'NEWLINE':
            self.after_newline = True
        else:
            if self.after_newline:
                # First match on a new line: emit any INDENT/DEDENT tokens
                # first, and rewind so this match is re-lexed afterwards.
                self.after_newline = False
                obj = self.handle_indents(old_position)
                if obj:
                    self.position = old_position
                    return obj
        return lexer.Tokenizer.process_match(self, match, old_position)
    def handle_indents(self, position):
        indents = list(self.process_indent(position))
        if indents:
            self.pending.extend(indents[1:])
            return indents[0]

    def process_indent(self, position):
        """Yield INDENT/DEDENT tokens for the line starting at position."""
        line = re.compile(self._line).match(self.string, position)
        if line:
            line_value = line.group()
            if line_value.strip():
                matched = re.match(self._white, line_value)
                if matched:
                    indent = matched.group(1)
                    if self.indents and indent == self.indents[-1]:
                        # Same indentation as the current level: nothing to emit.
                        pass
                    elif len(self.indents) > 1 and indent == self.indents[-2]:
                        # dedent
                        yield lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)
                    else:
                        self.indents.append(indent)
                        yield lexer.Token('INDENT', indent, self.line - 1, 0)
                elif self.indents:
                    # Unindented, non-blank line: close all open indentation levels.
                    while self.indents:
                        yield lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)

def parse_function(string, position):
    """Gather the indented body of a function starting at position.

    Returns the body lines and the position just past the function body.
    """
    body = []
    indent = 0
    tokenizer = IndentTokenizer(string)
    tokenizer.position = position
    for token in tokenizer:
        if token.typ == 'INDENT':
            indent += 1
        elif token.typ == 'DEDENT':
            indent -= 1
            if not indent:
                break
        else:
            body.append(token.value)
    return [line + '\n' for line in body], tokenizer.position

if __name__ == '__main__':
    teststring = """
def foo(value):
    print(value * 5)
    # foo
    def anotherfunc(anothervalue):
        print('hi, mom!')
        return anothervalue + 6
    return anotherfunc(12)
print("hi, mom!")
"""
    body, position = parse_function(teststring, 0)
    print(''.join(body))
    assert teststring[position:] == 'print("hi, mom!")\n'
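
The IndentTokenizer and parse_function above build on the generic lexer module below; the "import lexer" at the top of the experiment refers to it.
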
import collections
import re
import sys


Token = collections.namedtuple('Token', 'typ value line column')


class Tokenizer(object):
    """Simple regular expression based tokenizer with support for lexing states.

    Based on http://docs.python.org/dev/library/re.html#writing-a-tokenizer.
    """
    def __init__(self, string, tok_spec=None, keywords=None):
        self.states = []
        self.string = string
        self.line = 1
        self.position = self.line_start = 0
        self.newline_tokens = False
        self.length = len(string)
        if keywords is None:
            keywords = []
        self.keywords = keywords
        if tok_spec is not None:
            self.push_state(tok_spec)

    def push_state(self, tok_spec):
        # Spec entries whose token name starts with '_' are treated as private
        # and excluded from the compiled pattern.
        spec = [elem for elem in tok_spec if not elem[0].startswith('_')]
        match = self.spec_match(spec)
        self.states.append((tok_spec, match))

    def pop_state(self):
        self.states.pop()

    def spec_match(self, tok_spec):
        # Combine the individual token patterns into one alternation of named
        # groups, returning a callable that matches at the current position.
        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
        match = re.compile(tok_re).match
        return lambda: match(self.string, self.position)
    def next(self):
        # Python 2 iterator protocol.
        token_obj = self.next_token()
        if token_obj:
            return token_obj

        if self.position != self.length:
            raise RuntimeError('Unexpected character %r on line %d' %
                               (self.string[self.position], self.line))
        else:
            raise StopIteration()

    def next_token(self):
        specification, match = self.states[-1]
        for matched in iter(match, None):
            old_position = self.position
            self.position = matched.end()
            token = self.process_match(matched, old_position)
            if token:
                if token.typ == 'IDENTIFIER' and token.value in self.keywords:
                    return token._replace(typ='KEYWORD')
                return token
    def process_match(self, match, old_position):
        typ = match.lastgroup
        if typ == 'NEWLINE':
            # Track line numbers and column offsets; newlines themselves are
            # not emitted as tokens.
            self.line_start = old_position
            self.line += 1
        elif typ != 'SKIP':
            return self.generate_token(match)

    def generate_token(self, match):
        typ = match.lastgroup
        value = match.group(typ)
        return Token(typ, value, self.line, match.start() - self.line_start)

    def __iter__(self):
        return self

def get_tokens(string):
    specification = [
        ('OPERATOR', r'(=[+.]|[+.:?]=|=)'),
        ('LPAREN', r'\('),
        ('RPAREN', r'\)'),
        ('LBRACE', r'{'),
        ('RBRACE', r'}'),
        ('COLON', r':'),
        ('IDENTIFIER', r'[a-zA-Z0-9+-_.${}/]+'),
        ('COMMENT', r'#.*(?=\n)'),
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]'),
    ]
    line_spec = [
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]+'),
        ('COMMENT', r'#.*(?=\n)'),
        ('LINE', r'.*(?=\n)'),
    ]
    tokenizer = Tokenizer(string, specification,
                          keywords=['inherit', 'include', 'require', 'addtask',
                                    'export', 'before', 'after', 'python',
                                    'EXPORT_FUNCTIONS'])
    for token in tokenizer:
        if token.typ == 'OPERATOR':
            # After an assignment operator, switch to line_spec so the rest of
            # the line is captured as a single VALUE token.
            yield token
            tokenizer.push_state(line_spec)
        elif token.typ == 'LINE':
            tokenizer.pop_state()
            yield token._replace(typ='VALUE')
        else:
            yield token

def main():
    teststring = """
inherit foo
include bar
require foo/bar.conf
FOO = "bar"
BAR := "foo bar"
ALPHA += "beta"
BETA .= 'theta'
OMEGA =. omega
# something commented
TEST =+ "meh"
python () {
    alpha
    beta
    theta
}
shellfunc () {
    echo foo
}
EXPORT_FUNCTIONS myfunc anotherfunc
EXPORT_FUNCTIONS myfunc
addtask some_task before this after that
addtask some_task before this
addtask some_task after that
addtask some_task
def get_something_or_other(d):
    def something_else():
        return 5
    return something_else() * 3
"""
    for token in get_tokens(teststring):
        print(token)


if __name__ == '__main__':
    result = main()
    if not result:
        sys.exit(1)
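
For reference, a small hypothetical check (not part of the gist) of the state switching in get_tokens, run under Python 2 since Tokenizer implements the Python 2 iterator protocol: the OPERATOR token pushes line_spec, so the whole right-hand side of an assignment comes back as a single VALUE token.

# Hypothetical usage sketch, Python 2.
for token in get_tokens('FOO = "bar"\n'):
    print(token)

# Expected output (SKIP and NEWLINE matches are consumed, never yielded):
#   Token(typ='IDENTIFIER', value='FOO', line=1, column=0)
#   Token(typ='OPERATOR', value='=', line=1, column=4)
#   Token(typ='VALUE', value='"bar"', line=1, column=6)
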
@esben

esben commented May 10, 2011

I have written a BitBake parser with PLY (ply.lex + ply.yacc) and achieved performance comparable to the current BitBake parser, and I'm not done optimizing yet.
I would very much like to share this work with you.

@kergoth

kergoth commented May 10, 2011 via email
