@kergoth
Created May 10, 2011 00:38
Experiments with splitting the lexer out of the parser for BitBake's file format

TODO

  • Verify that IndentTokenizer works with a non-LINE-based tokenizer specification, rather than the default
  • Resurrect the 'NEWLINE' token, as we need it to be as picky about the file format as the current parser is
  • Implement a parser which leverages this to properly change the lexer states for both ordinary functions and "def" syntax functions (a rough sketch of the state-switching idea follows this list). Determine whether that parser should be hand-written, PLY, codetalker, or something else, by first checking whether those libraries can drive a lexer like ours
  • Do performance testing comparing the new parser against the old, and against the pyparsing implementation
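
As a rough sketch of the parser idea in the third item above (hypothetical, not part of the gist code below): a parser driving this lexer could push a raw-line state when it enters a brace-delimited function body and pop it at the closing brace. The BODY_SPEC specification and the parse() helper are illustrative assumptions, written against the Tokenizer defined at the end of this gist.

# Hypothetical spec for capturing function bodies verbatim: inside a body we
# only care about raw lines, not assignments or keywords.
BODY_SPEC = [
    ('NEWLINE', r'\n'),
    ('BODYLINE', r'.*(?=\n)'),
]

def parse(tokenizer):
    """Collect top-level tokens, capturing brace-delimited bodies as raw lines."""
    statements = []
    for token in tokenizer:  # Python 2 iteration; Tokenizer implements next()
        if token.typ == 'LBRACE':
            # Entering a function body: switch the lexer to the raw-line state
            # until the closing brace is seen.
            tokenizer.push_state(BODY_SPEC)
            body = []
            for bodytoken in tokenizer:
                if bodytoken.value.strip() == '}':
                    tokenizer.pop_state()
                    break
                body.append(bodytoken.value)
            statements.append(('function_body', body))
        else:
            statements.append((token.typ, token.value))
    return statements

Such a parse() would be fed a Tokenizer built with the BitBake token specification used by get_tokens below; handling "def" syntax functions would instead hand off to something like the IndentTokenizer-based parse_function that follows.
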
import lexer
import re


class IndentTokenizer(lexer.Tokenizer):
    """Tokenizer which tracks indentation, for parsing python-like strings"""

    # _white captures the leading whitespace of a non-blank line; _line grabs
    # everything up to (but not including) the newline.
    _white = r'(\s+)(?=\S)'
    _line = r'.*(?=\n)'
    _spec = [
        ('NEWLINE', r'\n'),
        ('LINE', _line),
    ]

    def __init__(self, string, tok_spec=None):
        self.current_indent = 0
        self.indents = []
        self.pending = []
        self.after_newline = True
        if tok_spec is None:
            tok_spec = self._spec
        lexer.Tokenizer.__init__(self, string, tok_spec)

    def next_token(self):
        # Emit any queued INDENT/DEDENT tokens before lexing further.
        if self.pending:
            return self.pending.pop(0)

        token = lexer.Tokenizer.next_token(self)
        if not token and self.indents:
            # End of input: close any indentation levels still open.
            return lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)
        return token
    def process_match(self, match, old_position):
        typ = match.lastgroup
        if typ == 'NEWLINE':
            self.after_newline = True
        else:
            if self.after_newline:
                # First match on a new line: emit any INDENT/DEDENT tokens
                # first, and rewind so this match is re-lexed afterwards.
                self.after_newline = False
                obj = self.handle_indents(old_position)
                if obj:
                    self.position = old_position
                    return obj
        return lexer.Tokenizer.process_match(self, match, old_position)
    def handle_indents(self, position):
        indents = list(self.process_indent(position))
        if indents:
            self.pending.extend(indents[1:])
            return indents[0]

    def process_indent(self, position):
        """Yield INDENT/DEDENT tokens for the line starting at position."""
        line = re.compile(self._line).match(self.string, position)
        if line:
            line_value = line.group()
            if line_value.strip():
                matched = re.match(self._white, line_value)
                if matched:
                    indent = matched.group(1)
                    if self.indents and indent == self.indents[-1]:
                        # Same indentation as the current level: nothing to emit.
                        pass
                    elif len(self.indents) > 1 and indent == self.indents[-2]:
                        # dedent
                        yield lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)
                    else:
                        self.indents.append(indent)
                        yield lexer.Token('INDENT', indent, self.line - 1, 0)
                elif self.indents:
                    # Unindented, non-blank line: close all open indentation levels.
                    while self.indents:
                        yield lexer.Token('DEDENT', self.indents.pop(), self.line - 1, 0)

def parse_function(string, position):
    """Gather the indented body of a function starting at position.

    Returns the body lines and the position just past the function body.
    """
    body = []
    indent = 0
    tokenizer = IndentTokenizer(string)
    tokenizer.position = position
    for token in tokenizer:
        if token.typ == 'INDENT':
            indent += 1
        elif token.typ == 'DEDENT':
            indent -= 1
            if not indent:
                break
        else:
            body.append(token.value)
    return [line + '\n' for line in body], tokenizer.position

if __name__ == '__main__':
    teststring = """
def foo(value):
    print(value * 5)
    # foo
    def anotherfunc(anothervalue):
        print('hi, mom!')
        return anothervalue + 6
    return anotherfunc(12)
print("hi, mom!")
"""
    body, position = parse_function(teststring, 0)
    print(''.join(body))
    assert teststring[position:] == 'print("hi, mom!")\n'
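
The IndentTokenizer and parse_function above build on the generic lexer module below; the "import lexer" at the top of the experiment refers to it.
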
import collections
import re
import sys


Token = collections.namedtuple('Token', 'typ value line column')


class Tokenizer(object):
    """Simple regular expression based tokenizer with support for lexing states.

    Based on http://docs.python.org/dev/library/re.html#writing-a-tokenizer.
    """
    def __init__(self, string, tok_spec=None, keywords=None):
        self.states = []
        self.string = string
        self.line = 1
        self.position = self.line_start = 0
        self.newline_tokens = False
        self.length = len(string)
        if keywords is None:
            keywords = []
        self.keywords = keywords
        if tok_spec is not None:
            self.push_state(tok_spec)

    def push_state(self, tok_spec):
        # Spec entries whose token name starts with '_' are treated as private
        # and excluded from the compiled pattern.
        spec = [elem for elem in tok_spec if not elem[0].startswith('_')]
        match = self.spec_match(spec)
        self.states.append((tok_spec, match))

    def pop_state(self):
        self.states.pop()

    def spec_match(self, tok_spec):
        # Combine the individual token patterns into one alternation of named
        # groups, returning a callable that matches at the current position.
        tok_re = '|'.join('(?P<%s>%s)' % pair for pair in tok_spec)
        match = re.compile(tok_re).match
        return lambda: match(self.string, self.position)
    def next(self):
        # Python 2 iterator protocol.
        token_obj = self.next_token()
        if token_obj:
            return token_obj

        if self.position != self.length:
            raise RuntimeError('Unexpected character %r on line %d' %
                               (self.string[self.position], self.line))
        else:
            raise StopIteration()

    def next_token(self):
        specification, match = self.states[-1]
        for matched in iter(match, None):
            old_position = self.position
            self.position = matched.end()
            token = self.process_match(matched, old_position)
            if token:
                if token.typ == 'IDENTIFIER' and token.value in self.keywords:
                    return token._replace(typ='KEYWORD')
                return token
    def process_match(self, match, old_position):
        typ = match.lastgroup
        if typ == 'NEWLINE':
            # Track line numbers and column offsets; newlines themselves are
            # not emitted as tokens.
            self.line_start = old_position
            self.line += 1
        elif typ != 'SKIP':
            return self.generate_token(match)

    def generate_token(self, match):
        typ = match.lastgroup
        value = match.group(typ)
        return Token(typ, value, self.line, match.start() - self.line_start)

    def __iter__(self):
        return self

def get_tokens(string):
    specification = [
        ('OPERATOR', r'(=[+.]|[+.:?]=|=)'),
        ('LPAREN', r'\('),
        ('RPAREN', r'\)'),
        ('LBRACE', r'{'),
        ('RBRACE', r'}'),
        ('COLON', r':'),
        ('IDENTIFIER', r'[a-zA-Z0-9+-_.${}/]+'),
        ('COMMENT', r'#.*(?=\n)'),
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]'),
    ]
    line_spec = [
        ('NEWLINE', r'\n'),
        ('SKIP', r'[ \t]+'),
        ('COMMENT', r'#.*(?=\n)'),
        ('LINE', r'.*(?=\n)'),
    ]
    tokenizer = Tokenizer(string, specification,
                          keywords=['inherit', 'include', 'require', 'addtask',
                                    'export', 'before', 'after', 'python',
                                    'EXPORT_FUNCTIONS'])
    for token in tokenizer:
        if token.typ == 'OPERATOR':
            # After an assignment operator, switch to line_spec so the rest of
            # the line is captured as a single VALUE token.
            yield token
            tokenizer.push_state(line_spec)
        elif token.typ == 'LINE':
            tokenizer.pop_state()
            yield token._replace(typ='VALUE')
        else:
            yield token

def main():
    teststring = """
inherit foo
include bar
require foo/bar.conf
FOO = "bar"
BAR := "foo bar"
ALPHA += "beta"
BETA .= 'theta'
OMEGA =. omega
# something commented
TEST =+ "meh"
python () {
    alpha
    beta
    theta
}
shellfunc () {
    echo foo
}
EXPORT_FUNCTIONS myfunc anotherfunc
EXPORT_FUNCTIONS myfunc
addtask some_task before this after that
addtask some_task before this
addtask some_task after that
addtask some_task
def get_something_or_other(d):
    def something_else():
        return 5
    return something_else() * 3
"""
    for token in get_tokens(teststring):
        print(token)


if __name__ == '__main__':
    result = main()
    if not result:
        sys.exit(1)
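
For reference, a small hypothetical check (not part of the gist) of the state switching in get_tokens, run under Python 2 since Tokenizer implements the Python 2 iterator protocol: the OPERATOR token pushes line_spec, so the whole right-hand side of an assignment comes back as a single VALUE token.

# Hypothetical usage sketch, Python 2.
for token in get_tokens('FOO = "bar"\n'):
    print(token)

# Expected output (SKIP and NEWLINE matches are consumed, never yielded):
#   Token(typ='IDENTIFIER', value='FOO', line=1, column=0)
#   Token(typ='OPERATOR', value='=', line=1, column=4)
#   Token(typ='VALUE', value='"bar"', line=1, column=6)
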
@esben

esben commented May 10, 2011

I have written a BitBake parser with PLY (ply.lex + ply.yacc) and achieved performance comparable to the current BitBake parser, and I'm not done optimizing yet.
I would very much like to share this work with you.

@kergoth

kergoth commented May 10, 2011 via email
