Skip to content

Instantly share code, notes, and snippets.

@drslump
Last active April 10, 2018 07:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save drslump/55752165288b7639b06f60888e22c759 to your computer and use it in GitHub Desktop.
Save drslump/55752165288b7639b06f60888e22c759 to your computer and use it in GitHub Desktop.
lexer
import sys, types, dis, struct
# Factories for the symbolic instructions emitted by the FSM actions.
# Each one wraps an opcode name (and its argument, when it takes one) in an
# ``Op``; the names are resolved to real opcodes later, at encoding time.


def BINARY_SUBSCR():
    return Op('BINARY_SUBSCR')


def INPLACE_ADD():
    return Op('INPLACE_ADD')


def INPLACE_SUBTRACT():
    return Op('INPLACE_SUBTRACT')


def RETURN_VALUE():
    return Op('RETURN_VALUE')


def BUILD_TUPLE(x):
    return Op('BUILD_TUPLE', x)


def COMPARE_OP(x):
    return Op('COMPARE_OP', x)


def JUMP_ABSOLUTE(x):
    return Op('JUMP_ABSOLUTE', x)


def LOAD_CONST(x):
    return Op('LOAD_CONST', x)


def LOAD_FAST(x):
    return Op('LOAD_FAST', x)


def POP_JUMP_IF_TRUE(x):
    return Op('POP_JUMP_IF_TRUE', x)


def STORE_FAST(x):
    return Op('STORE_FAST', x)


def STATE(x):
    # Pseudo instruction: marks a jump target label, emits no bytecode.
    return Op('$STATE', x)
if sys.version_info >= (3, 6):
    # Fixed-width encoding: every instruction is an (opcode, arg) byte pair,
    # wider arguments are prefixed with an EXTENDED_ARG instruction.

    def pack_opcode(opcode, arg=None):
        """Encode one instruction; arguments over 8 bits get an EXTENDED_ARG prefix."""
        if opcode < dis.HAVE_ARGUMENT:
            return struct.pack('BB', opcode, 0)
        assert arg <= 0xFFFF, 'unsupported opcode arg over 16bits'
        if arg > 0xFF:
            return struct.pack('BBBB', dis.EXTENDED_ARG, arg >> 8, opcode, arg & 0xFF)
        return struct.pack('BB', opcode, arg)

    def patch_arg(buffer, offset, arg):
        """Overwrite the argument of the instruction starting at ``offset``.

        The instruction is expected to use the 4 byte EXTENDED_ARG layout
        produced by ``pack_opcode`` for arguments over 8 bits.
        """
        assert arg <= 0xFFFF, 'unsupported opcode arg over 16bits'
        # TODO: Parse opcode and adapt extended arg if needed
        buffer[offset + 1] = arg >> 8    # argument byte of EXTENDED_ARG
        buffer[offset + 3] = arg & 0xFF  # argument byte of the real opcode
else:
    # Variable-width encoding: one opcode byte, optionally followed by a
    # little-endian 16 bit argument.

    def pack_opcode(opcode, arg=None):
        """Encode one instruction as 1 byte, or 3 bytes when it takes an argument."""
        if opcode < dis.HAVE_ARGUMENT:
            return struct.pack('B', opcode)
        assert arg <= 0xFFFF, 'unsupported opcode arg over 16bits'
        return struct.pack('<BH', opcode, arg)

    def patch_arg(buffer, offset, arg):
        """Overwrite the 16 bit argument of the instruction starting at ``offset``."""
        assert arg <= 0xFFFF, 'unsupported opcode arg over 16bits'
        low, high = struct.pack('<H', arg)
        buffer[offset + 1] = low
        buffer[offset + 2] = high
class Op(object):
    """A symbolic instruction: an opcode name plus an optional argument."""

    __slots__ = ('opcode', 'arg')

    def __init__(self, opcode, arg=None):
        self.opcode, self.arg = opcode, arg

    def __iter__(self):
        # Supports ``opcode, arg = op`` unpacking.
        yield self.opcode
        yield self.arg
class Ops(list):
    """List of ``Op`` items that can be grown with the ``<<`` operator."""

    def __lshift__(self, other):
        """Append ``other`` to this list and return the list itself.

        ``other`` may be a tuple/list of items (extended as-is), a single
        ``Op`` (appended) or an ``Abstract`` action (its ``opcodes()`` are
        appended). Returning ``self`` allows ``ops << a << b`` chaining.

        Raises:
            TypeError: if ``other`` is none of the supported types.
        """
        if isinstance(other, (tuple, list)):
            self.extend(other)
        elif isinstance(other, Op):
            self.append(other)
        elif isinstance(other, Abstract):
            self.extend(other.opcodes())
        else:
            # The type must be formatted: concatenating it to a str raises an
            # unrelated TypeError that hides the real problem.
            raise TypeError('Unsupported type: %s' % type(other).__name__)
        return self
class Abstract(object):
    """Base class for lexer actions that can be lowered to CPython bytecode.

    Subclasses implement ``opcodes()``; ``_encode`` and ``compile`` turn the
    resulting symbolic instructions into a callable function.
    """

    def opcodes(self):
        """Return the symbolic instructions for this action (subclass hook)."""
        raise AssertionError('Not implemented')

    def optimize(self):
        """Simplify the action in place; does nothing by default."""
        pass

    def _encode(self, constnames, varnames):
        """Assemble ``self.opcodes()`` into a bytecode string.

        Note that constnames and varnames will be mutated: constants and
        local variable names that are not registered yet get appended, and
        the instruction argument becomes their index in the list.

        Jumps refer to ``$STATE`` labels by name; they are emitted with a
        placeholder argument and patched once every label offset is known.

        Raises:
            KeyError: if a jump refers to a label that was never defined.
        """
        code = bytearray()
        states = {}  # label -> offset of the first instruction of the state
        jumps = []   # (label, offset of the jump instruction)

        # First encode while keeping a registry of jumps and labels
        for opcode, arg in self.opcodes():
            if opcode == '$STATE':
                # Pseudo instruction: only records the current offset
                states[arg] = len(code)
                continue

            if opcode == 'LOAD_CONST':
                try:
                    arg = constnames.index(arg)
                except ValueError:
                    constnames.append(arg)
                    arg = len(constnames) - 1

            if opcode in ('STORE_FAST', 'LOAD_FAST'):
                try:
                    arg = varnames.index(arg)
                except ValueError:
                    varnames.append(arg)
                    arg = len(varnames) - 1

            if opcode == 'COMPARE_OP':
                # The operator is given by its symbol, bytecode wants its index
                arg = dis.cmp_op.index(arg)

            opcode = dis.opmap[opcode]
            if opcode in dis.hasjabs and not isinstance(arg, int):
                jumps.append((arg, len(code)))
                arg = 65535  # force a extended_arg on >3.6

            code.extend(pack_opcode(opcode, arg))

        # Now process the jumps to set the correct offsets
        for state, offset in jumps:
            patch_arg(code, offset, states[state])

        return bytes(code)

    def compile(self, name='fsmlex', docblock=None):
        """ Builds a function with the currently configured opcodes

        The resulting function has the signature ``name(stream, ofs=0)``.
        ``docblock`` is stored as the first constant of the code object.

        NOTE: the instructions used by the actions (absolute byte-offset
        jumps, ``COMPARE_OP`` with ``in``, ``INPLACE_ADD``) are only available
        on older CPython releases, so this is not expected to work on recent
        interpreters.
        """
        argnames = ('stream', 'ofs')
        varnames = list(argnames)
        constnames = [docblock]

        code = self._encode(constnames, varnames)

        args = [
            len(argnames),      # co_argcount -> (stream, ofs)
            len(varnames),      # co_nlocals
            2,                  # co_stacksize -> maximum number of values in the stack
            0,                  # co_flags -> only if *args is used
            code,               # co_code -> compiled bytecode
            tuple(constnames),  # co_consts -> literals in the code (first is docblock)
            (),                 # co_names -> global/attribute names (none are used)
            tuple(varnames),    # co_varnames -> list of local variables (starting with args)
            name + '.py',       # co_filename,
            name,               # co_name,
            0,                  # co_firstlineno,
            bytes()             # co_lnotab
        ]

        if sys.version_info >= (3, 0, 0):
            args.insert(1, 0)   # co_kwonlyargcount
        if sys.version_info >= (3, 8):
            # Python 3.8 added a positional-only count as second argument
            args.insert(1, 0)   # co_posonlyargcount

        co = types.CodeType(*args)
        # ``ofs`` defaults to 0 so the function can be called with just a stream
        return types.FunctionType(co, {}, name, (0,))
class State(Abstract):
    """ Holds the set of actions for a label
    """

    __slots__ = ('label', 'actions')

    def __init__(self, label=None, actions=None):
        self.label = None if label is None else str(label)
        self.actions = actions or []

    def add(self, *actions):
        """Append one or more actions to the state."""
        self.actions.extend(actions)

    def optimize(self):
        """Optimize every action held by the state."""
        # TODO: Collapse Matches with same target
        for item in self.actions:
            item.optimize()

    def opcodes(self):
        """Emit the label marker (when labelled) followed by every action."""
        result = Ops()
        if self.label:
            result << STATE(self.label)
        for item in self.actions:
            result << item
        return result
class Match(Abstract):
    """ Jumps if the character matches the set of values
    """

    # The slot has to be named like the attribute assigned in __init__
    # ('values'); it used to be declared as 'value'.
    __slots__ = ('values', 'label')

    def __init__(self, values, label):
        """
        values: characters to match against, joined into a single string.
        label: name of the state to jump to on a match.
        """
        self.values = values
        self.label = str(label)

    def opcodes(self):
        """Emit ``if ch in values: goto label``."""
        ops = Ops()
        # > if ch in self.values
        ops << LOAD_FAST('ch')
        ops << LOAD_CONST(u''.join(self.values))
        # A single value is compared for equality instead of containment
        if 1 == len(self.values):
            ops << COMPARE_OP('==')
        else:
            ops << COMPARE_OP('in')
        ops << POP_JUMP_IF_TRUE(self.label)
        return ops
class Jump(Abstract):
    """ Jumps to a specific label
    """

    __slots__ = ('label',)

    def __init__(self, label):
        self.label = str(label)

    def opcodes(self):
        """Emit an unconditional jump to the target label."""
        return Ops([JUMP_ABSOLUTE(self.label)])
class Consume(Abstract):
    """ Consumes the next character from the stream
    """

    __slots__ = ('advance',)

    def __init__(self, advance=True):
        self.advance = advance

    def opcodes(self):
        """Optionally advance the offset, then load ``stream[ofs]`` into ``ch``."""
        result = Ops()
        if self.advance:
            result << Advance()
        # > ch = stream[ofs]
        result << [
            LOAD_FAST('stream'),
            LOAD_FAST('ofs'),
            BINARY_SUBSCR(),
            STORE_FAST('ch'),
        ]
        return result
class Advance(Abstract):
    """ Advances to the next character in the stream
    """

    __slots__ = ()

    def opcodes(self):
        """Emit ``ofs = ofs + 1``."""
        return Ops([
            LOAD_FAST('ofs'),
            LOAD_CONST(1),
            INPLACE_ADD(),      # > ofs + 1
            STORE_FAST('ofs'),  # > ofs = ofs + 1
        ])
class Marker(Abstract):
    """ Marks the current offset for a look ahead
    """

    __slots__ = ('mark',)

    def __init__(self, mark):
        self.mark = mark

    def opcodes(self):
        """Record the mark in ``accept`` and the current offset in ``marker``."""
        return Ops([
            LOAD_CONST(self.mark),
            STORE_FAST('accept'),  # > accept = self.mark
            LOAD_FAST('ofs'),
            STORE_FAST('marker'),  # > marker = ofs
        ])
class Backtrack(Abstract):
    """ Backtracks a look ahead
    """

    __slots__ = ()

    def opcodes(self):
        """Restore the offset saved in ``marker``."""
        # > ofs = marker
        return Ops([
            LOAD_FAST('marker'),
            STORE_FAST('ofs'),
        ])
class Accept(Abstract):
    """ Accepts a look ahead
    """

    __slots__ = ('mark', 'label')

    def __init__(self, mark, label):
        self.mark = mark
        self.label = str(label)

    def opcodes(self):
        """Jump to the label when ``accept`` holds this action's mark."""
        # > if accept == self.mark
        return Ops([
            LOAD_FAST('accept'),
            LOAD_CONST(self.mark),
            COMPARE_OP('=='),
            POP_JUMP_IF_TRUE(self.label),
        ])
class Produce(Abstract):
    """ Returns the current offset with an optional token
    """

    __slots__ = ('token',)

    def __init__(self, token=None):
        self.token = token

    def opcodes(self):
        """Emit ``return (ofs, token)``."""
        # > return (ofs, self.token)
        return Ops([
            LOAD_FAST('ofs'),
            LOAD_CONST(self.token),
            BUILD_TUPLE(2),
            RETURN_VALUE(),
        ])
from fsm import Accept, Advance, Backtrack, Consume, Jump, Match, Marker, Produce, State
def factory_grammar():
    """Build the lexer state machine and compile it into a function.

    The unlabelled root state dispatches on the first character; the nested
    ``yyN`` states are emitted right after it, in order, so a state without a
    final ``Jump``/``Produce`` falls through into the next one (for instance
    ``yy2`` into ``yy3`` and ``yy47`` into ``yy48``).

    Returns:
        The function produced by ``State.compile()``: it takes
        ``(stream, ofs)`` and returns an ``(offset, token)`` tuple, with
        ``None`` as token when no rule matched.
    """
    fsm = State(None, [
        Consume(False),
        Match(' \t', 'yy4'),
        Match('\n', 'yy7'),
        Match('\r', 'yy10'),
        Match('!', 'yy11'),
        Match('"', 'yy12'),
        Match('%', 'yy13'),
        Match('(', 'yy14'),
        Match(')', 'yy16'),
        Match('*', 'yy18'),
        Match('+', 'yy18'),
        Match('-', 'yy20'),
        Match('.', 'yy21'),
        Match('/', 'yy23'),
        Match(':', 'yy24'),
        Match('?', 'yy26'),
        Match('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'yy27'),
        Match('[', 'yy30'),
        Match(']', 'yy32'),
        Match('_', 'yy34'),
        # A lowercase letter starts a RULE (yy35). It used to jump to yy27,
        # the TOKEN state, which only accepts digits, uppercase and '_'.
        Match('abcdefghijklmnopqrstuvwxyz', 'yy35'),
        Match('|', 'yy38'),
        Match('~', 'yy40'),
        Jump('yy2'),

        # Unknown character: skip it and report no token
        State('yy2', [
            Advance(),
        ]),
        State('yy3', [
            Produce(None),
        ]),
        State('yy4', [
            Consume(),
            Match(' \t', 'yy4'),
            Jump('yy6')
        ]),
        State('yy6', [
            Produce('WS'),
        ]),
        State('yy7', [
            Consume(),
            Marker(0),
            Match(' \t', 'yy42'),
            Match('\n', 'yy7'),
            Match('\r', 'yy44'),
            Jump('yy9')
        ]),
        State('yy9', [
            Produce('_NL'),
        ]),
        State('yy10', [
            Consume(),
            Match('\n', 'yy7'),
            Jump('yy3'),
        ]),
        State('yy11', [
            Consume(),
            Marker(1),
            Match('?_', 'yy46'),
            Match('abcdefghijklmnopqrstuvwxyz', 'yy35'),
            Jump('yy3'),
        ]),
        State('yy12', [
            Consume(),
            Marker(1),
            Match('\n', 'yy3'),
            Jump('yy48'),
        ]),
        State('yy13', [
            Consume(),
            Marker(1),
            Match('i', 'yy53'),
            Jump('yy3'),
        ]),
        State('yy14', [
            Advance(),
            Produce('_LPAR'),
        ]),
        State('yy16', [
            Advance(),
            Produce('_RPAR'),
        ]),
        State('yy18', [
            Consume(),
            Match('?', 'yy54'),
            Match('abcdefghijklmnopqrstuvwxyz', 'yy55'),
            Jump('yy19'),
        ]),
        State('yy19', [
            Produce('OP'),
        ]),
        State('yy20', [
            Consume(),
            Match('>', 'yy57'),
            Jump('yy3'),
        ]),
        State('yy21', [
            Advance(),
            Produce('_DOT'),
        ]),
        State('yy23', [
            Consume(),
            Marker(1),
            Match('/', 'yy61'),
            Jump('yy59'),
        ]),
        State('yy24', [
            Advance(),
            Produce('_COLON'),
        ]),
        State('yy26', [
            Consume(),
            Match('abcdefghijklmnopqrstuvwxyz', 'yy35'),
            Jump('yy62'),
        ]),
        State('yy27', [
            Consume(),
            Match('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_', 'yy27'),
            Jump('yy29'),
        ]),
        State('yy29', [
            Produce('TOKEN'),
        ]),
        State('yy30', [
            Advance(),
            Produce('_LBRA'),
        ]),
        State('yy32', [
            Advance(),
            Produce('_RBRA'),
        ]),
        State('yy34', [
            Consume(),
            Match('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'yy27'),
            Match('abcdefghijklmnopqrstuvwxyz', 'yy35'),
            Jump('yy3'),
        ]),
        State('yy35', [
            Consume(),
            Match('0123456789_abcdefghijklmnopqrstuvwxyz', 'yy35'),
            Jump('yy37'),
        ]),
        State('yy37', [
            Produce('RULE'),
        ]),
        State('yy38', [
            Advance(),
            Produce('_OR'),
        ]),
        State('yy40', [
            Advance(),
            Produce('_TILDE'),
        ]),
        State('yy42', [
            Consume(),
            Match(' \t', 'yy42'),
            Jump('yy9'),
        ]),
        State('yy44', [
            Consume(),
            Match('\n', 'yy7'),
            Jump('yy45'),
        ]),
        # Look ahead failed: go back to the marked offset and resume at the
        # state selected by the last Marker (3 is handled by the final Jump)
        State('yy45', [
            Backtrack(),
            Accept(0, 'yy9'),
            Accept(1, 'yy3'),
            Accept(2, 'yy50'),
            Jump('yy69'),
        ]),
        State('yy46', [
            Consume(),
            Match('abcdefghijklmnopqrstuvwxyz', 'yy35'),
            Jump('yy45'),
        ]),
        State('yy47', [
            Consume()
        ]),
        State('yy48', [
            Match('\n', 'yy45'),
            Match('"', 'yy49'),
            Match('\\', 'yy51'),
            Jump('yy47'),
        ]),
        State('yy49', [
            Consume(),
            Match('i', 'yy63'),
            Jump('yy50'),
        ]),
        State('yy50', [
            Produce('STRING'),
        ]),
        State('yy51', [
            Consume(),
            Match('\n', 'yy45'),
            Match('"', 'yy64'),
            Match('\\', 'yy51'),
            Jump('yy47'),
        ]),
        State('yy53', [
            Consume(),
            Match('g', 'yy65'),
            Match('m', 'yy66'),
            Jump('yy45'),
        ]),
        State('yy54', [
            Consume(),
            Match('abcdefghijklmnopqrstuvwxyz', 'yy55'),
            Jump('yy19'),
        ]),
        State('yy55', [
            Advance(),
            Produce('OP'),
        ]),
        State('yy57', [
            Advance(),
            Produce('_TO'),
        ]),
        State('yy59', [
            Consume(),
            Match('\n', 'yy45'),
            Match('/', 'yy67'),
            Match('\\', 'yy70'),
            Jump('yy59'),
        ]),
        State('yy61', [
            Consume(),
            Match('\n', 'yy45'),
            Jump('yy72'),
        ]),
        State('yy62', [
            Advance(),
            Jump('yy19'),
        ]),
        State('yy63', [
            Advance(),
            Jump('yy50'),
        ]),
        State('yy64', [
            Consume(),
            Marker(2),
            Match('\n', 'yy50'),
            Match('"', 'yy49'),
            Match('\\', 'yy51'),
            Match('i', 'yy74'),
            Jump('yy47'),
        ]),
        State('yy65', [
            Consume(),
            Match('n', 'yy75'),
            Jump('yy45'),
        ]),
        State('yy66', [
            Consume(),
            Match('p', 'yy76'),
            Jump('yy45'),
        ]),
        State('yy67', [
            Consume(),
            Match('ilmsux', 'yy67'),
            Jump('yy69'),
        ]),
        State('yy69', [
            Produce('REGEXP'),
        ]),
        State('yy70', [
            Consume(),
            Match('\n', 'yy45'),
            Match('/', 'yy77'),
            Match('\\', 'yy70'),
            Jump('yy59'),
        ]),
        State('yy72', [
            Produce('COMMENT'),
        ]),
        State('yy74', [
            Consume(),
            Marker(2),
            Match('\n', 'yy50'),
            Match('"', 'yy49'),
            Match('\\', 'yy51'),
            Jump('yy47'),
        ]),
        State('yy75', [
            Consume(),
            Match('o', 'yy79'),
            Jump('yy45'),
        ]),
        State('yy76', [
            Consume(),
            Match('o', 'yy80'),
            Jump('yy45'),
        ]),
        State('yy77', [
            Consume(),
            Marker(3),
            Match('\n', 'yy69'),
            Match('/', 'yy67'),
            Match('\\', 'yy70'),
            Match('ilmsux', 'yy77'),
            Jump('yy59'),
        ]),
        State('yy79', [
            Consume(),
            Match('r', 'yy81'),
            Jump('yy45'),
        ]),
        State('yy80', [
            Consume(),
            Match('r', 'yy82'),
            Jump('yy45'),
        ]),
        State('yy81', [
            Consume(),
            Match('e', 'yy83'),
            Jump('yy45'),
        ]),
        State('yy82', [
            Consume(),
            Match('t', 'yy85'),
            Jump('yy45'),
        ]),
        State('yy83', [
            Advance(),
            Produce('_IGNORE'),
        ]),
        State('yy85', [
            Advance(),
            Produce('_IMPORT')
        ]),
    ])
    return fsm.compile()
lex = factory_grammar()


def genlex(stream):
    """Yield ``(offset, token, text)`` for every token found in ``stream``.

    ``stream`` is expected to be a text string. ``token`` is ``None`` for
    input that matches no rule.

    The compiled lexer reads one character past a token to decide where it
    ends, so a sentinel character is appended to the input: without it the
    last token of the stream hits the end of the string and is lost. Running
    past the sentinel (e.g. an unterminated string) still ends the iteration.
    """
    length = len(stream)
    padded = stream + u'\0'
    ofs = 0
    while ofs < length:
        # Only the lexer call is guarded, so an IndexError raised by the
        # consumer of the generator is not mistaken for the end of input
        try:
            pos, token = lex(padded, ofs)
        except IndexError:
            break
        yield ofs, token, stream[ofs:pos]
        ofs = pos
# Benchmark input: the grammar file without its comment lines, repeated 100
# times to get a stream large enough to measure.
with open('lark/grammars/common.g') as fd:
    lines = [line for line in fd if not line.startswith('//')]
stream = u''.join(lines) * 100
def lexit():
    """Run the generated lexer over the whole benchmark stream."""
    for _ofs, _token, _value in genlex(stream):
        pass


from timeit import timeit

# Total time in seconds for 100 full passes over the stream
elapsed = timeit(lexit, number=100)
print(elapsed)
/* Generated by re2c 1.0.3 on Tue Apr 10 09:00:46 2018 */
#line 1 "/Users/drslump/tmp/test.re2c"
/*
 * Generated scanner: a DFA written as labelled states (yyN) linked by goto.
 * Each state reads a character into yych and dispatches on it with a switch.
 * The blocks following a "#line ... test.re2c" directive are the rule
 * actions copied from the grammar file.
 */
static bool lex(const char *s, unsigned long &u)
{
const char *YYMARKER;
const char *YYCTXMARKER;
int c = yycinit;
u = 0;
#line 17 "grammar.cc"
{
char yych;
unsigned int yyaccept = 0;
/* Start state: dispatch on the first character */
yych = *s;
switch (yych) {
case '\t':
case ' ': goto yy4;
case '\n': goto yy7;
case '\r': goto yy10;
case '!': goto yy11;
case '"': goto yy12;
case '%': goto yy13;
case '(': goto yy14;
case ')': goto yy16;
case '*':
case '+': goto yy18;
case '-': goto yy20;
case '.': goto yy21;
case '/': goto yy23;
case ':': goto yy24;
case '?': goto yy26;
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z': goto yy27;
case '[': goto yy30;
case ']': goto yy32;
case '_': goto yy34;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
case '|': goto yy38;
case '~': goto yy40;
default: goto yy2;
}
/* Any other character: skip it and fall through to the "no match" action */
yy2:
++s;
yy3:
#line 40 "/Users/drslump/tmp/test.re2c"
{ return false; }
#line 103 "grammar.cc"
/* Run of spaces and tabs */
yy4:
yych = *++s;
switch (yych) {
case '\t':
case ' ': goto yy4;
default: goto yy6;
}
yy6:
#line 25 "/Users/drslump/tmp/test.re2c"
{ "WS" }
#line 114 "grammar.cc"
/* Newline seen: remember this position (YYMARKER) and which rule it
   accepts (yyaccept) before looking further ahead */
yy7:
yyaccept = 0;
yych = *(YYMARKER = ++s);
switch (yych) {
case '\t':
case ' ': goto yy42;
case '\n': goto yy7;
case '\r': goto yy44;
default: goto yy9;
}
yy9:
#line 19 "/Users/drslump/tmp/test.re2c"
{ "NL" }
#line 128 "grammar.cc"
yy10:
yych = *++s;
switch (yych) {
case '\n': goto yy7;
default: goto yy3;
}
yy11:
yyaccept = 1;
yych = *(YYMARKER = ++s);
switch (yych) {
case '?':
case '_': goto yy46;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
default: goto yy3;
}
yy12:
yyaccept = 1;
yych = *(YYMARKER = ++s);
switch (yych) {
case '\n': goto yy3;
default: goto yy48;
}
yy13:
yyaccept = 1;
yych = *(YYMARKER = ++s);
switch (yych) {
case 'i': goto yy53;
default: goto yy3;
}
yy14:
++s;
#line 33 "/Users/drslump/tmp/test.re2c"
{ "LPAR" }
#line 187 "grammar.cc"
yy16:
++s;
#line 36 "/Users/drslump/tmp/test.re2c"
{ "RPAR" }
#line 192 "grammar.cc"
yy18:
yych = *++s;
switch (yych) {
case '?': goto yy54;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy55;
default: goto yy19;
}
yy19:
#line 29 "/Users/drslump/tmp/test.re2c"
{ "OP" }
#line 228 "grammar.cc"
yy20:
yych = *++s;
switch (yych) {
case '>': goto yy57;
default: goto yy3;
}
yy21:
++s;
#line 31 "/Users/drslump/tmp/test.re2c"
{ "DOT" }
#line 239 "grammar.cc"
yy23:
yyaccept = 1;
yych = *(YYMARKER = ++s);
switch (yych) {
case '/': goto yy61;
default: goto yy59;
}
yy24:
++s;
#line 38 "/Users/drslump/tmp/test.re2c"
{ return "COLON"; }
#line 251 "grammar.cc"
yy26:
yych = *++s;
switch (yych) {
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
default: goto yy62;
}
/* Body of an uppercase identifier: digits, uppercase letters and '_' */
yy27:
yych = *++s;
switch (yych) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z':
case '_': goto yy27;
default: goto yy29;
}
yy29:
#line 23 "/Users/drslump/tmp/test.re2c"
{ "TOKEN" }
#line 328 "grammar.cc"
yy30:
++s;
#line 32 "/Users/drslump/tmp/test.re2c"
{ "LBRA" }
#line 333 "grammar.cc"
yy32:
++s;
#line 35 "/Users/drslump/tmp/test.re2c"
{ "RBRA" }
#line 338 "grammar.cc"
yy34:
yych = *++s;
switch (yych) {
case 'A':
case 'B':
case 'C':
case 'D':
case 'E':
case 'F':
case 'G':
case 'H':
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
case 'O':
case 'P':
case 'Q':
case 'R':
case 'S':
case 'T':
case 'U':
case 'V':
case 'W':
case 'X':
case 'Y':
case 'Z': goto yy27;
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
default: goto yy3;
}
/* Body of a lowercase identifier: digits, '_' and lowercase letters */
yy35:
yych = *++s;
switch (yych) {
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
case '_':
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
default: goto yy37;
}
yy37:
#line 22 "/Users/drslump/tmp/test.re2c"
{ "RULE" }
#line 441 "grammar.cc"
yy38:
++s;
#line 34 "/Users/drslump/tmp/test.re2c"
{ "OR" }
#line 446 "grammar.cc"
yy40:
++s;
#line 37 "/Users/drslump/tmp/test.re2c"
{ "TILDE" }
#line 451 "grammar.cc"
yy42:
yych = *++s;
switch (yych) {
case '\t':
case ' ': goto yy42;
default: goto yy9;
}
yy44:
yych = *++s;
switch (yych) {
case '\n': goto yy7;
default: goto yy45;
}
/* Look ahead failed: rewind to the marked position and resume at the state
   selected by the last yyaccept value */
yy45:
s = YYMARKER;
switch (yyaccept) {
case 0: goto yy9;
case 1: goto yy3;
case 2: goto yy50;
default: goto yy69;
}
yy46:
yych = *++s;
switch (yych) {
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy35;
default: goto yy45;
}
/* yy47 reads the next character and falls through into the yy48 dispatch;
   yy12 enters at yy48 directly because it has already read one */
yy47:
yych = *++s;
yy48:
switch (yych) {
case '\n': goto yy45;
case '"': goto yy49;
case '\\': goto yy51;
default: goto yy47;
}
yy49:
yych = *++s;
switch (yych) {
case 'i': goto yy63;
default: goto yy50;
}
yy50:
#line 21 "/Users/drslump/tmp/test.re2c"
{ "STRING" }
#line 522 "grammar.cc"
yy51:
yych = *++s;
switch (yych) {
case '\n': goto yy45;
case '"': goto yy64;
case '\\': goto yy51;
default: goto yy47;
}
yy53:
yych = *++s;
switch (yych) {
case 'g': goto yy65;
case 'm': goto yy66;
default: goto yy45;
}
yy54:
yych = *++s;
switch (yych) {
case 'a':
case 'b':
case 'c':
case 'd':
case 'e':
case 'f':
case 'g':
case 'h':
case 'i':
case 'j':
case 'k':
case 'l':
case 'm':
case 'n':
case 'o':
case 'p':
case 'q':
case 'r':
case 's':
case 't':
case 'u':
case 'v':
case 'w':
case 'x':
case 'y':
case 'z': goto yy55;
default: goto yy19;
}
/* The increment is undone right away, so the lowercase letter that led
   here is not consumed by this rule */
yy55:
++s;
s -= 1;
#line 39 "/Users/drslump/tmp/test.re2c"
{ return "OP"; }
#line 574 "grammar.cc"
yy57:
++s;
#line 30 "/Users/drslump/tmp/test.re2c"
{ "TO" }
#line 579 "grammar.cc"
yy59:
yych = *++s;
switch (yych) {
case '\n': goto yy45;
case '/': goto yy67;
case '\\': goto yy70;
default: goto yy59;
}
yy61:
yych = *++s;
switch (yych) {
case '\n': goto yy45;
default: goto yy72;
}
yy62:
++s;
goto yy19;
yy63:
++s;
goto yy50;
yy64:
yyaccept = 2;
yych = *(YYMARKER = ++s);
switch (yych) {
case '\n': goto yy50;
case '"': goto yy49;
case '\\': goto yy51;
case 'i': goto yy75;
default: goto yy47;
}
yy65:
yych = *++s;
switch (yych) {
case 'n': goto yy76;
default: goto yy45;
}
yy66:
yych = *++s;
switch (yych) {
case 'p': goto yy77;
default: goto yy45;
}
yy67:
yych = *++s;
switch (yych) {
case 'i':
case 'l':
case 'm':
case 's':
case 'u':
case 'x': goto yy67;
default: goto yy69;
}
yy69:
#line 20 "/Users/drslump/tmp/test.re2c"
{ "REGEXP" }
#line 636 "grammar.cc"
yy70:
yych = *++s;
switch (yych) {
case '\n': goto yy45;
case '/': goto yy78;
case '\\': goto yy70;
default: goto yy59;
}
/* Keeps reading characters until a newline is found */
yy72:
yych = *++s;
switch (yych) {
case '\n': goto yy74;
default: goto yy72;
}
yy74:
#line 24 "/Users/drslump/tmp/test.re2c"
{ "COMMENT" }
#line 654 "grammar.cc"
yy75:
yyaccept = 2;
yych = *(YYMARKER = ++s);
switch (yych) {
case '\n': goto yy50;
case '"': goto yy49;
case '\\': goto yy51;
default: goto yy47;
}
yy76:
yych = *++s;
switch (yych) {
case 'o': goto yy80;
default: goto yy45;
}
yy77:
yych = *++s;
switch (yych) {
case 'o': goto yy81;
default: goto yy45;
}
yy78:
yyaccept = 3;
yych = *(YYMARKER = ++s);
switch (yych) {
case '\n': goto yy69;
case '/': goto yy67;
case '\\': goto yy70;
case 'i':
case 'l':
case 'm':
case 's':
case 'u':
case 'x': goto yy78;
default: goto yy59;
}
yy80:
yych = *++s;
switch (yych) {
case 'r': goto yy82;
default: goto yy45;
}
yy81:
yych = *++s;
switch (yych) {
case 'r': goto yy83;
default: goto yy45;
}
yy82:
yych = *++s;
switch (yych) {
case 'e': goto yy84;
default: goto yy45;
}
yy83:
yych = *++s;
switch (yych) {
case 't': goto yy86;
default: goto yy45;
}
yy84:
++s;
#line 27 "/Users/drslump/tmp/test.re2c"
{ "IGNORE" }
#line 719 "grammar.cc"
yy86:
++s;
#line 28 "/Users/drslump/tmp/test.re2c"
{ "IMPORT" }
#line 724 "grammar.cc"
}
#line 42 "/Users/drslump/tmp/test.re2c"
}
import re
# One alternative per token type; the first alternative that matches wins.
rex = re.compile(r'''
(?P<_NL>(\r?\n)+\s*)
|(?P<STRING>"(\\"|\\\\|[^"\n])*?"i?)
|(?P<RULE>!?[_?]?[a-z][_a-z0-9]*)
|(?P<TOKEN>_?[A-Z][_A-Z0-9]*)
|(?P<COMMENT>//[^\n]*)
|(?P<WS>[ \t]+)
|(?P<NUMBER>\d+)
|(?P<_IGNORE>%ignore)
|(?P<_IMPORT>%import)
|(?P<OP>[+*][?]?|[?](?![a-z]))
|(?P<_TO>->)
|(?P<_DOT>\.)
|(?P<_LBRA>\[)
|(?P<_LPAR>\()
|(?P<_OR>\|)
|(?P<_RBRA>\])
|(?P<_RPAR>\))
|(?P<TILDE>~)
|(?P<_COLON>:)
''', re.VERBOSE | re.UNICODE)

# Token name for each named group index of ``rex``. Indexes 2 and 4 are
# missing because they belong to the unnamed groups nested in _NL and STRING.
types = {
    1: u'_NL',
    3: u'STRING',
    5: u'RULE',
    6: u'TOKEN',
    7: u'COMMENT',
    8: u'WS',
    9: u'NUMBER',
    10: u'_IGNORE',
    11: u'_IMPORT',
    12: u'OP',
    13: u'_TO',
    14: u'_DOT',
    15: u'_LBRA',
    16: u'_LPAR',
    17: u'_OR',
    18: u'_RBRA',
    19: u'_RPAR',
    20: u'TILDE',
    21: u'_COLON',
}
def relex(stream):
    """Yield ``(offset, token, text)`` for every token matched by ``rex``.

    Characters that no alternative matches are skipped one at a time.
    """
    ofs = 0
    while ofs < len(stream):
        m = rex.match(stream, ofs)
        if not m:
            ofs += 1
            continue
        # lastindex is the index of the named group that matched
        yield ofs, types[m.lastindex], m.group(0)
        ofs = m.end()
# Benchmark input: the grammar file without its comment lines, repeated 100
# times to get a stream large enough to measure.
with open('lark/grammars/common.g') as fd:
    lines = [line for line in fd if not line.startswith('//')]
data = u''.join(lines) * 100
def lexit():
    """Run the regex based lexer over the whole benchmark data."""
    for _pos, _token, _value in relex(data):
        pass


from timeit import timeit

# Total time in seconds for 100 full passes over the data
elapsed = timeit(lexit, number=100)
print(elapsed)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment