Skip to content

Instantly share code, notes, and snippets.

@edcrypt
Last active June 24, 2017 22:20
Show Gist options
  • Save edcrypt/935dbbac8149b079002a486a39dc0ee9 to your computer and use it in GitHub Desktop.
Save edcrypt/935dbbac8149b079002a486a39dc0ee9 to your computer and use it in GitHub Desktop.
Quick tokenizer to showcase some py3.6 features
# flake8: noqa
import regex
import collections
from typing import NamedTuple
class OrderedClassMembers(type):
    """Metaclass that remembers the order in which a class body defines names.

    Classes built with this metaclass get an ``__ordered__`` attribute: a list
    of the names bound in their class body, in definition order, with
    ``__module__`` and ``__qualname__`` left out.
    """

    @classmethod
    def __prepare__(mcs, name, bases):
        # The class body executes inside this mapping, so insertion order is
        # definition order.
        return collections.OrderedDict()

    def __new__(mcs, name, bases, classdict):
        bookkeeping = ('__module__', '__qualname__')
        members = []
        for member in classdict:
            if member not in bookkeeping:
                members.append(member)
        # Stored in the namespace itself so it becomes a plain class attribute.
        classdict['__ordered__'] = members
        return type.__new__(mcs, name, bases, classdict)
class Token(NamedTuple):
    """One scanned lexeme together with its location in the source text."""

    # Name of the tokenizer pattern that matched, e.g. 'T_FLOAT'.
    kind: str
    # The exact text that was matched.
    value: str
    # Offset of the first matched character.
    start: int
    # Offset just past the last matched character.
    end: int
class Tokenizer(metaclass=OrderedClassMembers):
    """Regex-driven scanner that turns source text into ``Token`` tuples.

    Every class attribute whose name starts with ``T_`` is a token pattern.
    The patterns are handed to the scanner in the order they are declared in
    the class body and earlier patterns take precedence, so a specific pattern
    (``T_SCI_FLOAT``) must be declared before the general ones it overlaps
    with (``T_FLOAT``, ``T_INTEGER``). Text matching ``IGNORE`` is skipped.

    >>> tokenizer.Tokenizer().scan('10. .10 10.10 10e10 10e-10 0xFF asd')
    [Token(kind='T_FLOAT', value='10.', start=0, end=3),
    Token(kind='T_FLOAT', value='.10', start=4, end=7),
    Token(kind='T_FLOAT', value='10.10', start=8, end=13),
    Token(kind='T_SCI_FLOAT', value='10e10', start=14, end=19),
    Token(kind='T_SCI_FLOAT', value='10e-10', start=20, end=26),
    Token(kind='T_HEX_INTEGER', value='0xFF', start=27, end=31),
    Token(kind='T_IDENTIFIER', value='asd', start=32, end=35)]
    """

    # Building blocks (no ``T_`` prefix, so they are not tokens themselves).
    # Raw strings keep ``\.`` and ``\s`` from being read as string escapes.
    REAL = r"[0-9]+\.[0-9]+|[0-9]+\.|\.[0-9]+"
    NATURAL = r"[0-9]+"

    # NOTE(review): this looks like an indentation token (newline followed by
    # four or more whitespace characters) and the name may have been meant as
    # T_INDENT -- confirm. The name is kept as-is because it is emitted as the
    # token kind.
    T_IDENT = r"\n\s{4,}"
    # REAL is a bare alternation, so it has to be wrapped in a group wherever
    # it is spliced in. Without the (?:...) the optional sign would bind only
    # to REAL's first alternative, and '-.5' or '+10.' would not scan as a
    # single float.
    T_SCI_FLOAT = rf"[+-]?(?:{REAL}|{NATURAL})[eE][+-]?[0-9]+"
    T_FLOAT = rf"[+-]?(?:{REAL})"
    T_HEX_INTEGER = r"0x[0-9A-Fa-f]+"
    T_OCT_INTEGER = r"0o[0-7]+"
    T_INTEGER = r"[+-]?[0-9]+"
    T_IDENTIFIER = r"[^0-9\s;.][^\s;.]*"
    T_PUNCTUATION = r"[;.]"

    # Matched text is dropped instead of producing a token.
    IGNORE = r"\s+"

    def __init__(self):
        """Build the scanner from the ``T_*`` patterns, in declaration order."""
        lexicon = [
            (getattr(self, token_type), self.get_action(token_type))
            for token_type in self.__ordered__
            if token_type.startswith('T_')
        ]
        # A ``None`` action makes the scanner skip the match silently.
        lexicon.append((self.IGNORE, None))
        self.scanner = regex.Scanner(lexicon)

    def scan(self, source):
        """Tokenize ``source`` and return the list of ``Token`` tuples.

        Raises:
            SyntaxError: if some part of ``source`` matches no pattern.
        """
        tokens, remainder = self.scanner.scan(source)
        if remainder:
            # The remainder is the unscanned tail of ``source``, so its start
            # is the failure offset. Computing it from the lengths also works
            # when no token was produced at all (``tokens[-1]`` would raise
            # IndexError) and accounts for whitespace skipped after the last
            # token.
            position = len(source) - len(remainder)
            raise SyntaxError(
                f"Unknown token: {remainder} at: {position}")
        return tokens

    def get_action(self, token_type):
        """Return the scanner callback that wraps a match of ``token_type``."""
        def action(scanner, token):
            # ``scanner.match`` is the match object for the text just consumed.
            return Token(token_type, token,
                         scanner.match.start(),
                         scanner.match.end())
        return action
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment