Skip to content

Instantly share code, notes, and snippets.

@alexaleluia12
Last active June 23, 2021 22:59
Show Gist options
  • Save alexaleluia12/ee09a50fa57ca2091847 to your computer and use it in GitHub Desktop.
token / tokenizer / lexical analysis / tokenizing in Python
# Can I use finditer ?
# Yes
# ...
def generate_tokens(pat, text):
    """Yield a Token for every match of *pat* found in *text*.

    finditer-based variant: unlike the scanner-based version below,
    ``re.finditer`` silently skips over characters that match no group
    instead of stopping at the first non-match.
    """
    for match in pat.finditer(text):
        yield Token(match.lastgroup, match.group())
# Print every token produced for the sample input.
token_stream = generate_tokens(master_pat, 'bar = 3')
for tok in token_stream:
    print(tok)
# output
# Token(type='NAME', value='bar')
# Token(type='WS', value=' ')
# Token(type='EQ', value='=')
# Token(type='WS', value=' ')
# Token(type='NUM', value='3')
# source:
# http://chimera.labs.oreilly.com/books/1230000000393/ch02.html#tokenizing
from collections import namedtuple
import re
# One named group per token type; the group name becomes Match.lastgroup,
# which generate_tokens uses as the token's type.
NAME = r'(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)'  # identifiers
NUM = r'(?P<NUM>\d+)'                       # integer literals
PLUS = r'(?P<PLUS>\+)'                      # '+' operator
TIMES = r'(?P<TIMES>\*)'                    # '*' operator
EQ = r'(?P<EQ>=)'                           # '=' sign
WS = r'(?P<WS>\s+)'                         # whitespace runs (kept as tokens)
# NOTE: alternation order matters in re — earlier alternatives win at the
# same position, so keep NAME before NUM (etc.) as listed here.
master_pat = re.compile('|'.join([NAME, NUM, PLUS, TIMES, EQ, WS]))
# A token is a (type, value) pair; type is the regex group name.
Token = namedtuple('Token', ['type','value'])

def generate_tokens(pat, text):
    """Yield Token(type, value) pairs by matching *pat* anchored at the
    current scan position.

    Uses the (undocumented) ``Pattern.scanner`` API: matching stops at the
    first position where *pat* fails to match, so unrecognized characters
    end the token stream rather than being skipped.
    """
    scan = pat.scanner(text)
    match = scan.match()
    while match is not None:
        yield Token(match.lastgroup, match.group())
        match = scan.match()
# Example use: tokenize a simple assignment and show each token.
token_stream = generate_tokens(master_pat, 'foo = 42')
for tok in token_stream:
    print(tok)
# Produces output
# Token(type='NAME', value='foo')
# Token(type='WS', value=' ')
# Token(type='EQ', value='=')
# Token(type='WS', value=' ')
# Token(type='NUM', value='42')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment