Skip to content

Instantly share code, notes, and snippets.

@Averroes
Created April 10, 2015 18:16
Show Gist options
  • Save Averroes/1c2528955b764bd36641 to your computer and use it in GitHub Desktop.
Save Averroes/1c2528955b764bd36641 to your computer and use it in GitHub Desktop.
tokenizing text
# example.py
#
# Example of a tokenizer
import re
from collections import namedtuple
# One named group per token class; the group name is reported as
# ``lastgroup`` on a match and so identifies which class matched.
NAME = r"(?P<NAME>[a-zA-Z_][a-zA-Z_0-9]*)"
NUM = r"(?P<NUM>\d+)"
PLUS = r"(?P<PLUS>\+)"
TIMES = r"(?P<TIMES>\*)"
EQ = r"(?P<EQ>=)"
WS = r"(?P<WS>\s+)"

# Alternatives are tried left to right, so this order is the match priority.
_TOKEN_PATTERNS = (NAME, NUM, PLUS, TIMES, EQ, WS)
master_pat = re.compile("|".join(_TOKEN_PATTERNS))
# A token pairs the name of the pattern group that matched (type) with the
# exact text that it matched (value).
Token = namedtuple('Token', ['type', 'value'])


def generate_tokens(pat, text):
    """Lazily yield a Token for each consecutive match of *pat* in *text*.

    Args:
        pat: Compiled regular expression whose alternatives are named
            groups; the name of the group that matched becomes the
            token's ``type``.
        text: The string to tokenize.

    Yields:
        ``Token(type, value)`` for every match, in order, starting at the
        beginning of *text*.

    Scanning stops silently at the first position where *pat* does not
    match, so any text from that point on produces no tokens.
    """
    scanner = pat.scanner(text)
    # scanner.match() anchors each attempt where the previous match ended and
    # returns None when nothing matches there; None is the sentinel that
    # terminates the two-argument iter().
    for m in iter(scanner.match, None):
        yield Token(m.lastgroup, m.group())
# Demo: tokenize a sample assignment and print each token as it is produced.
for tok in generate_tokens(master_pat, 'foo = 42'):
    print(tok)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment