@aita
Created May 27, 2013 05:00
tokenizer: a minimal JSON lexer in Python 2
# -*- coding:utf-8 -*-
import re

regexps = {
    'escaped': ur'''
        \\                                # Escape
        ((?P<standard>["\\/bfnrt])        # Standard escapes
        |(u(?P<unicode>[0-9A-Fa-f]{4})))  # uXXXX
    ''',
    'unescaped': ur'''
        [\x20-\x21\x23-\x5b\x5d-\uffff]   # Unescaped: avoid ["\\]
    ''',
}

SPECS = (
    ('STRING', ur'"(%(unescaped)s | %(escaped)s)*"' % regexps, re.VERBOSE),
    ('NUMBER', r'''
        -?
        (0|([1-9][0-9]*))   # Integer part: no leading zeros
        (\.[0-9]+)?         # Optional fraction
        ([Ee][+-]?[0-9]+)?  # Optional exponent; JSON allows the sign to be omitted
    ''', re.VERBOSE),
    ('OP', r'[{}\[\]\-,:]'),
    ('NAME', r'[A-Za-z_][A-Za-z0-9]*'),  # true, false, null
)
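
# Spec order matters: NUMBER is listed before OP, so a leading '-' is
# lexed as part of a numeric literal rather than as the stray '-' kept
# in the OP character class.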


def make_specs(specs):
    """Compile each (name, pattern[, flags]) spec into a (name, regex) pair."""
    def _make_specs():
        for spec in specs:
            name = spec[0]
            if len(spec) > 2:
                regex = re.compile(spec[1], spec[2])
            else:
                regex = re.compile(spec[1])
            yield name, regex
    return list(_make_specs())


def lexer(text):
    """Yield (token_name, lexeme) pairs from a JSON-ish input string."""
    specs = make_specs(SPECS)
    text = text.lstrip()  # tolerate leading whitespace
    while text != '':
        for name, regex in specs:
            m = regex.match(text)
            if m:
                text = text[m.end():].lstrip()
                yield name, m.group()
                break
        else:
            # Without this, an unrecognized character would loop forever.
            raise SyntaxError('unexpected character: %r' % text[0])


j = u'{"feiz": 1}'
for l in lexer(j):
    print l
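
Run under Python 2 (the ur'' string literals are invalid syntax in Python 3),
the demo above should print this token stream:

('OP', u'{')
('STRING', u'"feiz"')
('OP', u':')
('NUMBER', u'1')
('OP', u'}')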