Skip to content

Instantly share code, notes, and snippets.

@jdahlin
Created August 31, 2011 09:36
Show Gist options
  • Save jdahlin/1183175 to your computer and use it in GitHub Desktop.
Save jdahlin/1183175 to your computer and use it in GitHub Desktop.
import re
import sys

# Token kind names.  Each one is also the name of a group in ``token_pattern``,
# so ``match.lastgroup`` yields one of these strings directly.
IDENTIFIER = "IDENTIFIER"
INTEGER = "INTEGER"
STRING = "STRING"
COMMENT = "COMMENT"
DOT = "DOT"
COLON = "COLON"
SEMICOLON = "SEMICOLON"
OPEN_BLOCK = "OPEN_BLOCK"
CLOSE_BLOCK = "CLOSE_BLOCK"
NEWLINE = "NEWLINE"
WHITESPACE = "WHITESPACE"

# One alternative per token kind; alternatives are tried in order.
#  * STRING is listed before IDENTIFIER so that an optional u/U and r/R prefix
#    is consumed as part of the string literal instead of being split off as a
#    separate identifier.
#  * WHITESPACE excludes "\n" so that every line break is reported as its own
#    NEWLINE token, even when it is preceded by trailing blanks.
# The pattern is compiled with re.VERBOSE, so the line breaks inside it are
# not significant.
token_pattern = r"""
(?P<STRING>([uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"|
[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'))
|(?P<IDENTIFIER>[a-zA-Z_][a-zA-Z0-9_]*)
|(?P<INTEGER>[0-9]+)
|(?P<COMMENT>//[^\r\n]*)
|(?P<DOT>\.)
|(?P<COLON>:)
|(?P<SEMICOLON>;)
|(?P<OPEN_BLOCK>[{])
|(?P<CLOSE_BLOCK>[}])
|(?P<NEWLINE>\n)
|(?P<WHITESPACE>[^\S\n]+)
"""
token_re = re.compile(token_pattern, re.VERBOSE)
class Token(object):
    """A single lexical token.

    Attributes are stored exactly as passed to the constructor:

    value -- the matched source text
    kind  -- the token kind name
    line  -- the line number supplied by the caller
    start -- the start offset supplied by the caller
    end   -- the end offset supplied by the caller
    """

    def __init__(self, value, kind, line, start, end):
        self.value = value
        self.kind = kind
        self.line = line
        self.start = start
        self.end = end

    def __repr__(self):
        # Position information is deliberately left out to keep dumps short.
        return '<Token %s %r>' % (self.kind, self.value)
class TokenizerException(Exception):
    """Raised when the input text cannot be tokenized completely."""
def tokenize(text):
    """Yield a Token for every lexeme in *text*, in source order.

    Each token carries the 0-based number of the line it starts on, plus its
    start and end offsets measured from the beginning of that line.

    This is a generator, so errors surface while iterating.

    Raises TokenizerException at the first position where none of the token
    patterns matches.
    """
    pos = 0
    line = 0
    line_start = 0
    length = len(text)
    while pos < length:
        # Anchor the match at ``pos`` so that unrecognised characters are
        # reported instead of being silently skipped over.
        match = token_re.match(text, pos)
        if match is None:
            raise TokenizerException('tokenizer stopped at pos %r of %r' % (
                pos, length))
        tokkind = match.lastgroup
        tokvalue = match.group(tokkind)
        start = pos - line_start
        end = match.end() - line_start
        yield Token(tokvalue, tokkind, line, start, end)
        # Advance the line bookkeeping for any token that contains line
        # breaks, whatever its kind.  Comment tokens stop before the line
        # break, so they need no special handling here.
        newlines = tokvalue.count('\n')
        if newlines:
            line += newlines
            line_start = pos + tokvalue.rfind('\n') + 1
        pos = match.end()
if __name__ == "__main__":
    # Script entry point: tokenize the file named by the first command-line
    # argument and print one token per line.  The guard keeps this from
    # running when the module is imported.
    with open(sys.argv[1]) as source_file:
        source_text = source_file.read()
    for token in tokenize(source_text):
        # Parenthesised single-argument form works as both the Python 2
        # print statement and the Python 3 print function.
        print(token)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment