Skip to content

Instantly share code, notes, and snippets.

@fielder
Created July 18, 2012 20:44
Show Gist options
  • Select an option

  • Save fielder/3138757 to your computer and use it in GitHub Desktop.

Select an option

Save fielder/3138757 to your computer and use it in GitHub Desktop.
Simple token parser. Understands quoted tokens, escape characters, and single-line comments.
"""
A simple token parser.
"""
import string
class Tokenizer(object):
    """
    Utility to parse out tokens from a possibly multi-line string.

    All tokens are picked out by white-space separation. Quoted tokens
    (delimited by double quotes) can be used to include white-space and
    backslash escape characters. The single-line comment character #
    can be used outside of quoted tokens; everything from it up to the
    end of the line is ignored.

    Tokens are fetched one at a time with get(), which returns None once
    the input is exhausted. The tokenizer is also iterable, yielding the
    remaining tokens in order.

    Example:
        ' token1 2nd_token "quoted token" # comment to end of line '
    yields the tokens 'token1', '2nd_token' and 'quoted token'.
    """

    # Characters that end an unquoted token: any white-space, the start of
    # a comment, or the start of a quoted token. Built once rather than
    # re-concatenated for every character scanned.
    _TOKEN_END = string.whitespace + "#\""

    # Character following a backslash -> character it stands for. Any
    # character not listed here stands for itself (so a backslash before a
    # quote or another backslash yields that character literally).
    _ESCAPES = {
        "0": "\x00",
        "a": "\a",
        "b": "\b",
        "t": "\t",
        "n": "\n",
        "v": "\v",
        "f": "\f",
        "r": "\r",
        "e": "\x1b",
    }

    def __init__(self, text):
        """
        Create a tokenizer positioned at the start of text.

        text -- the string to split into tokens
        """
        self._txt = text
        self._idx = 0  # index of the next unread character in _txt

    def __iter__(self):
        """
        Yield the remaining tokens until the input is exhausted.

        Compares against None explicitly because an empty quoted token
        ("") is a valid, falsy token.
        """
        while True:
            token = self.get()
            if token is None:
                return
            yield token

    def _getQuotedToken(self):
        """
        Parse the quoted token whose opening quote is at the current index.

        Returns the token's contents with escape sequences translated and
        the surrounding quotes removed, leaving the index just past the
        closing quote.

        Raises ValueError if the input ends before the closing quote or
        immediately after a backslash.
        """
        txt = self._txt
        end = len(txt)
        self._idx += 1  # skip initial quote
        # Collect pieces and join once at the end instead of repeatedly
        # concatenating strings.
        parts = []
        while True:
            # Take the run of ordinary characters up to the next backslash
            # or quote in one slice.
            start = self._idx
            while self._idx < end and txt[self._idx] not in "\\\"":
                self._idx += 1
            parts.append(txt[start:self._idx])

            if self._idx == end:
                # no terminating quote
                raise ValueError("unterminated quoted token")
            if txt[self._idx] == "\"":
                # end of quoted token
                self._idx += 1
                return "".join(parts)

            # Otherwise we stopped on a backslash: translate the escape.
            self._idx += 1
            if self._idx == end:
                raise ValueError("escape sequence terminated by eof")
            escaped = txt[self._idx]
            parts.append(self._ESCAPES.get(escaped, escaped))
            self._idx += 1

    def _getNormalToken(self):
        """
        Parse the unquoted token starting at the current index.

        The token runs until white-space, a comment character, a quote, or
        the end of the input; the terminating character is not consumed.
        """
        txt = self._txt
        end = len(txt)
        start = self._idx
        while self._idx < end and txt[self._idx] not in self._TOKEN_END:
            self._idx += 1
        return txt[start:self._idx]

    def get(self):
        """
        Return the next token, or None when the end of input is reached.

        Leading white-space and comments are skipped. Raises ValueError
        if a quoted token is malformed (see _getQuotedToken).
        """
        txt = self._txt
        end = len(txt)
        while True:
            while self._idx < end and txt[self._idx] in string.whitespace:
                self._idx += 1

            if self._idx == end:
                # end of input; no token
                return None

            current = txt[self._idx]
            if current == "#":
                # comment; skip to the end of the line (or of the input if
                # there is no further newline) and look for a token again
                newline = txt.find("\n", self._idx)
                self._idx = end if newline < 0 else newline
                continue

            # hit a token
            if current == "\"":
                return self._getQuotedToken()
            return self._getNormalToken()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment