Created
July 18, 2012 20:44
-
-
Save fielder/3138757 to your computer and use it in GitHub Desktop.
Simple token parser. Understands quoted tokens, escape characters, and single-line comments.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| A simple token parser. | |
| """ | |
| import string | |
class Tokenizer(object):
    """
    Utility to parse out tokens from a possibly multi-line string.

    All tokens are picked out by white-space separation. Quoted tokens
    can be used to include white-spaces and escape characters.
    The single-line comment character # can be used.

    Tokens are fetched one at a time with get(), which returns None once
    the input is exhausted. The tokenizer is also iterable, yielding every
    remaining token in order.

    Example:

    ' token1 2nd_token "quoted token" # comment to end of line '
    """

    # Character following a backslash inside a quoted token -> the control
    # character it stands for. Any character not listed here (including the
    # double quote and the backslash itself) is taken literally.
    _ESCAPES = {
        "0": "\x00",
        "a": "\a",
        "b": "\b",
        "t": "\t",
        "n": "\n",
        "v": "\v",
        "f": "\f",
        "r": "\r",
        "e": "\x1b",
    }

    # A bare (unquoted) token ends at white-space, at the start of a
    # comment, or at the opening quote of a quoted token.
    _BARE_TOKEN_END = string.whitespace + "#\""

    def __init__(self, text):
        """Prepare to tokenize *text*, starting at its first character."""
        self._txt = text
        self._idx = 0

    def __iter__(self):
        """
        Yield the remaining tokens until the input is exhausted.

        Iteration consumes the tokenizer: it advances the same position
        that get() uses.
        """
        while True:
            token = self.get()
            # Compare against None explicitly: an empty quoted token ("")
            # is a valid, falsy token and must not end the iteration.
            if token is None:
                return
            yield token

    def _getQuotedToken(self):
        """
        Parse the quoted token whose opening quote is at the current position.

        Returns the token contents with escape sequences translated and
        leaves the position just past the closing quote.

        Raises ValueError if the input ends before the closing quote or
        right after a backslash.
        """
        txt = self._txt
        end = len(txt)
        self._idx += 1  # skip initial quote
        parts = []
        while True:
            # Copy the run of ordinary characters up to the next quote or
            # backslash in one slice.
            start = self._idx
            while self._idx < end and txt[self._idx] not in "\\\"":
                self._idx += 1
            parts.append(txt[start:self._idx])

            if self._idx == end:
                # no terminating quote
                raise ValueError("unterminated quoted token")

            if txt[self._idx] == "\"":
                # end of quoted token
                self._idx += 1
                return "".join(parts)

            # escape character: skip the backslash, translate what follows
            self._idx += 1
            if self._idx == end:
                raise ValueError("escape sequence terminated by eof")
            escaped = txt[self._idx]
            parts.append(self._ESCAPES.get(escaped, escaped))
            self._idx += 1

    def _getNormalToken(self):
        """
        Parse the bare token starting at the current position.

        The token extends up to, but not including, the next white-space
        character, '#', or double quote (or to the end of the input).
        """
        txt = self._txt
        end = len(txt)
        start = self._idx
        while self._idx < end and txt[self._idx] not in self._BARE_TOKEN_END:
            self._idx += 1
        return txt[start:self._idx]

    def get(self):
        """
        Return the next token as a string, or None at the end of the input.

        Leading white-space and '#' comments (which run to the end of the
        line) are skipped.

        Raises ValueError if a quoted token is malformed.
        """
        txt = self._txt
        end = len(txt)
        while True:
            while self._idx < end and txt[self._idx] in string.whitespace:
                self._idx += 1

            if self._idx == end:
                # end of input; no token
                return None

            current = txt[self._idx]
            if current == "#":
                # comment; skip to the end of the line (or of the input).
                # The newline itself is consumed as white-space on the
                # next pass of the loop.
                newline = txt.find("\n", self._idx)
                self._idx = end if newline == -1 else newline
                continue

            # hit a token
            if current == "\"":
                return self._getQuotedToken()
            return self._getNormalToken()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment