Skip to content

Instantly share code, notes, and snippets.

@esho
Last active March 19, 2024 17:11
Show Gist options
  • Save esho/c12a4f7f3646b0ba5286daec8bf8af62 to your computer and use it in GitHub Desktop.
Save esho/c12a4f7f3646b0ba5286daec8bf8af62 to your computer and use it in GitHub Desktop.
Tokenizer — parses text by character type and returns a list of tokens tagged by type
import pytest
import tokenizer
def test_string_with_spaces():
    """Space-separated words each become one STRING token."""
    results = tokenizer.tokenize('a string of words')
    expected = ['a', 'string', 'of', 'words']
    # Check the count first so spurious extra/missing tokens fail loudly.
    assert len(results) == len(expected)
    for result, word in zip(results, expected):
        assert result.token == word
        assert result.type == tokenizer.TokenType.STRING
def test_integers():
    """Space-separated digit runs each become one INTEGER token."""
    results = tokenizer.tokenize('123 456 789')
    expected = ['123', '456', '789']
    # Check the count first so spurious extra/missing tokens fail loudly.
    assert len(results) == len(expected)
    for result, digits in zip(results, expected):
        assert result.token == digits
        assert result.type == tokenizer.TokenType.INTEGER
def test_string_integers_symbols_with_spaces():
    """Mixed types split on type changes; surrounding whitespace is dropped."""
    results = tokenizer.tokenize(' a string & 3 ')
    expected = [
        ('a', tokenizer.TokenType.STRING),
        ('string', tokenizer.TokenType.STRING),
        ('&', tokenizer.TokenType.SYMBOL),
        ('3', tokenizer.TokenType.INTEGER),
    ]
    # Check the count first so spurious extra/missing tokens fail loudly.
    assert len(results) == len(expected)
    for result, (token, type_) in zip(results, expected):
        assert result.token == token
        assert result.type == type_
def test_string_integers_symbols_no_spaces():
    """Adjacent characters of different types split into separate tokens."""
    results = tokenizer.tokenize('Astring&3B')
    expected = [
        ('Astring', tokenizer.TokenType.STRING),
        ('&', tokenizer.TokenType.SYMBOL),
        ('3', tokenizer.TokenType.INTEGER),
        ('B', tokenizer.TokenType.STRING),
    ]
    # Check the count first so spurious extra/missing tokens fail loudly.
    assert len(results) == len(expected)
    for result, (token, type_) in zip(results, expected):
        assert result.token == token
        assert result.type == type_
def test_empty_string():
    """An empty input yields no tokens at all."""
    assert tokenizer.tokenize('') == []
import enum
import re
# Single-character classifier patterns, consulted in priority order:
# digits first, then ASCII letters, then any remaining non-whitespace.
# Whitespace matches none of them and is therefore unclassifiable.
integer = re.compile(r"\d")
string = re.compile(r"[a-zA-Z]")
symbol = re.compile(r"[^\s]")


class TokenType(enum.Enum):
    """Classification of a token: a run of digits, letters, or other symbols."""

    INTEGER = enum.auto()
    STRING = enum.auto()
    SYMBOL = enum.auto()


class Token:
    """A maximal run of characters that all share one TokenType.

    A Token starts empty (type None) and grows one character at a time via
    add_char(); every added character must agree with the type established
    by the first one.
    """

    def __init__(self, char=None, char_type=None):
        """Create a token, optionally seeded with a first character.

        An unclassifiable seed character (e.g. whitespace) leaves the
        token empty, exactly as add_char() would.
        """
        self._token = ""
        self._type = None
        if char is not None:
            self.add_char(char, char_type)

    def __repr__(self):
        return f"Token(token={self.token}, type={self.type})"

    def __len__(self):
        # An empty Token is falsy, which lets callers write `if token:`.
        return len(self._token)

    @property
    def token(self):
        """The accumulated characters of this token."""
        return self._token

    @property
    def type(self):
        """The TokenType of this token, or None while it is still empty."""
        return self._type

    def _check(self):
        # Invariant: text and type are either both unset or both set.
        # Raise (not assert) so the check survives `python -O`.
        if (self._type is None) != (self._token == ""):
            raise RuntimeError(f"inconsistent token state: {self!r}")

    def _set_type(self, type_):
        # A token's type is fixed by its first character; reject any change.
        # Raise (not assert) so the check survives `python -O`.
        if self._type is not None and self._type != type_:
            raise ValueError(
                f"cannot change token type from {self._type} to {type_}"
            )
        if self._type is None:
            self._type = type_

    def add_char(self, char, char_type=None):
        """Append one character; unclassifiable characters are ignored.

        char_type may be supplied to skip re-classification; otherwise the
        character is classified with get_type(). Raises ValueError if the
        character's type conflicts with this token's established type.
        """
        char_type = char_type or self.get_type(char)
        if not char or not char_type:
            return
        self._check()
        self._set_type(char_type)
        self._token += char

    @staticmethod
    def get_type(char):
        """Classify a single character, or return None for whitespace."""
        if integer.match(char):
            return TokenType.INTEGER
        if string.match(char):
            return TokenType.STRING
        if symbol.match(char):
            return TokenType.SYMBOL
        return None
def tokenize(raw_string):
    """Split raw_string into a list of Tokens, grouping same-typed runs.

    Leading and trailing whitespace is stripped; interior whitespace
    separates tokens but produces no token of its own.
    """
    tokens = []
    current = Token()
    for char in raw_string.strip():
        char_type = Token.get_type(char)
        if current.type and char_type != current.type:
            # Type boundary: close out the finished token (if non-empty)
            # and start a new one seeded with this character.
            if current:
                tokens.append(current)
            current = Token(char)
        else:
            current.add_char(char)
    if current:
        tokens.append(current)
    return tokens
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment