Last active
March 19, 2024 17:11
-
-
Save esho/c12a4f7f3646b0ba5286daec8bf8af62 to your computer and use it in GitHub Desktop.
Tokenizer - parses text by character type and returns a list of tokens tagged by type
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pytest | |
import tokenizer | |
def test_string_with_spaces():
    """Space-separated words each become a single STRING token."""
    results = tokenizer.tokenize('a string of words')
    for index, word in enumerate(['a', 'string', 'of', 'words']):
        assert results[index].token == word
        assert results[index].type == tokenizer.TokenType.STRING
def test_integers():
    """Space-separated digit runs each become a single INTEGER token."""
    results = tokenizer.tokenize('123 456 789')
    for index, digits in enumerate(['123', '456', '789']):
        assert results[index].token == digits
        assert results[index].type == tokenizer.TokenType.INTEGER
def test_string_integers_symbols_with_spaces():
    """Mixed words, symbols, and digits with extra whitespace are typed correctly."""
    results = tokenizer.tokenize(' a string & 3 ')
    expected = [
        ('a', tokenizer.TokenType.STRING),
        ('string', tokenizer.TokenType.STRING),
        ('&', tokenizer.TokenType.SYMBOL),
        ('3', tokenizer.TokenType.INTEGER),
    ]
    for index, (text, token_type) in enumerate(expected):
        assert results[index].token == text
        assert results[index].type == token_type
def test_string_integers_symbols_no_spaces():
    """A type change splits tokens even without intervening whitespace."""
    results = tokenizer.tokenize('Astring&3B')
    expected = [
        ('Astring', tokenizer.TokenType.STRING),
        ('&', tokenizer.TokenType.SYMBOL),
        ('3', tokenizer.TokenType.INTEGER),
        ('B', tokenizer.TokenType.STRING),
    ]
    for index, (text, token_type) in enumerate(expected):
        assert results[index].token == text
        assert results[index].type == token_type
def test_empty_string():
    """An empty input produces no tokens at all."""
    assert tokenizer.tokenize('') == []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import enum | |
import re | |
# Single-character classifier patterns, consulted in priority order by
# Token.get_type (INTEGER, then STRING, then SYMBOL).
integer = re.compile(r"\d")  # a digit (Unicode-aware by default)
string = re.compile(r"[a-zA-Z]")  # an ASCII letter
symbol = re.compile(r"[^\s]")  # any other non-whitespace character
class TokenType(enum.Enum):
    """Classification of a token, derived from its first character."""

    INTEGER = enum.auto()  # run of digits
    STRING = enum.auto()  # run of ASCII letters
    SYMBOL = enum.auto()  # run of other non-whitespace characters
class Token:
    """A run of characters that all share one TokenType.

    A Token starts empty and grows one character at a time via
    ``add_char``; its type is fixed by the first typed character added.
    Untyped characters (whitespace) are silently ignored.
    """

    def __init__(self, char=None, char_type=None):
        self._token = ""
        self._type = None
        if char is not None:
            self.add_char(char, char_type)

    def __repr__(self):
        return f"Token(token={self.token}, type={self.type})"

    def __len__(self):
        # An empty Token is falsy, which tokenize() relies on when flushing.
        return len(self._token)

    @property
    def token(self):
        """The accumulated text of this token."""
        return self._token

    @property
    def type(self):
        """The TokenType of this token, or None while empty."""
        return self._type

    def _check(self):
        # Invariant: text and type are either both unset or both set.
        is_empty = self.type is None and self.token == ""
        is_filled = self.token != "" and self.type is not None
        assert is_empty or is_filled

    def _set_type(self, type_):
        # The type may only be assigned once; repeating the same type is a no-op.
        assert self._type == type_ or self._type is None
        if self._type is None:
            self._type = type_

    def add_char(self, char, char_type=None):
        """Append *char* to this token; drop empty or untyped characters."""
        char_type = char_type or self.get_type(char)
        if not char or not char_type:
            return
        self._check()
        self._set_type(char_type)
        self._token += char

    @staticmethod
    def get_type(char):
        """Classify a single character; return None for whitespace."""
        # Module-level patterns are tried in fixed priority order.
        for pattern, token_type in (
            (integer, TokenType.INTEGER),
            (string, TokenType.STRING),
            (symbol, TokenType.SYMBOL),
        ):
            if pattern.match(char):
                return token_type
        return None
def tokenize(raw_string):
    """Split *raw_string* into a list of Token objects.

    Consecutive characters of the same TokenType are grouped into one
    token; whitespace separates tokens but is never emitted as one.

    Args:
        raw_string: the text to tokenize.

    Returns:
        A list of Token instances in the order they appear in the input;
        empty for empty or all-whitespace input.
    """
    tokens = []
    token = Token()
    # Iterate the stripped input directly.  The original reversed the
    # string into a list and pop()ed from the end -- an extra O(n) copy
    # just to walk left-to-right.
    for char in raw_string.strip():
        char_type = Token.get_type(char)
        if not token.type or char_type == token.type:
            # Same type as the current token (or no token started yet):
            # extend it.  Whitespace has no type and is dropped by add_char.
            token.add_char(char)
        else:
            # Type boundary: flush the finished token and start a new one.
            # A whitespace char starts an empty Token, which stays falsy.
            if token:
                tokens.append(token)
            token = Token(char)
    if token:  # flush the trailing token, if any
        tokens.append(token)
    return tokens
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment