Last active
April 16, 2021 11:05
-
-
Save typoman/485cf13aaf6e9615ecc0f723bbeec017 to your computer and use it in GitHub Desktop.
Simple lexer using regex in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
``` | |
import re | |
from collections import namedtuple | |
# One lexical unit: its type name, the matched text, and the 1-based line and
# 0-based column at which the match starts.
Token = namedtuple('Token', ['type', 'value', 'line', 'column'])


class Lexer(object):
    """Regex-driven tokenizer that yields ``Token`` tuples when iterated.

    Iterating the lexer returns every token except NEWLINE and WHITESPACE,
    which are consumed silently (NEWLINE advances the line counter). INDENT
    tokens, i.e. leading spaces or tabs on a line, are returned.

    Attributes:
        text: the source string being tokenized.
        textlength: ``len(text)``, cached at construction.
        position: index in ``text`` where the next match is attempted.
        line: 1-based number of the line containing ``position``.
        lineStart: index in ``text`` of the first character of that line.
    """

    # Words whose token type is replaced by a dedicated keyword type.
    _KEYWORDS = {
        'if': 'IF_CONDITION',
        'while': 'WHILE_LOOP',
        'def': 'FUNCTION_DEFINITION',
        'return': 'FUNCTION_RETURNS',
        'True': 'TRUE_STATE',
    }

    # Alternatives are tried in this order, so earlier entries win. INDENT must
    # precede WHITESPACE, and STRING/COMMENT must precede the catch-all NAME.
    _TOKEN_PATTERNS = [
        # Digits with an optional fractional part; the inner group is
        # non-capturing so only the named groups can become ``lastgroup``.
        ('NUMBER', r'\d+(?:\.\d*)?'),
        ('ASSIGN', r'='),                   # the = character
        ('SYMBOL', r'[+*\/\-():>]'),        # one of + * / - ( ) : >
        ('NEWLINE', r'\n'),                 # line terminator
        ('INDENT', r'^[ \t]+'),             # spaces/tabs at the start of a line
        ('WHITESPACE', r'[ \t]'),           # a single space or tab elsewhere
        # From # to the end of the line; ``*`` so that a bare "#" is accepted.
        ('COMMENT', r'#[^\n]*'),
        # Text between double quotes; ``*`` so that the empty literal "" is a
        # STRING rather than falling through to NAME.
        ('STRING', r'"[^"]*"'),
        # Anything else. The first character may not be a digit (that would be
        # a NUMBER), but later characters may, so ``foo1`` is a single NAME.
        ('NAME', r'[^=#+*\/\- \t\d\n():>][^=#+*\/\- \t\n():>]*'),
    ]
    _RE_TOKEN_PATTERN = '|'.join('(?P<%s>%s)' % pair for pair in _TOKEN_PATTERNS)
    # MULTILINE lets the ^ in INDENT match right after every newline.
    _RE_GET_TOKEN = re.compile(_RE_TOKEN_PATTERN, flags=re.MULTILINE).match

    def __init__(self, text):
        """Prepare to tokenize ``text`` from its first character."""
        self.line = 1
        self.position = 0
        self.lineStart = 0
        self.text = text
        self.textlength = len(text)

    def __iter__(self):
        """Return the lexer itself; it is its own iterator."""
        return self

    def __next__(self):
        """Return the next token that is neither NEWLINE nor WHITESPACE.

        Raises:
            StopIteration: propagated from ``getCurrentToken`` at end of text.
            RuntimeError: propagated from ``getCurrentToken`` when nothing
                matches at the current position.
        """
        while True:
            token = self.getCurrentToken()
            if token.type == 'NEWLINE':
                # ``position`` already points past the newline here.
                self.lineStart = self.position
                self.line += 1
            elif token.type != 'WHITESPACE':
                return token

    def getCurrentToken(self):
        """Match the token at ``position``, advance past it, and return it.

        Unlike iteration, this also returns NEWLINE and WHITESPACE tokens, and
        it leaves line bookkeeping for NEWLINE tokens to ``__next__``. Tokens
        of any other type that contain newlines (a multi-line string literal)
        advance ``line`` and ``lineStart`` here so later tokens report correct
        positions.

        Raises:
            StopIteration: the whole text has been consumed.
            RuntimeError: no pattern matches at the current position.
        """
        if self.position >= self.textlength:
            raise StopIteration()
        matched = self._RE_GET_TOKEN(self.text, self.position)
        if matched is None:
            raise RuntimeError('Unexpected character %r on line %d' % (self.text[self.position], self.line))
        tokenType = matched.lastgroup
        tokenValue = matched.group(tokenType)
        token = Token(
            self._KEYWORDS.get(tokenValue, tokenType),
            tokenValue,
            self.line,
            matched.start() - self.lineStart,
        )
        self.position = matched.end()
        if tokenType != 'NEWLINE' and '\n' in tokenValue:
            # The token is reported at its starting line; everything after it
            # lives on the line that follows its last embedded newline.
            self.line += tokenValue.count('\n')
            self.lineStart = matched.start() + tokenValue.rfind('\n') + 1
        return token
if __name__ == '__main__':
    # Demo: tokenize a small sample program and print each token.
    # NOTE(review): the sample's indentation was lost in the copy this was
    # taken from and has been restored conventionally; confirm against the
    # original gist.
    test = '''
def testFunc(inputNum):
    while True:
        if inputNume > 10:
            return inputNum
testString = "نوشته"
'''
    lexer = Lexer(test)
    for token in lexer:
        print(token)
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment