Simple lexer using regex in Python
```
import re
from collections import namedtuple
Token = namedtuple('Token', ['type', 'value', 'line', 'column'])


class Lexer(object):

    # words that get their own token type instead of the generic NAME
    _KEYWORDS = {
        'if': 'IF_CONDITION',
        'while': 'WHILE_LOOP',
        'def': 'FUNCTION_DEFINITION',
        'return': 'FUNCTION_RETURNS',
        'True': 'TRUE_STATE',
    }

    _TOKEN_PATTERNS = [
        ('NUMBER', r'\d+(\.\d*)?'),  # one or more digits, optionally followed by a period and more digits for floats
        ('ASSIGN', r'='),  # the = character
        ('SYMBOL', r'[+*\/\-():>]'),  # any one of the + * / - ( ) : > characters
        ('NEWLINE', r'\n'),  # newline character
        ('INDENT', r'^[ \t]+'),  # spaces or tabs at the start of a line
        ('WHITESPACE', r'[ \t]'),  # a single space or tab character
        ('COMMENT', r'#[^\n]+'),  # anything after the # mark until the end of the line
        ('STRING', r'"[^"]+"'),  # anything inside "" marks
        ('NAME', r'[^=#+*\/\- \t\d\n():>]+'),  # any run of characters not used by the other tokens (identifiers and keywords)
    ]

    # combine all patterns into a single regex of named groups;
    # the name of the group that matched becomes the token type
    _RE_TOKEN_PATTERN = '|'.join('(?P<%s>%s)' % pair for pair in _TOKEN_PATTERNS)
    _RE_GET_TOKEN = re.compile(_RE_TOKEN_PATTERN, flags=re.MULTILINE).match

    def __init__(self, text):
        self.line = 1
        self.position = 0
        self.lineStart = 0
        self.text = text
        self.textlength = len(text)

    def __iter__(self):
        return self

    def __next__(self):
        # skip whitespace, track line numbers on newlines, and return the next meaningful token
        while True:
            token = self.getCurrentToken()
            if token.type == 'NEWLINE':
                self.lineStart = self.position
                self.line += 1
            elif token.type != 'WHITESPACE':
                return token

    def getCurrentToken(self):
        if self.position >= self.textlength:
            raise StopIteration()
        matched = self._RE_GET_TOKEN(self.text, self.position)
        if matched is not None:
            tokenType = matched.lastgroup
            column = matched.start() - self.lineStart
            tokenValue = matched.group(tokenType)
            # report keywords with their dedicated token type instead of NAME
            tokenType = self._KEYWORDS.get(tokenValue, tokenType)
            self.position = matched.end()
            return Token(tokenType, tokenValue, self.line, column)
        raise RuntimeError('Unexpected character %r on line %d' % (self.text[self.position], self.line))


if __name__ == '__main__':
    test = '''
def testFunc(inputNum):
    while True:
        if inputNum > 10:
            return inputNum
testString = "نوشته"
'''
    l = Lexer(test)
    for token in l:
        print(token)
```
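Since each emitted `Token` is a namedtuple, downstream code can filter or group tokens by field. A minimal usage sketch, assuming the `Lexer` class above is in scope (the `source` string is just a made-up input):

```
# tokenize a one-line assignment and keep only the identifier names,
# using the Lexer class defined above
source = 'answer = counter + 42'
tokens = list(Lexer(source))
names = [tok.value for tok in tokens if tok.type == 'NAME']
print(names)  # ['answer', 'counter']
```

Keywords such as `def` or `while` would come back with the types from `_KEYWORDS` rather than `NAME`, because `getCurrentToken` remaps them via `self._KEYWORDS.get(tokenValue, tokenType)`.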