typoman/pythonSimpleRegexLexer.py

## pythonSimpleRegexLexer.py
```
import re
from collections import namedtuple

Token = namedtuple('Token', ['type', 'value', 'line', 'column'])


class Lexer(object):
    """Iterator that splits text into ``Token`` tuples with one combined regex.

    Every entry of ``_TOKEN_PATTERNS`` becomes a named group of a single
    alternation; the first alternative that matches at the current position
    wins, so the order of the list matters.  Iterating over an instance
    yields every token except ``NEWLINE`` and ``WHITESPACE``, which are
    consumed silently (``NEWLINE`` advances the line counter).

    ``Token.line`` is 1-based; ``Token.column`` is the 0-based offset of the
    token from the start of its line.
    """

    # Token values that get a dedicated type instead of the matched pattern name.
    _KEYWORDS = {
        'if':       'IF_CONDITION',
        'while':    'WHILE_LOOP',
        'def':      'FUNCTION_DEFINITION',
        'return':   'FUNCTION_RETURNS',
        'True':     'TRUE_STATE',
        }

    _TOKEN_PATTERNS = [
        ('NUMBER',      r'\d+(?:\.\d*)?'), # digits with an optional fractional part
        ('ASSIGN',      r'='), # the = character
        ('SYMBOL',      r'[+*\/\-():>]'), # any of the +*/-():> characters
        ('NEWLINE',     r'\n'), # new line character
        ('INDENT',      r'^[ \t]+'), # spaces or tabs at the start of a line
        ('WHITESPACE',  r'[ \t]'), # a space or tab anywhere else
        ('COMMENT',     r'#[^\n]*'), # from the # mark to the end of the line (may be empty)
        ('STRING',      r'"[^"]*"'), # anything inside "" marks, including nothing
        # Anything not claimed by the patterns above; digits are allowed after
        # the first character so that names such as ``x2`` stay in one token.
        ('NAME',        r'[^=#+*\/\- \t\d\n():>][^=#+*\/\- \t\n():>]*'),
    ]
    _RE_TOKEN_PATTERN = '|'.join('(?P<%s>%s)' % pair for pair in _TOKEN_PATTERNS)
    # MULTILINE lets ``^`` in INDENT match right after every newline.
    _RE_GET_TOKEN = re.compile(_RE_TOKEN_PATTERN, flags=re.MULTILINE).match

    def __init__(self, text):
        """Prepare to tokenize ``text`` starting at its first character."""
        self.line = 1           # 1-based number of the line being scanned
        self.position = 0       # index in ``text`` of the next character to scan
        self.lineStart = 0      # index in ``text`` where the current line begins
        self.text = text
        self.textlength = len(text)

    def __iter__(self):
        """Return the lexer itself; it is its own iterator."""
        return self

    def __next__(self):
        """Return the next token that is neither NEWLINE nor WHITESPACE.

        Raises:
            StopIteration: when the whole text has been consumed.
            RuntimeError: when no pattern matches at the current position.
        """
        while True:
            token = self.getCurrentToken()
            if token.type == 'NEWLINE':
                # The next line starts right after the consumed newline.
                self.lineStart = self.position
                self.line += 1
            elif token.type != "WHITESPACE":
                return token

    def getCurrentToken(self):
        """Match one token at ``self.position`` and advance past it.

        Unlike ``__next__`` this returns NEWLINE and WHITESPACE tokens too and
        does not update the line bookkeeping for a NEWLINE token.

        Raises:
            StopIteration: when the whole text has been consumed.
            RuntimeError: when no pattern matches at the current position.
        """
        if self.position >= self.textlength:
            raise StopIteration()
        matched = self._RE_GET_TOKEN(self.text, self.position)
        if matched is None:
            raise RuntimeError('Unexpected character %r on line %d' %(self.text[self.position], self.line))
        patternName = matched.lastgroup
        tokenValue = matched.group(patternName)
        token = Token(
            self._KEYWORDS.get(tokenValue, patternName),
            tokenValue,
            self.line,
            matched.start() - self.lineStart,
        )
        self.position = matched.end()
        # A token such as a multi-line string can swallow newlines that never
        # surface as NEWLINE tokens; account for them here so that the line
        # and column of the following tokens stay correct.
        if patternName != 'NEWLINE' and '\n' in tokenValue:
            self.line += tokenValue.count('\n')
            self.lineStart = matched.start() + tokenValue.rfind('\n') + 1
        return token

if __name__ == '__main__':
    # Demo: tokenize a small sample program and print every token.
    test = '''
    def testFunc(inputNum):
        while True:
            if inputNum > 10:
                return inputNum
    testString = "نوشته"
    '''
    for token in Lexer(test):
        print(token)
```
	```
	import re
	from collections import namedtuple

	Token = namedtuple('Token', ['type', 'value', 'line', 'column'])


	class Lexer(object):
	    """Iterator that splits text into ``Token`` tuples with one combined regex.

	    Every entry of ``_TOKEN_PATTERNS`` becomes a named group of a single
	    alternation; the first alternative that matches at the current position
	    wins, so the order of the list matters.  Iterating over an instance
	    yields every token except ``NEWLINE`` and ``WHITESPACE``, which are
	    consumed silently (``NEWLINE`` advances the line counter).

	    ``Token.line`` is 1-based; ``Token.column`` is the 0-based offset of the
	    token from the start of its line.
	    """

	    # Token values that get a dedicated type instead of the matched pattern name.
	    _KEYWORDS = {
	        'if': 'IF_CONDITION',
	        'while': 'WHILE_LOOP',
	        'def': 'FUNCTION_DEFINITION',
	        'return': 'FUNCTION_RETURNS',
	        'True': 'TRUE_STATE',
	    }

	    _TOKEN_PATTERNS = [
	        ('NUMBER', r'\d+(?:\.\d*)?'), # digits with an optional fractional part
	        ('ASSIGN', r'='), # the = character
	        ('SYMBOL', r'[+*\/\-():>]'), # any of the +*/-():> characters
	        ('NEWLINE', r'\n'), # new line character
	        ('INDENT', r'^[ \t]+'), # spaces or tabs at the start of a line
	        ('WHITESPACE', r'[ \t]'), # a space or tab anywhere else
	        ('COMMENT', r'#[^\n]*'), # from the # mark to the end of the line (may be empty)
	        ('STRING', r'"[^"]*"'), # anything inside "" marks, including nothing
	        # Anything not claimed by the patterns above; digits are allowed after
	        # the first character so that names such as ``x2`` stay in one token.
	        ('NAME', r'[^=#+*\/\- \t\d\n():>][^=#+*\/\- \t\n():>]*'),
	    ]
	    # The alternatives must be joined with a bare ``|`` to form an alternation.
	    _RE_TOKEN_PATTERN = '|'.join('(?P<%s>%s)' % pair for pair in _TOKEN_PATTERNS)
	    # MULTILINE lets ``^`` in INDENT match right after every newline.
	    _RE_GET_TOKEN = re.compile(_RE_TOKEN_PATTERN, flags=re.MULTILINE).match

	    def __init__(self, text):
	        """Prepare to tokenize ``text`` starting at its first character."""
	        self.line = 1           # 1-based number of the line being scanned
	        self.position = 0       # index in ``text`` of the next character to scan
	        self.lineStart = 0      # index in ``text`` where the current line begins
	        self.text = text
	        self.textlength = len(text)

	    def __iter__(self):
	        """Return the lexer itself; it is its own iterator."""
	        return self

	    def __next__(self):
	        """Return the next token that is neither NEWLINE nor WHITESPACE.

	        Raises:
	            StopIteration: when the whole text has been consumed.
	            RuntimeError: when no pattern matches at the current position.
	        """
	        while True:
	            token = self.getCurrentToken()
	            if token.type == 'NEWLINE':
	                # The next line starts right after the consumed newline.
	                self.lineStart = self.position
	                self.line += 1
	            elif token.type != "WHITESPACE":
	                return token

	    def getCurrentToken(self):
	        """Match one token at ``self.position`` and advance past it.

	        Unlike ``__next__`` this returns NEWLINE and WHITESPACE tokens too and
	        does not update the line bookkeeping for a NEWLINE token.

	        Raises:
	            StopIteration: when the whole text has been consumed.
	            RuntimeError: when no pattern matches at the current position.
	        """
	        if self.position >= self.textlength:
	            raise StopIteration()
	        matched = self._RE_GET_TOKEN(self.text, self.position)
	        if matched is None:
	            raise RuntimeError('Unexpected character %r on line %d' %(self.text[self.position], self.line))
	        patternName = matched.lastgroup
	        tokenValue = matched.group(patternName)
	        token = Token(
	            self._KEYWORDS.get(tokenValue, patternName),
	            tokenValue,
	            self.line,
	            matched.start() - self.lineStart,
	        )
	        self.position = matched.end()
	        # A token such as a multi-line string can swallow newlines that never
	        # surface as NEWLINE tokens; account for them here so that the line
	        # and column of the following tokens stay correct.
	        if patternName != 'NEWLINE' and '\n' in tokenValue:
	            self.line += tokenValue.count('\n')
	            self.lineStart = matched.start() + tokenValue.rfind('\n') + 1
	        return token

	if __name__ == '__main__':
	    # Demo: tokenize a small sample program and print every token.
	    test = '''
	    def testFunc(inputNum):
	        while True:
	            if inputNum > 10:
	                return inputNum
	    testString = "نوشته"
	    '''
	    for token in Lexer(test):
	        print(token)
	```