Skip to content

Instantly share code, notes, and snippets.

@vsajip
Forked from mitsuhiko/did-you-know-re.py
Created March 31, 2018 06:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vsajip/8e6c96dfae0a401603223450cb490bec to your computer and use it in GitHub Desktop.
Save vsajip/8e6c96dfae0a401603223450cb490bec to your computer and use it in GitHub Desktop.
from sre_parse import Pattern, SubPattern, parse as sre_parse
from sre_compile import compile as sre_compile
from sre_constants import BRANCH, SUBPATTERN
class Scanner(object):
    """Tokenizer that tries a table of named regular expressions in one pass.

    All token expressions are merged into a single compiled alternation in
    which every alternative is wrapped in its own group.  The index of the
    last group closed by a match (``match.lastindex``) therefore identifies
    which token matched.
    """

    def __init__(self, tokens, flags=0):
        """Compile the token table.

        :param tokens: iterable of ``(token_name, regex)`` pairs; earlier
            entries become earlier alternatives of the combined pattern.
        :param flags: ``re`` flags applied to every expression.
        """
        # Imported locally so this class does not rely on the module's
        # import block.
        import re

        self._tokens = []
        lexicon = []
        for token_name, regex in tokens:
            self._tokens.append(token_name)
            # re.Scanner expects (pattern, action) pairs; the action is
            # never invoked here because only the compiled pattern is used.
            lexicon.append((regex, None))

        # re.Scanner assembles the alternation-of-groups pattern with the
        # parser internals of the running interpreter, so no private
        # sre_parse names (whose shape differs between Python versions)
        # have to be touched here.  Its ``scanner`` attribute is the compiled
        # pattern; that pattern's ``scanner`` method creates a stateful
        # scanner object for one input string.
        self._scanner = re.Scanner(lexicon, flags).scanner.scanner

    def scan(self, string, skip=False):
        """Yield ``(token_name, match)`` pairs for successive matches.

        :param string: the text to tokenize.
        :param skip: when false, each match must start exactly where the
            previous one ended; when true, text that matches no token is
            skipped over by searching forward for the next match.
        :raises EOFError: with the offset where scanning stopped as its
            argument, if scanning stopped before the end of ``string``
            (in either mode).
        """
        scanner = self._scanner(string)
        next_match = scanner.search if skip else scanner.match
        pos = 0
        while True:
            m = next_match()
            if m is None:
                break
            end = m.end()
            # A match ending where the previous one ended made no progress;
            # stop instead of yielding it.
            if end == pos:
                break
            # Groups are numbered from 1 in token-table order.
            yield self._tokens[m.lastindex - 1], m
            pos = end
        if pos < len(string):
            raise EOFError(pos)
def test_precise():
    """Tokenize an arithmetic expression where every character must match.

    Prints one ``(token_name, matched_text)`` tuple per token.
    """
    scanner = Scanner([
        ('whitespace', r'\s+'),
        ('plus', r'\+'),
        ('minus', r'\-'),
        ('mult', r'\*'),
        ('div', r'/'),
        ('num', r'\d+'),
        ('paren_open', r'\('),
        ('paren_close', r'\)'),
    ])
    for token, match in scanner.scan('(1 + 2) * 3'):
        # Print an explicit tuple so the output is the tuple repr whether
        # print is a statement or a function.
        print((token, match.group()))
def test_lenient():
    """Tokenize wiki-style markup, reporting unmatched text between tokens.

    Prints ``(token_name, matched_text)`` for each token and
    ``(None, text)`` for each non-empty stretch of text no token matched.
    """
    # Raw strings keep the backslashes as regex escapes without relying on
    # Python passing unknown string escapes through unchanged.
    scanner = Scanner([
        ('bold', r'\*\*'),
        ('link', r'\[\[(.*?)\]\]'),
    ])
    input_text = 'Hello **World**! [[Stuff]]'
    pos = 0
    for token, match in scanner.scan(input_text, skip=True):
        # Text between the previous match and this one matched no token.
        skipped = input_text[pos:match.start()]
        if skipped:
            print((None, skipped))
        print((token, match.group()))
        pos = match.end()
    # Text remaining after the last match.
    skipped = input_text[pos:]
    if skipped:
        print((None, skipped))
if __name__ == '__main__':
    # Run both demos when executed as a script.  A single parenthesized
    # argument prints the same text whether print is a statement or a
    # function.
    print('Precise:')
    test_precise()
    print('Lenient:')
    test_lenient()
Precise:
('paren_open', '(')
('num', '1')
('whitespace', ' ')
('plus', '+')
('whitespace', ' ')
('num', '2')
('paren_close', ')')
('whitespace', ' ')
('mult', '*')
('whitespace', ' ')
('num', '3')
Lenient:
(None, 'Hello ')
('bold', '**')
(None, 'World')
('bold', '**')
(None, '! ')
('link', '[[Stuff]]')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment