Skip to content

Instantly share code, notes, and snippets.

@alextremblay
Last active July 15, 2021 15:42
Show Gist options
  • Save alextremblay/5ad5f379f7aa25a983f55673d974be7a to your computer and use it in GitHub Desktop.
Save alextremblay/5ad5f379f7aa25a983f55673d974be7a to your computer and use it in GitHub Desktop.
An example string tokenizer / regex splitter
from __future__ import annotations
from typing import Any, Callable, List, Literal, Tuple, Union, TypeVar, cast
import regex
from regex.regex import Match
# Sentinel used in place of a regex pattern in a Scanner rule: the rule whose
# "pattern" equals this value supplies the action applied to unmatched text.
DEFAULT = '_SCANNER_DEFAULT_PATTERN' # a sentinel value
# Generic type variable; not referenced anywhere in the visible part of this file.
T = TypeVar('T')
class TokenMeta(type):
    """Metaclass that creates named subclasses on demand through class attribute access.

    On any class using this metaclass, referencing an attribute whose name starts
    with an upper-case letter (on the class itself, not on an instance) returns a
    subclass of that class named ``'<ParentName>.<attribute>'``. The subclass is
    cached on the parent, so repeated references return the same object.

    See ``Token`` for an example.
    """

    def __getattribute__(cls: 'TokenMeta', name: str) -> object:
        """Look up ``name`` normally, but ignore auto-generated categories of base classes.

        Without this hook, a category that already exists on a base class would be
        found through the MRO and returned as-is, so ``Parent.Child.Name`` would
        yield ``Parent.Name`` (which is not a subclass of ``Parent.Child``)
        whenever ``Parent.Name`` had been referenced first.
        """
        value = super().__getattribute__(name)
        if name[:1].isupper() and isinstance(value, TokenMeta):
            own_namespace = super().__getattribute__('__dict__')
            inherited_category = (
                name not in own_namespace
                and not issubclass(value, cls)
                # only classes generated by __getattr__ are named '<parent>.<name>';
                # explicitly written nested classes are left alone
                and value.__name__.endswith(f'.{name}')
            )
            if inherited_category:
                # Raising AttributeError here makes Python fall back to __getattr__,
                # which builds (and caches) the category for this class.
                raise AttributeError(name)
        return value

    def __getattr__(cls: 'TokenMeta', name: str) -> 'TokenMeta':
        """Create, cache and return the subclass ``cls.<name>`` for capitalised names.

        Raises:
            AttributeError: if ``name`` does not start with an upper-case letter
                (this includes the empty string).
        """
        # name[:1] instead of name[0] so that an empty name is a plain AttributeError
        if not name[:1].isupper():
            raise AttributeError(
                f'type object {cls.__name__!r} has no attribute {name!r}'
            )
        new_subclass = cast(
            'TokenMeta',
            type(f'{cls.__name__}.{name}', (cls,), {}),
        )
        # Register the subclass as an attribute on its parent class: the next
        # reference is served by normal lookup and this method is not called.
        setattr(cls, name, new_subclass)
        return new_subclass
class Token(str, metaclass=TokenMeta):
    """A ``str`` subclass whose class hierarchy labels what kind of text a string is.

    Any attribute of this class that starts with an upper-case letter resolves to
    a subclass named after that attribute, so categories never need declaring.

    Example:
        >>> token_string = Token('hello')
        >>> # every token, whatever its category, is still a string
        >>> isinstance(token_string, str)
        True
        >>> # categories are created simply by naming them
        >>> other_string = Token.Category('some string')
        >>> a_third_string = Token.OtherCategory('some other string')
        >>> # each category is a subclass of Token (and therefore of str)
        >>> issubclass(Token.Category, Token) and issubclass(Token.OtherCategory, Token)
        True
        >>> isinstance(other_string, Token.Category)
        True
        >>> isinstance(a_third_string, Token.Category)
        False
        >>> # categories can be nested to any depth
        >>> yet_another_string = Token.Category.SubCategory('hello')
        >>> # comparison is plain string comparison, regardless of category
        >>> token_string == yet_another_string
        True
    """

    def __repr__(self):
        """Return ``<ClassName>(<str repr>)``, e.g. ``Token.Category('some string')``."""
        category = type(self).__name__
        text = str.__repr__(self)
        return f'{category}({text})'
class Scanner:
    """Split a string into tokens according to an ordered list of ``(pattern, action)`` rules.

    All rule patterns are combined into a single compiled regex. Calling the
    scanner on a string yields, in order of appearance, the result of the
    matching rule's action for every match, and the result of the default action
    for every stretch of text that no pattern matched.
    """

    def __init__(self, rules: List[Tuple[str, Callable]], flags=regex.VERSION1) -> None:
        """Register the rules and compile the combined pattern.

        Args:
            rules: ``(pattern, action)`` pairs. A pair whose pattern equals
                ``DEFAULT`` sets the action used for unmatched text (``str`` when
                no such pair is given); every other pair contributes one
                alternative to the combined regex.
            flags: flags passed to ``regex.compile``.
        """
        self.patterns = {}
        self.actions = {}
        self.default_action = str
        for index, (pattern, action) in enumerate(rules):
            if pattern == DEFAULT:
                self.default_action = action
                continue
            # the group name ties a match back to the rule that produced it
            name = f'scanner_pattern{index}'
            self.patterns[name] = pattern
            self.actions[name] = action
        self.scanner = regex.compile(self.assemble_pattern(), flags)

    def assemble_pattern(self):
        r"""
        Construct a regex pattern from a set of sub-patterns, assigning each
        sub-pattern to a named capture group inside one ``(?|...)`` group.

        Example:
            >>> self.patterns = {
            ...     "scanner_pattern0": r"\w+",
            ...     "scanner_pattern1": r"\d{3}"
            ... }
            >>> self.assemble_pattern()
            '(?|(?<scanner_pattern0>\\w+)|(?<scanner_pattern1>\\d{3}))'
        """
        alternates = '|'.join(
            f'(?<{name}>{pattern})' for name, pattern in self.patterns.items()
        )
        return f'(?|{alternates})'

    def get_pattern_name(self, match_object):
        """
        Identify which pattern from the rules list was matched by a given match object.

        The match object's groupdict() contains one entry per scanner pattern
        ('scanner_pattern0', 'scanner_pattern1', ...). The first such entry whose
        value is not None is taken to be the rule that matched, and its name is
        returned.

        Raises:
            RuntimeError: if no 'scanner_pattern*' group captured anything.
        """
        for name, value in match_object.groupdict().items():
            if value is not None and name.startswith('scanner_pattern'):
                return name
        # If we get to this point, something has gone seriously wrong
        raise RuntimeError("text fragment matched a pattern from the rules list, but was not captured by that pattern")

    def __call__(self, input_str, pass_in_option: Union[Literal['string'], Literal['match_object']] = 'string'):
        """Lazily yield the tokens of ``input_str``.

        Args:
            input_str: the text to tokenize.
            pass_in_option: ``'string'`` passes the matched text to the rule's
                action; any other value passes the match object itself.

        Yields:
            The return value of the matching rule's action for each match, and
            of the default action for each run of unmatched text, in input order.

        Notes:
            * Matching runs over the whole of ``input_str`` (no re-slicing), so
              lookbehind, word-boundary and anchor constructs see the real
              surrounding text, and match-object spans are relative to
              ``input_str``.
            * Zero-width matches consume no text and therefore produce no token;
              they are skipped, which also guarantees that scanning terminates.
        """
        consumed = 0  # index just past the last token emitted so far
        for m in self.scanner.finditer(input_str):
            start, end = m.span()
            if start == end:
                continue
            if start > consumed:
                # unmatched text sitting between the previous token and this match
                yield self.default_action(input_str[consumed:start])
            action = self.actions[self.get_pattern_name(m)]
            yield action(m[0] if pass_in_option == 'string' else m)
            consumed = end
        if consumed < len(input_str):
            # trailing text that doesn't contain any of the patterns
            yield self.default_action(input_str[consumed:])
if __name__ == "__main__":
    # Basic example: separate grok references (%{...}) and unescaped,
    # possibly nested, parenthesised groups from the text around them.
    testdata = r'%{WORD:action}test %\{WORD:action}test (%{ASA_TCP_UDP}|%{ASA_ICMP}) \(type \d, code \d\) (?:(?:[A-Fa-f0-9]{4}\.){2}[A-Fa-f0-9]{4}) some text' # noqa
    grok_rules = [
        (r'%\{.*?\}', Token.Grok),
        # NOTE: (?R) recurses into the entire compiled pattern, which inside
        # Scanner is the combined alternation of all rules, not just this one.
        (r'(?<!\\)\((?:[^)(]*(?R)?)*+(?<!\\)\)', Token.Group),
        (DEFAULT, Token.Default),
    ]
    scanner = Scanner(grok_rules)
    for token in scanner(testdata):
        print(repr(token))
    # Output:
    # Token.Grok('%{WORD:action}')
    # Token.Default('test %\\{WORD:action}test ')
    # Token.Group('(%{ASA_TCP_UDP}|%{ASA_ICMP})')
    # Token.Default(' \\(type \\d, code \\d\\) ')
    # Token.Group('(?:(?:[A-Fa-f0-9]{4}\\.){2}[A-Fa-f0-9]{4})')
    # Token.Default(' some text')
# A more advanced example
if __name__ == "__main__":
    import textwrap

    class Markdown(Token):
        """Root category for Markdown tokens (Markdown.Heading, Markdown.Bold, ...)."""

    # The otherwise-empty blockquote line must be '> ' (with a trailing space) for
    # the Blockquote pattern to treat the quote as one token. The space is
    # written as \x20 so that tools which strip trailing whitespace cannot
    # silently remove it.
    sample_text = textwrap.dedent("""
    # Heading level 1
    ## Heading level 2
    ## Heading level 2
    I just love **bold text**.
    I just love __bold text__.
    Italicized text is the *cat's meow*.
    Italicized text is the _cat's meow_.
    > Dorothy followed her through many of the beautiful rooms in her castle.
    >\x20
    > The Witch bade her clean the pots and kettles and sweep the floor and keep the fire fed with wood.
    - First item
    - Second item
    - Third item
    - Fourth item
    At the command prompt, type `nano`.
    ```
    <html>
    <head>
    </head>
    </html>
    ```
    My favorite search engine is [Duck Duck Go](https://duckduckgo.com).
    test **not a
    heading**
    """)

    # Rule order matters: the two-character bold markers are listed before the
    # one-character italics markers that would otherwise match the same text.
    mdscanner = Scanner([
        (r'[#]+ .*\n', Markdown.Heading),
        (r'[*]{2}[^*\n]+[*]{2}', Markdown.Bold),
        (r'[_]{2}[^_\n]+[_]{2}', Markdown.Bold),
        (r'[*]{1}[^*\n]+[*]{1}', Markdown.Italics),
        (r'[_]{1}[^_\n]+[_]{1}', Markdown.Italics),
        (r'([>] .*\n)+', Markdown.Blockquote),
        (r'([-] .*\n)+', Markdown.List),
        (r'[`]{1}[^`\n]+[`]{1}', Markdown.Code),
        (r'[`]{3}[^`]+[`]{3}', Markdown.Code),
        (r'\[[^\]]+\]\([^\)]+\)', Markdown.Link),
        (DEFAULT, Markdown.Text),
    ])
    for token in mdscanner(sample_text):
        print(repr(token))
    # Output:
    # Markdown.Text('\n')
    # Markdown.Heading('# Heading level 1\n')
    # Markdown.Heading('## Heading level 2\n')
    # Markdown.Heading('## Heading level 2\n')
    # Markdown.Text('I just love ')
    # Markdown.Bold('**bold text**')
    # Markdown.Text('.\nI just love ')
    # Markdown.Bold('__bold text__')
    # Markdown.Text('.\nItalicized text is the ')
    # Markdown.Italics("*cat's meow*")
    # Markdown.Text('.\nItalicized text is the ')
    # Markdown.Italics("_cat's meow_")
    # Markdown.Text('.\n')
    # Markdown.Blockquote('> Dorothy followed her through many of the beautiful rooms in her castle.\n> \n> The Witch bade her clean the pots and kettles and sweep the floor and keep the fire fed with wood.\n')
    # Markdown.List('- First item\n- Second item\n- Third item\n- Fourth item\n')
    # Markdown.Text('At the command prompt, type ')
    # Markdown.Code('`nano`')
    # Markdown.Text('.\n')
    # Markdown.Code('```\n<html>\n<head>\n</head>\n</html>\n```')
    # Markdown.Text('\nMy favorite search engine is ')
    # Markdown.Link('[Duck Duck Go](https://duckduckgo.com)')
    # Markdown.Text('.\ntest **not a\nheading**\n')
# An even more advanced example
class NewMarkdownBase:
    """Common behaviour of the NewMarkdown token types.

    A token is built either from a regex match object, in which case ``value`` is
    derived from the match's ``capturesdict()`` by ``extract``, or from any other
    object, which is stored as ``value`` unchanged. Subclasses override
    ``extract`` (or ``__init__``) to change how ``value`` is derived.
    """
    matches: List[str]
    value: Any

    def __init__(self, obj) -> None:
        # anything that is not a Match means the class is being instantiated manually
        self.value = self.extract(obj.capturesdict()) if isinstance(obj, Match) else obj

    def __repr__(self):
        """Return ``NewMarkdown.<ClassName>(<repr of value>)``."""
        kind = type(self).__name__
        return f'NewMarkdown.{kind}({self.value!r})'

    def extract(self, d) -> Any:
        """Return the first capture of the ``text`` group from captures dict ``d``."""
        captured_texts = d['text']
        return captured_texts[0]
class NewMarkdown:
    """Namespace of Markdown token types, each carrying its own regex pattern(s).

    Types with a single syntax expose ``pattern``; types with several alternative
    syntaxes expose a ``patterns`` list. The named group ``text`` marks the part
    of a match that becomes the token's value.
    """

    class Text(str):
        """Plain string token with a ``NewMarkdown.Text(...)`` repr."""

        def __repr__(self):
            return f'NewMarkdown.Text({str(self)!r})'

    class Heading(NewMarkdownBase):
        pattern = r'[#]+ (?<text>.*)\n'

    class Italics(NewMarkdownBase):
        patterns = [
            r'[*]{1}(?<text>[^*\n]+)[*]{1}',
            r'[_]{1}(?<text>[^_\n]+)[_]{1}',
        ]

    class Bold(NewMarkdownBase):
        patterns = [
            r'[*]{2}(?<text>[^*\n]+)[*]{2}',
            r'[_]{2}(?<text>[^_\n]+)[_]{2}',
        ]

    class Blockquote(NewMarkdownBase):
        pattern = r'([>] (?<text>.*)\n)+'

        def extract(self, d) -> Any:
            # one capture per quoted line; the value is those lines joined by newlines
            quoted_lines = d['text']
            return '\n'.join(quoted_lines)

    class List(NewMarkdownBase):
        pattern = r'([-] (?<text>.*)\n)+'

        def extract(self, d) -> Any:
            # one capture per list item; the value is the list of all of them
            return d['text']

    class Code(NewMarkdownBase):
        patterns = [
            r'[`]{1}(?<text>[^`\n]+)[`]{1}',
            r'[`]{3}(?<text>[^`]+)[`]{3}',
        ]

    class Link(NewMarkdownBase):
        pattern = r'\[(?<text>[^\]]+)\]\((?<url>[^\)]+)\)'

        def extract(self, d) -> Any:
            # the value is a dict holding the first capture of each named group
            return {'text': d['text'][0], 'url': d['url'][0]}
if __name__ == "__main__":
    # Build the rule list from the patterns declared on the token classes.
    # The order is significant: both Bold syntaxes come before both Italics ones.
    rules = [(NewMarkdown.Heading.pattern, NewMarkdown.Heading)]
    rules += [(pattern, NewMarkdown.Bold) for pattern in NewMarkdown.Bold.patterns]
    rules += [(pattern, NewMarkdown.Italics) for pattern in NewMarkdown.Italics.patterns]
    rules += [
        (NewMarkdown.Blockquote.pattern, NewMarkdown.Blockquote),
        (NewMarkdown.List.pattern, NewMarkdown.List),
    ]
    rules += [(pattern, NewMarkdown.Code) for pattern in NewMarkdown.Code.patterns]
    rules += [
        (NewMarkdown.Link.pattern, NewMarkdown.Link),
        (DEFAULT, NewMarkdown.Text),
    ]
    mdscanner2 = Scanner(rules)
    # 'match_object' hands each action the match itself, so the token classes
    # can read their named capture groups.
    for token in mdscanner2(sample_text, pass_in_option='match_object'):
        print(repr(token))
    # Output:
    # NewMarkdown.Text('\n')
    # NewMarkdown.Heading('Heading level 1')
    # NewMarkdown.Heading('Heading level 2')
    # NewMarkdown.Heading('Heading level 2')
    # NewMarkdown.Text('I just love ')
    # NewMarkdown.Bold('bold text')
    # NewMarkdown.Text('.\nI just love ')
    # NewMarkdown.Bold('bold text')
    # NewMarkdown.Text('.\nItalicized text is the ')
    # NewMarkdown.Italics("cat's meow")
    # NewMarkdown.Text('.\nItalicized text is the ')
    # NewMarkdown.Italics("cat's meow")
    # NewMarkdown.Text('.\n')
    # NewMarkdown.Blockquote('Dorothy followed her through many of the beautiful rooms in her castle.\n\nThe Witch bade her clean the pots and kettles and sweep the floor and keep the fire fed with wood.')
    # NewMarkdown.List(['First item', 'Second item', 'Third item', 'Fourth item'])
    # NewMarkdown.Text('At the command prompt, type ')
    # NewMarkdown.Code('nano')
    # NewMarkdown.Text('.\n')
    # NewMarkdown.Code('\n<html>\n<head>\n</head>\n</html>\n')
    # NewMarkdown.Text('\nMy favorite search engine is ')
    # NewMarkdown.Link({'text': 'Duck Duck Go', 'url': 'https://duckduckgo.com'})
    # NewMarkdown.Text('.\ntest **not a\nheading**\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment