Created
January 23, 2023 21:56
-
-
Save afparsons/69ad33f91280d3cbe2c8b4961cb4e045 to your computer and use it in GitHub Desktop.
spaCy 3.x RegularExpressionMatcher: Class
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A quick excerpt demonstrating usage of a custom `RegularExpressionMatcher` for spaCy 3. | |
This is from one of my personal projects (HaleyNLP/Irnerius). Module-level imports and other code blocks have been elided. | |
""" | |
class RegularExpressionMatcher:
    """
    Akin to spaCy's token ``Matcher``, although this runs compiled regular
    expressions on the entire doc text instead of token-level patterns.

    String keys are interned as integers via ``vocab.strings``, mirroring the
    behavior of spaCy's built-in matchers.

    TODO: this is a minimal viable implementation; evaluate how much
    additional development it still needs.
    """

    def __init__(self, vocab) -> None:
        # `vocab` is expected to be a spaCy `Vocab`; only `vocab.strings.add`
        # is used here, to intern string keys as integers.
        self.vocab = vocab
        # Both dicts are keyed by the *normalized* (integer) key.
        self._patterns: Dict[int, List[re.Pattern]] = {}
        self._callbacks: Dict[int, Optional[Callable[["Span", re.Match], Any]]] = {}

    def add(
        self,
        key: str,
        patterns: List[re.Pattern],
        on_match: Optional[Callable] = None,
        greedy: Literal['FIRST', 'LONGEST', None] = None,
    ) -> None:
        """
        Register compiled `patterns` under `key`, with an optional `on_match`
        callback invoked as ``on_match(span, match)`` for each hit.

        Repeated calls with the same key *extend* the stored patterns
        (matching spaCy's ``Matcher.add`` semantics). The original code called
        ``setdefault`` and then immediately overwrote the entry, silently
        discarding previously-registered patterns.

        TODO: does `greedy` have a purpose here? Or should we just keep it to
        match the Matcher API? It is currently accepted but unused.
        """
        key = self._normalize_key(key)
        # Accumulate rather than replace earlier patterns for the same key.
        self._patterns.setdefault(key, []).extend(patterns)
        self._callbacks[key] = on_match

    def _normalize_key(self, key):
        """Intern string keys into the vocab's StringStore; pass others through."""
        if isinstance(key, str):
            return self.vocab.strings.add(key)
        return key

    def __call__(self, doclike: "Union[Doc, Span]"):
        """
        Run every registered pattern over ``doclike.text``.

        Returns a list of ``(key, (start_char, end_char), match)`` tuples
        sorted by character offsets. For each hit, the key's ``on_match``
        callback (if any) is invoked with the corresponding character span
        and the ``re.Match`` object.

        Matches whose offsets do not align with token boundaries (where
        ``char_span`` returns None) are skipped, so callbacks never receive
        a ``None`` span.
        """
        found: List[Tuple[int, Tuple[int, int], re.Match]] = []
        for key, patterns in self._patterns.items():
            callback = self._callbacks[key]
            for pattern in patterns:
                for match in pattern.finditer(doclike.text):
                    span_coords = match.span()
                    span = doclike.char_span(*span_coords)
                    if span is None:
                        # Match does not align with token boundaries; skip it
                        # instead of handing None to the callback.
                        continue
                    if callback is not None:
                        callback(span, match)
                    found.append((key, span_coords, match))
        return sorted(found, key=lambda t: t[1])
class AbstractComponentMatcher(ABC):
    """
    Base class for pipeline components built around a matcher.

    Concrete subclasses declare which matcher implementation they use via a
    class keyword argument, e.g.::

        class MyComponent(AbstractComponentMatcher, matcher=Matcher): ...
    """

    # noinspection PyMethodOverriding
    def __init_subclass__(
        cls,
        *,
        matcher: Union[Type[Matcher], Type[RegularExpressionMatcher]],
        **kwargs,
    ):
        # Record the matcher class chosen in the subclass declaration so that
        # __init__ can instantiate it later.
        super().__init_subclass__(**kwargs)
        cls._matcher: Union[Type[Matcher], Type[RegularExpressionMatcher]] = matcher

    def __init__(
        self,
        nlp: Language,
        matcher_rules,
    ) -> None:
        """
        Instantiate the declared matcher from `nlp.vocab` and register rules.

        Each rule is a mapping providing 'key', 'patterns', 'on_match' (a
        name resolved through spaCy's `registry.misc`), and optionally
        'greedy'.
        """
        self.matcher: Union[Matcher, RegularExpressionMatcher] = self._matcher(nlp.vocab)
        for rule in matcher_rules:
            self.matcher.add(
                key=rule['key'],
                patterns=rule['patterns'],
                on_match=registry.misc.get(rule['on_match']),
                greedy=rule.get('greedy'),
            )

    @classmethod
    @abstractmethod
    def _make_span_groups(cls, doc: Doc) -> None:
        """
        Subclass hook invoked after matching; presumably records span groups
        on `doc` (the name suggests so — confirm against concrete subclasses).
        """
        raise NotImplementedError

    def __call__(self, doc: Doc) -> Doc:
        """
        Run the matcher over `doc`, then delegate to the subclass hook.

        The matcher's return value is intentionally discarded; the on_match
        callbacks fire as side effects during matching.
        """
        self.matcher(doc)
        self._make_span_groups(doc)
        return doc
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment