Skip to content

Instantly share code, notes, and snippets.

@afparsons
Created January 23, 2023 21:56
Show Gist options
  • Save afparsons/69ad33f91280d3cbe2c8b4961cb4e045 to your computer and use it in GitHub Desktop.
Save afparsons/69ad33f91280d3cbe2c8b4961cb4e045 to your computer and use it in GitHub Desktop.
spaCy 3.x RegularExpressionMatcher: Class
"""
A quick excerpt demonstrating usage of a custom `RegularExpressionMatcher` for spaCy 3.
This is from one of my personal projects (HaleyNLP/Irnerius). Module-level imports and other code blocks have been elided.
"""
class RegularExpressionMatcher:
    """
    Akin to spaCy's Token Matcher, although this runs regular expressions
    on the entire doc text.

    Patterns are registered under a key with `add`. Calling the matcher runs
    every registered pattern over `doclike.text`, invokes the key's callback
    once per regex match, and returns all matches ordered by character offsets.

    TODO: this is a minimal viable implementation; evaluate how much additional development it still needs
    """

    def __init__(self, vocab):
        """
        :param vocab: object whose `strings.add` is used to normalize string keys
            (presumably a spaCy `Vocab` -- confirm against callers).
        """
        self.vocab = vocab
        # Both mappings are keyed by the *normalized* key (see `_normalize_key`),
        # presumably the integer hash returned by spaCy's `StringStore.add`.
        self._patterns: Dict[int, List[re.Pattern]] = {}
        self._callbacks: Dict[int, Callable[[Span, re.Match], Any]] = {}

    def add(
        self,
        key: str,
        patterns: List[re.Pattern],
        on_match: Callable,
        greedy: Literal['FIRST', 'LONGEST', None] = None,
    ) -> None:
        """
        Register compiled regular expressions under `key`.

        Adding to an existing key appends to the patterns already registered
        for it; the callback is replaced by the most recent `on_match`.

        :param key: rule name; strings are normalized through the vocab.
        :param patterns: compiled patterns to run against the text.
        :param on_match: called as `on_match(span, match)` for every regex match.
        :param greedy: accepted for signature parity with the token Matcher but
            currently ignored.

        TODO: does `greedy` have a purpose here? Or should we just keep it to match the Matcher API?
        """
        key = self._normalize_key(key)
        # Extend our own list instead of rebinding to the caller's list: earlier
        # patterns for this key are kept, and later mutation of the caller's
        # list cannot leak into the matcher.
        self._patterns.setdefault(key, []).extend(patterns)
        self._callbacks[key] = on_match

    def _normalize_key(self, key):
        """
        Return the vocab's value for a string key; pass any other key through unchanged.
        """
        if isinstance(key, str):
            return self.vocab.strings.add(key)
        else:
            return key

    def __call__(self, doclike: "Union[Doc, Span]") -> "List[Tuple[int, Tuple[int, int], re.Match]]":
        """
        Run every registered pattern over `doclike.text`.

        For each regex match, the key's callback (if any) receives the result of
        `doclike.char_span(start, end)` together with the `re.Match`.

        :param doclike: object exposing `.text` and `.char_span(start, end)`.
        :return: `(key, (start_char, end_char), match)` tuples sorted by the
            character offsets.
        """
        # Read the text once; it is the same for every pattern.
        text = doclike.text
        found: List[Tuple[int, Tuple[int, int], re.Match]] = []
        for key, patterns in self._patterns.items():
            callback = self._callbacks[key]
            for pattern in patterns:
                for match in pattern.finditer(text):
                    span_coords = match.span()
                    # Tolerate keys registered without a callback instead of
                    # failing with "'NoneType' object is not callable".
                    if callback is not None:
                        # NOTE(review): the char_span result is forwarded as-is;
                        # spaCy documents that it can be None when the offsets do
                        # not line up with token boundaries -- confirm callbacks
                        # handle that.
                        callback(doclike.char_span(*span_coords), match)
                    found.append((key, span_coords, match))
        return sorted(found, key=lambda t: t[1])
class AbstractComponentMatcher(ABC):
    """
    Base class for callable objects that run a matcher over a `Doc` and then
    let the subclass build span groups on it.

    Subclasses choose the matcher class through the `matcher=` class keyword
    and must implement `_make_span_groups`.
    """

    # noinspection PyMethodOverriding
    def __init_subclass__(
        cls,
        *,
        matcher: Union[Type[Matcher], Type[RegularExpressionMatcher]],
        **kwargs,
    ):
        """
        Store the matcher class passed as a class keyword on the subclass.

        :param matcher: matcher class that `__init__` instantiates with `nlp.vocab`.
        """
        super().__init_subclass__(**kwargs)
        cls._matcher = matcher

    def __init__(
        self,
        nlp: Language,
        matcher_rules,
    ) -> None:
        """
        Instantiate the configured matcher and register every rule on it.

        :param nlp: pipeline object; only `nlp.vocab` is used here.
        :param matcher_rules: iterable of mappings with the keys 'key',
            'patterns' and 'on_match' (a name looked up in `registry.misc`),
            plus an optional 'greedy'.
        """
        matcher_cls = self._matcher
        self.matcher: Union[Matcher, RegularExpressionMatcher] = matcher_cls(nlp.vocab)
        for rule in matcher_rules:
            # The callback is stored by name in the rule and resolved through the registry.
            add_kwargs = {
                'key': rule['key'],
                'patterns': rule['patterns'],
                'on_match': registry.misc.get(rule['on_match']),
                'greedy': rule.get('greedy'),
            }
            self.matcher.add(**add_kwargs)

    @classmethod
    @abstractmethod
    def _make_span_groups(cls, doc: Doc) -> None:
        """
        Hook invoked on the doc after matching; subclasses must override it.
        """
        raise NotImplementedError

    def __call__(self, doc: Doc) -> Doc:
        """
        Run the matcher over `doc`, call `_make_span_groups`, and return `doc`.

        The matcher's return value is discarded; any effect comes from the
        registered `on_match` callbacks.
        """
        self.matcher(doc)
        self._make_span_groups(doc)
        return doc
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment