Created
January 23, 2023 21:56
-
-
Save afparsons/69ad33f91280d3cbe2c8b4961cb4e045 to your computer and use it in GitHub Desktop.
spaCy 3.x RegularExpressionMatcher: Class
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
A quick excerpt demonstrating usage of a custom `RegularExpressionMatcher` for spaCy 3. | |
This is from one of my personal projects (HaleyNLP/Irnerius). Module-level imports and other code blocks have been elided. | |
""" | |
class RegularExpressionMatcher:
    """
    Akin to spaCy's token ``Matcher``, although this runs compiled regular
    expressions on the entire doc text instead of token-level patterns.

    String keys are interned as integers via ``vocab.strings``, mirroring the
    behavior of spaCy's built-in matchers.

    TODO: this is a minimal viable implementation; evaluate how much
    additional development it still needs.
    """

    def __init__(self, vocab) -> None:
        # `vocab` is expected to be a spaCy `Vocab`; only `vocab.strings.add`
        # is used here, to intern string keys as integers.
        self.vocab = vocab
        # Both dicts are keyed by the *normalized* (integer) key.
        self._patterns: Dict[int, List[re.Pattern]] = {}
        self._callbacks: Dict[int, Optional[Callable[["Span", re.Match], Any]]] = {}

    def add(
        self,
        key: str,
        patterns: List[re.Pattern],
        on_match: Optional[Callable] = None,
        greedy: Literal['FIRST', 'LONGEST', None] = None,
    ) -> None:
        """
        Register compiled `patterns` under `key`, with an optional `on_match`
        callback invoked as ``on_match(span, match)`` for each hit.

        Repeated calls with the same key *extend* the stored patterns
        (matching spaCy's ``Matcher.add`` semantics). The original code called
        ``setdefault`` and then immediately overwrote the entry, silently
        discarding previously-registered patterns.

        TODO: does `greedy` have a purpose here? Or should we just keep it to
        match the Matcher API? It is currently accepted but unused.
        """
        key = self._normalize_key(key)
        # Accumulate rather than replace earlier patterns for the same key.
        self._patterns.setdefault(key, []).extend(patterns)
        self._callbacks[key] = on_match

    def _normalize_key(self, key):
        """Intern string keys into the vocab's StringStore; pass others through."""
        if isinstance(key, str):
            return self.vocab.strings.add(key)
        return key

    def __call__(self, doclike: "Union[Doc, Span]"):
        """
        Run every registered pattern over ``doclike.text``.

        Returns a list of ``(key, (start_char, end_char), match)`` tuples
        sorted by character offsets. For each hit, the key's ``on_match``
        callback (if any) is invoked with the corresponding character span
        and the ``re.Match`` object.

        Matches whose offsets do not align with token boundaries (where
        ``char_span`` returns None) are skipped, so callbacks never receive
        a ``None`` span.
        """
        found: List[Tuple[int, Tuple[int, int], re.Match]] = []
        for key, patterns in self._patterns.items():
            callback = self._callbacks[key]
            for pattern in patterns:
                for match in pattern.finditer(doclike.text):
                    span_coords = match.span()
                    span = doclike.char_span(*span_coords)
                    if span is None:
                        # Match does not align with token boundaries; skip it
                        # instead of handing None to the callback.
                        continue
                    if callback is not None:
                        callback(span, match)
                    found.append((key, span_coords, match))
        return sorted(found, key=lambda t: t[1])
class AbstractComponentMatcher(ABC):
    """
    Base class for pipeline components built around a matcher.

    Concrete subclasses declare which matcher implementation they use via a
    class keyword argument, e.g.::

        class MyComponent(AbstractComponentMatcher, matcher=Matcher): ...
    """

    # noinspection PyMethodOverriding
    def __init_subclass__(
        cls,
        *,
        matcher: Union[Type[Matcher], Type[RegularExpressionMatcher]],
        **kwargs,
    ):
        # Record the matcher class chosen in the subclass declaration so that
        # __init__ can instantiate it later.
        super().__init_subclass__(**kwargs)
        cls._matcher: Union[Type[Matcher], Type[RegularExpressionMatcher]] = matcher

    def __init__(
        self,
        nlp: Language,
        matcher_rules,
    ) -> None:
        """
        Instantiate the declared matcher from `nlp.vocab` and register rules.

        Each rule is a mapping providing 'key', 'patterns', 'on_match' (a
        name resolved through spaCy's `registry.misc`), and optionally
        'greedy'.
        """
        self.matcher: Union[Matcher, RegularExpressionMatcher] = self._matcher(nlp.vocab)
        for rule in matcher_rules:
            self.matcher.add(
                key=rule['key'],
                patterns=rule['patterns'],
                on_match=registry.misc.get(rule['on_match']),
                greedy=rule.get('greedy'),
            )

    @classmethod
    @abstractmethod
    def _make_span_groups(cls, doc: Doc) -> None:
        """
        Subclass hook invoked after matching; presumably records span groups
        on `doc` (the name suggests so — confirm against concrete subclasses).
        """
        raise NotImplementedError

    def __call__(self, doc: Doc) -> Doc:
        """
        Run the matcher over `doc`, then delegate to the subclass hook.

        The matcher's return value is intentionally discarded; the on_match
        callbacks fire as side effects during matching.
        """
        self.matcher(doc)
        self._make_span_groups(doc)
        return doc
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment