Skip to content

Instantly share code, notes, and snippets.

@afparsons
Last active January 23, 2023 22:04
Show Gist options
  • Save afparsons/34ddca6428e4a74a70670d49b1ba5af2 to your computer and use it in GitHub Desktop.
Save afparsons/34ddca6428e4a74a70670d49b1ba5af2 to your computer and use it in GitHub Desktop.
spaCy 3.x RegularExpressionMatcher: Patterns and on_match pattern handlers
"""
A quick excerpt demonstrating usage of a custom `RegularExpressionMatcher` for spaCy 3.
This is from one of my personal projects (HaleyNLP/Irnerius). Module-level imports and other code blocks have been elided.
"""
@registry.misc('haleynlp.common.extraction.handler.on_match.bibliography._european_union_ecli')
def _european_union_ecli(
    span: Span,
    match: re.Match,
) -> None:
    """
    Record an ECLI citation found by a regular expression match.

    Registered as an `on_match` pattern handler under the name
    `haleynlp.common.extraction.handler.on_match.bibliography._european_union_ecli`.

    Args:
        span (Span):
            The spaCy Span in which the regular expression match was found;
            the result is stored on this span's `Doc`.
        match (re.Match):
            The regular expression match that was found. It is not read by
            the abridged body shown here.

    Returns:
        None
    """
    # NOTE: most of the real implementation is elided from this excerpt.
    # In the full version an `Annotation` is assembled from `match`; the `...`
    # placeholder below stands in for that object.
    citation: Annotation = ...

    # The `_.annotations` extension is created by a separate custom component
    # that is not part of this excerpt.
    citations = span.doc._.annotations.citations
    citations.add(citation)
# Matching rules for bibliography extraction. Each rule is a dict with:
#   'key'      -- identifier of the rule,
#   'patterns' -- compiled regular expressions belonging to the rule,
#   'on_match' -- registered name of the rule's `on_match` handler.
MATCHING_RULES: Tuple = (
    {
        'key': '_european_union_ecli',
        'patterns': [
            re.compile(
                pattern=r"""
                    (?P<ECLI>
                        (?P<IDENTIFIER>ECLI):            # literal prefix
                        (?P<COUNTRY>[A-Z]{2}):           # two uppercase letters
                        (?P<COURT>[A-Z][A-Z0-9]{0,6}):   # 1-7 characters, leading letter
                        (?P<YEAR>\d{4}):                 # four digits
                        # 1-25 digits/dots that must begin and end with a digit, so
                        # that a sentence-ending period (or a bare ".") is never
                        # absorbed into the citation.
                        (?P<CASE>\d(?:[\d.]{0,23}\d)?)
                    )
                """,
                # VERBOSE lets the pattern be laid out over several lines with
                # inline comments; whitespace inside it is not matched.
                flags=re.VERBOSE,
            )
        ],
        'on_match': 'haleynlp.common.extraction.handler.on_match.bibliography._european_union_ecli',
    },
    # ...other elements for different citation types, like United States Acts, Code, Laws, and court cases...
)
@registry.misc('haleynlp.en.component.config.bibliography')
def create_patterns_bibliography() -> Tuple:
    """
    Provide the bibliography matching rules.

    Registered under the name `haleynlp.en.component.config.bibliography`.

    Returns:
        Tuple:
            The module-level `MATCHING_RULES` tuple itself (not a copy).
    """
    return MATCHING_RULES
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment