Skip to content

Instantly share code, notes, and snippets.

@afparsons
Created January 23, 2023 22:01
Show Gist options
  • Save afparsons/7e4891de96bc896e2ac30af26c2e5f5a to your computer and use it in GitHub Desktop.
Save afparsons/7e4891de96bc896e2ac30af26c2e5f5a to your computer and use it in GitHub Desktop.
spaCy 3.x RegularExpressionMatcher: Bibliographer
"""
A quick excerpt demonstrating usage of a custom `RegularExpressionMatcher` for spaCy 3.
This is from one of my personal projects (HaleyNLP/Irnerius). Module-level imports and other code blocks have been elided.
"""
class ComponentExtractionBibliographer(
    AbstractComponentMatcher,
    matcher=RegularExpressionMatcher,
):
    """
    Matcher component that exposes citation annotations as spaCy spans.

    The ``matcher=RegularExpressionMatcher`` class keyword is handed to
    ``AbstractComponentMatcher`` when this class is created; how the base
    class consumes it is defined outside this excerpt.
    """

    def __init__(self, nlp: Language, matcher_rules):
        """
        Initialise the base matcher and register the ``citation`` extension.

        :param nlp: the spaCy ``Language`` object, forwarded to the base class.
        :param matcher_rules: matcher rules, forwarded unchanged to the base
            class (their exact structure is not visible in this excerpt).
        """
        super().__init__(nlp, matcher_rules)
        # `force=True` overwrites an existing registration of the same
        # extension, so building this component more than once does not raise.
        Span.set_extension('citation', force=True, default=None)

    @classmethod
    def _make_span_groups(cls, doc: Doc) -> None:
        """
        Publish the document's citation annotations as a span group.

        For every entry in ``doc._.annotations.citations`` the entry itself is
        stored on its span under the custom ``citation`` extension, and the
        span is collected. The collected spans are then written to
        ``doc.spans['citations']``.

        NOTE(review): presumably invoked by ``AbstractComponentMatcher`` after
        matching -- confirm against the base class.

        :param doc: the document whose annotations are read and whose
            ``spans`` mapping is updated in place.
        """
        collected: List[Span] = []
        # noinspection PyProtectedMember
        for entry in doc._.annotations.citations:  # type: Annotation
            # Back-reference: lets code holding only the span reach the
            # annotation it came from.
            # noinspection PyProtectedMember
            entry.span._.set(name='citation', value=entry)
            collected.append(entry.span)
        doc.spans['citations'] = collected
@Language.factory(
    name='haleynlp_extraction_bibliographer',
    default_config={'matcher_rules': {'@misc': 'haleynlp.en.component.config.bibliography'}},
    requires=['doc._.annotations', 'doc._.db']
)
def produce_component_extraction_bibliographer(
    nlp: Language,
    name: str,
    matcher_rules: Tuple,
) -> ComponentExtractionBibliographer:
    """
    spaCy factory registered as ``haleynlp_extraction_bibliographer``.

    The decorator declares that the component requires ``doc._.annotations``
    and ``doc._.db``, and supplies a default ``matcher_rules`` config that
    points at the ``@misc`` entry
    ``haleynlp.en.component.config.bibliography`` (presumably resolved by
    spaCy's config system before this function is called -- confirm).

    :param nlp: the ``Language`` object the component is being created for.
    :param name: the component's name in the pipeline; part of spaCy's
        factory signature and not used in the body.
    :param matcher_rules: matcher rules passed through to the component.
    :return: a new ``ComponentExtractionBibliographer``.
    """
    component = ComponentExtractionBibliographer(
        nlp=nlp,
        matcher_rules=matcher_rules,
    )
    return component
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment