from typing import List, Optional

import spacy
from spacy import Language, Vocab
from spacy.tokens import Doc


def load_nlp(model_name: str = "en_core_web_sm",
             is_tokenized: bool = False,
             exclude: Optional[List[str]] = None) -> Language:
    """Load a spaCy model. Set is_tokenized to disable sentence segmentation and tokenization.
    Components that you do not need can be excluded via exclude."""
    exclude_components = ["senter", "sentencizer"] if is_tokenized else []
    exclude_components = exclude_components if exclude is None else exclude_components + exclude
    nlp = spacy.load(model_name, exclude=exclude_components)

    if is_tokenized:
        # Replace the default tokenizer with a custom one that splits the input string
        # on whitespace, e.g. "I like cookies ." -> "I", "like", "cookies", "."
        nlp.tokenizer = PretokenizedTokenizer(nlp.vocab)
        # The dependency parser can still introduce sentence boundaries, so disable those too
        nlp.add_pipe("disable_sbd", name="disable-sbd", before="parser")

    return nlp


@Language.factory("disable_sbd")
class SpacyDisableSentenceSegmentation:
    """Disables spaCy's dependency-based sentence boundary detection. In addition, the senter
    and sentencizer components need to be disabled as well (see load_nlp above)."""

    def __init__(self, nlp: Language, name: str):
        self.nlp = nlp
        self.name = name

    def __call__(self, doc: Doc) -> Doc:
        # Mark every token as not starting a sentence before the parser runs,
        # so that the parser cannot introduce new sentence boundaries
        for token in doc:
            token.is_sent_start = False
        return doc


class PretokenizedTokenizer:
    """Custom tokenizer to be used in spaCy when the text is already pretokenized.
    The input will simply be split on whitespace."""

    def __init__(self, vocab: Vocab):
        """Initialize the tokenizer with a given vocab
        :param vocab: an existing vocabulary (see https://spacy.io/api/vocab)
        """
        self.vocab = vocab

    def __call__(self, inp: str) -> Doc:
        """Call the tokenizer on input `inp`.
        :param inp: a string to be split on whitespace
        :return: the created Doc object
        """
        words = inp.split()
        if not words:
            # Guard against empty or whitespace-only input
            return Doc(self.vocab)
        # Every token except the last is followed by a space; the last token
        # only if the input itself ends in whitespace
        spaces = [True] * (len(words) - 1) + [inp[-1].isspace()]
        return Doc(self.vocab, words=words, spaces=spaces)
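
# Usage sketch: the tokenizer needs only a Vocab, not a full pipeline, e.g.
#   PretokenizedTokenizer(Vocab())("I like cookies .")
# would yield a four-token Doc with the whitespace-split tokens intact.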


if __name__ == "__main__":
    snlp = load_nlp(is_tokenized=True, exclude=["ner"])
    doc = snlp("I like cookies .")
    for word in doc:
        print(word)
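
    # Sanity checks (a sketch, assuming the en_core_web_sm model is installed):
    # the Doc should keep the exact pretokenized tokens and remain a single
    # sentence, because "disable_sbd" ran before the parser
    assert [t.text for t in doc] == "I like cookies .".split()
    assert len(list(doc.sents)) == 1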