@BramVanroy
Created February 15, 2022 11:42
from typing import List, Optional

import spacy
from spacy import Language, Vocab
from spacy.tokens import Doc


def load_nlp(model_name: str = "en_core_web_sm",
             is_tokenized: bool = False,
             exclude: Optional[List[str]] = None):
    """Load a spaCy model. Disable sentence segmentation and tokenization with is_tokenized.
    You can exclude components that you do not need."""
    exclude_components = ["senter", "sentencizer"] if is_tokenized else []
    exclude_components = exclude_components if exclude is None else exclude_components + exclude
    nlp = spacy.load(model_name, exclude=exclude_components)

    if is_tokenized:
        # Replace the default tokenizer with a custom one: it takes a string as input, which
        # will be split on whitespace, e.g. "I like cookies ." -> "I", "like", "cookies", "."
        nlp.tokenizer = PretokenizedTokenizer(nlp.vocab)
        # The dependency parser can still introduce sentence boundaries, so disable those too
        nlp.add_pipe("disable_sbd", name="disable-sbd", before="parser")

    return nlp

@Language.factory("disable_sbd")
class SpacyDisableSentenceSegmentation:
    """Disables spaCy's dependency-based sentence boundary detection. In addition, the senter and
    sentencizer components need to be disabled as well."""

    def __init__(self, nlp: Language, name: str):
        self.nlp = nlp
        self.name = name

    def __call__(self, doc: Doc) -> Doc:
        for token in doc:
            token.is_sent_start = False
        return doc

class PretokenizedTokenizer:
    """Custom tokenizer to be used in spaCy when the text is already pretokenized.
    The input will simply be split on whitespace."""

    def __init__(self, vocab: Vocab):
        """Initialize the tokenizer with a given vocab.
        :param vocab: an existing vocabulary (see https://spacy.io/api/vocab)
        """
        self.vocab = vocab

    def __call__(self, inp: str) -> Doc:
        """Call the tokenizer on input `inp`.
        :param inp: a string to be split on whitespace
        :return: the created Doc object
        """
        words = inp.split()
        spaces = [True] * (len(words) - 1) + ([True] if inp[-1].isspace() else [False])
        return Doc(self.vocab, words=words, spaces=spaces)
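

# Illustration (added comment, not part of the original gist): for inp = "I like cookies ."
# the tokenizer produces
#   words  -> ["I", "like", "cookies", "."]
#   spaces -> [True, True, True, False]  (no trailing whitespace, so the final flag is False)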


if __name__ == '__main__':
    snlp = load_nlp(is_tokenized=True, exclude=["ner"])
    doc = snlp("I like cookies .")
    for word in doc:
        print(word)
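
A quick usage sketch (an addition, not part of the original gist): the same pipeline can process a batch of pretokenized strings with spaCy's nlp.pipe. It assumes en_core_web_sm is installed and that its tagger and parser stay enabled, so per-token annotations remain available; the example sentences are made up.

    # Hedged sketch: batch-process whitespace-joined, pretokenized inputs.
    nlp = load_nlp(is_tokenized=True, exclude=["ner"])
    for doc in nlp.pipe(["I like cookies .", "Do you like cookies ?"]):
        print([(token.text, token.pos_, token.dep_) for token in doc])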