from typing import List, Optional

import spacy
from spacy import Language, Vocab
from spacy.tokens import Doc


def load_nlp(model_name: str = "en_core_web_sm",
             is_tokenized: bool = False,
             exclude: Optional[List[str]] = None) -> Language:
    """Load a spaCy model. Set is_tokenized to disable sentence segmentation and tokenization.
    Components that you do not need can be excluded via exclude."""
    exclude_components = ["senter", "sentencizer"] if is_tokenized else []
    exclude_components = exclude_components if exclude is None else exclude_components + exclude
    nlp = spacy.load(model_name, exclude=exclude_components)

    if is_tokenized:
        # Replace the default tokenizer with a custom one that splits the input string
        # on whitespace, e.g. "I like cookies ." -> "I", "like", "cookies", "."
        nlp.tokenizer = PretokenizedTokenizer(nlp.vocab)
        # The dependency parser can still introduce sentence boundaries, so disable those too
        nlp.add_pipe("disable_sbd", name="disable-sbd", before="parser")

    return nlp


@Language.factory("disable_sbd")
class SpacyDisableSentenceSegmentation:
    """Disables spaCy's dependency-based sentence boundary detection. In addition, the senter
    and sentencizer components need to be disabled as well (see load_nlp above)."""

    def __init__(self, nlp: Language, name: str):
        self.nlp = nlp
        self.name = name

    def __call__(self, doc: Doc) -> Doc:
        # Mark every token as not starting a sentence before the parser runs,
        # so that the parser cannot introduce new sentence boundaries
        for token in doc:
            token.is_sent_start = False
        return doc


class PretokenizedTokenizer:
    """Custom tokenizer to be used in spaCy when the text is already pretokenized.
    The input will simply be split on whitespace."""

    def __init__(self, vocab: Vocab):
        """Initialize the tokenizer with a given vocab
        :param vocab: an existing vocabulary (see https://spacy.io/api/vocab)
        """
        self.vocab = vocab

    def __call__(self, inp: str) -> Doc:
        """Call the tokenizer on input `inp`.
        :param inp: a string to be split on whitespace
        :return: the created Doc object
        """
        words = inp.split()
        if not words:
            # Guard against empty or whitespace-only input
            return Doc(self.vocab)
        # Every token except the last is followed by a space; the last token
        # only if the input itself ends in whitespace
        spaces = [True] * (len(words) - 1) + [inp[-1].isspace()]
        return Doc(self.vocab, words=words, spaces=spaces)
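
# Usage sketch: the tokenizer needs only a Vocab, not a full pipeline, e.g.
#   PretokenizedTokenizer(Vocab())("I like cookies .")
# would yield a four-token Doc with the whitespace-split tokens intact.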


if __name__ == "__main__":
    snlp = load_nlp(is_tokenized=True, exclude=["ner"])
    doc = snlp("I like cookies .")
    for word in doc:
        print(word)
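
    # Sanity checks (a sketch, assuming the en_core_web_sm model is installed):
    # the Doc should keep the exact pretokenized tokens and remain a single
    # sentence, because "disable_sbd" ran before the parser
    assert [t.text for t in doc] == "I like cookies .".split()
    assert len(list(doc.sents)) == 1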