ivyleavedtoadflax/customer_tokenizer.py

## customer_tokenizer.py
from spacy.util import (compile_prefix_regex, compile_infix_regex, compile_suffix_regex)

def _custom_tokenizer(self, nlp, regex=(r"[-/,.\n\s]",)):
    """Build a spaCy tokenizer that splits on extra infix patterns.

    The default pattern is a character class matching a hyphen, slash,
    comma, period, newline or any whitespace, so date strings such as
    05-05-2015 and 05/05/2015 are split into their parts.

    Args:
        self: Not used by this function; kept so existing call sites
            keep working.
        nlp: Object exposing ``vocab`` and ``Defaults`` (``prefixes``,
            ``suffixes``, ``infixes``, ``tokenizer_exceptions``) --
            presumably a spaCy ``Language`` instance.
        regex: Regular-expression strings appended after the default
            infix rules. Any iterable of strings is accepted; a single
            string is treated as one pattern.

    Returns:
        A ``Tokenizer`` using the default prefix and suffix rules, the
        extended infix rules, and no ``token_match`` callback.
    """
    # Tokenizer is not imported at module level in the visible file, so
    # bring it into scope here to avoid a NameError at call time.
    from spacy.tokenizer import Tokenizer

    # A bare string would otherwise be exploded into single characters.
    extra_infixes = [regex] if isinstance(regex, str) else list(regex)

    # Use the default prefixes and suffixes unchanged.
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

    # Add our own rules to the end of the default infix rules.
    infix_re = compile_infix_regex(
        tuple(list(nlp.Defaults.infixes) + extra_infixes)
    )

    return Tokenizer(
        nlp.vocab,
        nlp.Defaults.tokenizer_exceptions,
        prefix_search=prefix_re.search,
        infix_finditer=infix_re.finditer,
        suffix_search=suffix_re.search,
        token_match=None,
    )
	from spacy.util import (compile_prefix_regex, compile_infix_regex, compile_suffix_regex)

	def _custom_tokenizer(self, nlp, regex=(r"[-/,.\n\s]",)):
		"""Build a spaCy tokenizer that splits on extra infix patterns.

		The default pattern is a character class matching a hyphen, slash,
		comma, period, newline or any whitespace, so date strings such as
		05-05-2015 and 05/05/2015 are split into their parts.

		Args:
			self: Not used by this function; kept so existing call sites
				keep working.
			nlp: Object exposing ``vocab`` and ``Defaults`` (``prefixes``,
				``suffixes``, ``infixes``, ``tokenizer_exceptions``) --
				presumably a spaCy ``Language`` instance.
			regex: Regular-expression strings appended after the default
				infix rules. Any iterable of strings is accepted; a single
				string is treated as one pattern.

		Returns:
			A ``Tokenizer`` using the default prefix and suffix rules, the
			extended infix rules, and no ``token_match`` callback.
		"""
		# Tokenizer is not imported alongside the spacy.util helpers, so
		# bring it into scope here to avoid a NameError at call time.
		from spacy.tokenizer import Tokenizer

		# A bare string would otherwise be exploded into single characters.
		extra_infixes = [regex] if isinstance(regex, str) else list(regex)

		# Use the default prefixes and suffixes unchanged.
		prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
		suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)

		# Add our own rules to the end of the default infix rules.
		infix_re = compile_infix_regex(
			tuple(list(nlp.Defaults.infixes) + extra_infixes)
		)

		return Tokenizer(
			nlp.vocab,
			nlp.Defaults.tokenizer_exceptions,
			prefix_search=prefix_re.search,
			infix_finditer=infix_re.finditer,
			suffix_search=suffix_re.search,
			token_match=None,
		)