# jamesdunham/template.py

## template.py
from copy import copy

import spacy
from spacy.tokens import Doc, Span


class Template(object):
    """Create synthetic NER training data from a template document.

    Provide a template NER-annotated spacy Doc when instantiating the class. Passing text to the `render` method
    populates the templated entity spans, preserving entity labels, and generates a new Doc.

    Attributes
    ----------
    template : Doc
        NER-annotated template document.
    ents : list
        Entities of the template document, replaced when rendered. A list of 4-tuples like `[(ent.text,
        ent.start_char, ent.end_char, ent.label_)]`.

    Methods
    -------
    update(template)
        Replace the existing template.
    render(substitutes)
        Populate the template by replacing its entity spans with `substitutes`.
    __len__()
        Count of template entities.
    __str__()
        Template text.

    Notes
    -----
    Any attributes of the template document and its component elements (e.g., tokens) are discarded, aside from IOB
    tags and entity labels.
    """

    def __init__(self, template: Doc):
        """Store `template` and prepare the blank English pipeline used to build rendered Docs.

        Raises
        ------
        ValueError
            If `template` has no entities (see `update`).
        """
        self._blank_nlp = spacy.blank('en')
        self.update(template)

    def __str__(self):
        """Return the text of the template document."""
        return self.template.text

    def __len__(self):
        """Return the number of entities in the template document."""
        return len(self.ents)

    def update(self, template: Doc):
        """Replace the existing template.

        Parameters
        ----------
        template : Doc
            NER-annotated document with at least one entity.

        Raises
        ------
        ValueError
            If `template` has no entities. The instance is left unchanged in that case.
        """
        # Validate before assigning so a rejected template cannot leave `template` and `ents` out of sync.
        # An explicit raise (rather than `assert`) keeps the check active under `python -O`.
        if not template.ents:
            raise ValueError('Template document must contain at least one entity')
        self.template = template
        self.ents = self._extract_ents(template)

    def render(self, substitutes: list) -> Doc:
        """Populate the template by replacing its entity spans with `substitutes`.

        Parameters
        ----------
        substitutes : list
            Replacement strings, used in order: the i-th substitute replaces the i-th entity span of the template.
            Substitutes beyond the number of template entities are ignored. The list is not modified.

        Returns
        -------
        Doc
            New document built from the rendered text, with one entity per substitute that could be mapped onto
            the new tokenization, labelled like the template entity it replaced.

        Raises
        ------
        ValueError
            If fewer substitutes than template entities are given.
        """
        if len(self.ents) > len(substitutes):
            raise ValueError('Need at least as many substitute entities as original entities')

        template = self.template
        n_tokens = len(template)
        output_text = []
        output_spans = []
        output_idx = 0  # character offset into the text built so far
        next_substitute = 0  # index of the next unused substitute
        for token_idx, token in enumerate(template):
            iob = token.ent_iob_
            if iob in ('', 'O'):
                # Pass non-entity tokens through
                output_text.append(token.text_with_ws)
                output_idx += len(token.text_with_ws)
                continue
            if iob == 'B':
                # The first token of an entity span stands in for the whole span: emit the substitute here and
                # record its character offsets in the output text.
                substitute = substitutes[next_substitute]
                next_substitute += 1
                output_spans.append((output_idx, output_idx + len(substitute), token.ent_type_))
                output_text.append(substitute)
                output_idx += len(substitute)
            # A span ends at the final token of the document or when the following token does not continue it.
            # The end-of-document case is tested explicitly: clamping the look-ahead index would compare a final
            # 'I' token with itself and lose its trailing whitespace.
            is_span_end = token_idx + 1 == n_tokens or template[token_idx + 1].ent_iob_ != 'I'
            if is_span_end:
                output_text.append(token.whitespace_)
                output_idx += len(token.whitespace_)
        output_doc = self._blank_nlp(''.join(output_text))
        self._add_ents(output_doc, output_spans)
        return output_doc

    @staticmethod
    def _add_ents(doc, spans):
        """Add entity spans, given as `(start_char, end_char, label)` tuples, to `doc` and return it.

        Spans for which `doc.char_span` yields no usable span are skipped with a warning instead of failing.
        """
        import warnings  # local import: only needed on this diagnostic path

        new_ents = []
        for start, end, label in spans:
            span = doc.char_span(start, end, label=label)
            if span:
                new_ents.append(span)
            else:
                warnings.warn(
                    'Skipping invalid span (start={}, end={}, label={!r})'.format(start, end, label))
        if new_ents:
            # Assign once rather than once per span.
            doc.ents = list(doc.ents) + new_ents
        return doc

    @staticmethod
    def _extract_ents(doc):
        """Return `(text, start_char, end_char, label_)` for each entity in `doc`."""
        return [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]


def test_one_ent():
    """Render a template built from a one-entity sentence and check the text and the substituted entity."""
    pipeline = spacy.load('en_core_web_sm')
    source_doc = pipeline(u'Cambridge is full of rabbits.')
    rendered = Template(source_doc).render(['New York'])

    assert isinstance(rendered, Doc)
    assert rendered.text == 'New York is full of rabbits.'
    rendered_ents = rendered.ents
    assert rendered_ents and isinstance(rendered_ents[0], Span)
    assert rendered_ents[0].text == 'New York'


def test_two_ents():
    """Render a template built from a two-entity sentence and check both substitutions, in order."""
    pipeline = spacy.load('en_core_web_sm')
    source_doc = pipeline(u'New York or San Francisco?')
    replacements = ['Boston', 'Philadelphia']
    rendered = Template(source_doc).render(replacements)

    assert isinstance(rendered, Doc)
    assert rendered.text == 'Boston or Philadelphia?'
    assert len(rendered.ents) == 2
    first_ent, second_ent = rendered.ents[0], rendered.ents[1]
    assert first_ent.text == 'Boston'
    assert second_ent.text == 'Philadelphia'


if __name__ == '__main__':
    # Run the checks in order when the module is executed as a script.
    for check in (test_one_ent, test_two_ents):
        check()
	from copy import copy

	import spacy
	from spacy.tokens import Doc, Span


	class Template(object):
	    """Create synthetic NER training data from a template document.

	    Provide a template NER-annotated spacy Doc when instantiating the class. Passing text to the `render` method
	    populates the templated entity spans, preserving entity labels, and generates a new Doc.

	    Attributes
	    ----------
	    template : Doc
	        NER-annotated template document.
	    ents : list
	        Entities of the template document, replaced when rendered. A list of 4-tuples like `[(ent.text,
	        ent.start_char, ent.end_char, ent.label_)]`.

	    Methods
	    -------
	    update(template)
	        Replace the existing template.
	    render(substitutes)
	        Populate the template by replacing its entity spans with `substitutes`.
	    __len__()
	        Count of template entities.
	    __str__()
	        Template text.

	    Notes
	    -----
	    Any attributes of the template document and its component elements (e.g., tokens) are discarded, aside from IOB
	    tags and entity labels.
	    """

	    def __init__(self, template: Doc):
	        """Store `template` and prepare the blank English pipeline used to build rendered Docs.

	        Raises
	        ------
	        ValueError
	            If `template` has no entities (see `update`).
	        """
	        self._blank_nlp = spacy.blank('en')
	        self.update(template)

	    def __str__(self):
	        """Return the text of the template document."""
	        return self.template.text

	    def __len__(self):
	        """Return the number of entities in the template document."""
	        return len(self.ents)

	    def update(self, template: Doc):
	        """Replace the existing template.

	        Parameters
	        ----------
	        template : Doc
	            NER-annotated document with at least one entity.

	        Raises
	        ------
	        ValueError
	            If `template` has no entities. The instance is left unchanged in that case.
	        """
	        # Validate before assigning so a rejected template cannot leave `template` and `ents` out of sync.
	        # An explicit raise (rather than `assert`) keeps the check active under `python -O`.
	        if not template.ents:
	            raise ValueError('Template document must contain at least one entity')
	        self.template = template
	        self.ents = self._extract_ents(template)

	    def render(self, substitutes: list) -> Doc:
	        """Populate the template by replacing its entity spans with `substitutes`.

	        Parameters
	        ----------
	        substitutes : list
	            Replacement strings, used in order: the i-th substitute replaces the i-th entity span of the template.
	            Substitutes beyond the number of template entities are ignored. The list is not modified.

	        Returns
	        -------
	        Doc
	            New document built from the rendered text, with one entity per substitute that could be mapped onto
	            the new tokenization, labelled like the template entity it replaced.

	        Raises
	        ------
	        ValueError
	            If fewer substitutes than template entities are given.
	        """
	        if len(self.ents) > len(substitutes):
	            raise ValueError('Need at least as many substitute entities as original entities')

	        template = self.template
	        n_tokens = len(template)
	        output_text = []
	        output_spans = []
	        output_idx = 0  # character offset into the text built so far
	        next_substitute = 0  # index of the next unused substitute
	        for token_idx, token in enumerate(template):
	            iob = token.ent_iob_
	            if iob in ('', 'O'):
	                # Pass non-entity tokens through
	                output_text.append(token.text_with_ws)
	                output_idx += len(token.text_with_ws)
	                continue
	            if iob == 'B':
	                # The first token of an entity span stands in for the whole span: emit the substitute here and
	                # record its character offsets in the output text.
	                substitute = substitutes[next_substitute]
	                next_substitute += 1
	                output_spans.append((output_idx, output_idx + len(substitute), token.ent_type_))
	                output_text.append(substitute)
	                output_idx += len(substitute)
	            # A span ends at the final token of the document or when the following token does not continue it.
	            # The end-of-document case is tested explicitly: clamping the look-ahead index would compare a final
	            # 'I' token with itself and lose its trailing whitespace.
	            is_span_end = token_idx + 1 == n_tokens or template[token_idx + 1].ent_iob_ != 'I'
	            if is_span_end:
	                output_text.append(token.whitespace_)
	                output_idx += len(token.whitespace_)
	        output_doc = self._blank_nlp(''.join(output_text))
	        self._add_ents(output_doc, output_spans)
	        return output_doc

	    @staticmethod
	    def _add_ents(doc, spans):
	        """Add entity spans, given as `(start_char, end_char, label)` tuples, to `doc` and return it.

	        Spans for which `doc.char_span` yields no usable span are skipped with a warning instead of failing.
	        """
	        import warnings  # local import: only needed on this diagnostic path

	        new_ents = []
	        for start, end, label in spans:
	            span = doc.char_span(start, end, label=label)
	            if span:
	                new_ents.append(span)
	            else:
	                warnings.warn(
	                    'Skipping invalid span (start={}, end={}, label={!r})'.format(start, end, label))
	        if new_ents:
	            # Assign once rather than once per span.
	            doc.ents = list(doc.ents) + new_ents
	        return doc

	    @staticmethod
	    def _extract_ents(doc):
	        """Return `(text, start_char, end_char, label_)` for each entity in `doc`."""
	        return [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]


	def test_one_ent():
	    """Render a template built from a one-entity sentence and check the text and the substituted entity."""
	    nlp = spacy.load('en_core_web_sm')
	    doc = nlp(u'Cambridge is full of rabbits.')
	    template = Template(doc)
	    substitutes = ['New York']
	    output_doc = template.render(substitutes)
	    assert isinstance(output_doc, Doc)
	    assert output_doc.text == 'New York is full of rabbits.'
	    assert output_doc.ents and isinstance(output_doc.ents[0], Span)
	    assert output_doc.ents[0].text == 'New York'


	def test_two_ents():
	    """Render a template built from a two-entity sentence and check both substitutions, in order."""
	    nlp = spacy.load('en_core_web_sm')
	    doc = nlp(u'New York or San Francisco?')
	    template = Template(doc)
	    substitutes = ['Boston', 'Philadelphia']
	    output_doc = template.render(substitutes)
	    assert isinstance(output_doc, Doc)
	    assert output_doc.text == 'Boston or Philadelphia?'
	    assert len(output_doc.ents) == 2
	    assert output_doc.ents[0].text == 'Boston'
	    assert output_doc.ents[1].text == 'Philadelphia'


	if __name__ == '__main__':
	    # Run the checks in order when the module is executed as a script.
	    test_one_ent()
	    test_two_ents()