wesslen/overlapping.jsonl

## overlapping.jsonl
{"text":"Biomaterials and medical devices are broadly used in the diagnosis, treatment, repair, replacement or enhancing functions of human tissues or organs. Although the living conditions of human beings have been steadily improved in most parts of the world. ","label":"ID: 27047681","spans":[{ "start": 0, "end": 12, "label": "ORG" },{ "start": 0, "end": 12, "label": "ORG_2" }]}

## textcat-manual-spans.py
from typing import Iterable, List, Optional, Union

import prodigy
from prodigy.components.loaders import get_stream
from prodigy.components.preprocess import add_tokens
from prodigy.util import split_string

import spacy
from spacy.language import Language

# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
@prodigy.recipe(
    "textcat.manual.spans",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("Loadable spaCy pipeline for tokenization or blank:lang (e.g. blank:en)", "positional", None, str),
    source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
    loader=("Loader (guessed from file extension if not set)", "option", "lo", str),
    exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string),
    highlight_chars=("Allow highlighting individual characters instead of tokens", "flag", "C", bool),
)

def textcat_manual_spans(
    dataset: str,
    spacy_model: Union[str, Language],
    source: Union[str, Iterable[dict]],
    loader: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    highlight_chars: bool = False,
):
    """
    Text classification with overlapping spans
    """

    nlp = spacy.load(spacy_model)
    stream = get_stream(
        source, loader=loader, rehash=True, dedup=True, input_key="text"
    )
    # Add "tokens" key to the tasks, either with words or characters
    # If you remove this, the spans interface will switch back to ner_manual
    stream = add_tokens(nlp, stream, use_chars=highlight_chars)

    return {
        "view_id": "classification",
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
    }
	from typing import Iterable, List, Optional, Union

	import prodigy
	from prodigy.components.loaders import get_stream
	from prodigy.components.preprocess import add_tokens
	from prodigy.util import split_string

	import spacy
	from spacy.language import Language

	# Recipe decorator with argument annotations: (description, argument type,
	# shortcut, type / converter function called on value before it's passed to
	# the function). Descriptions are also shown when typing --help.
	@prodigy.recipe(
	"textcat.manual.spans",
	dataset=("The dataset to use", "positional", None, str),
	spacy_model=("Loadable spaCy pipeline for tokenization or blank:lang (e.g. blank:en)", "positional", None, str),
	source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
	loader=("Loader (guessed from file extension if not set)", "option", "lo", str),
	exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string),
	highlight_chars=("Allow highlighting individual characters instead of tokens", "flag", "C", bool),
	)

	def textcat_manual_spans(
	dataset: str,
	spacy_model: Union[str, Language],
	source: Union[str, Iterable[dict]],
	loader: Optional[str] = None,
	exclude: Optional[List[str]] = None,
	highlight_chars: bool = False,
	):
	"""
	Text classification with overlapping spans
	"""

	nlp = spacy.load(spacy_model)
	stream = get_stream(
	source, loader=loader, rehash=True, dedup=True, input_key="text"
	)
	# Add "tokens" key to the tasks, either with words or characters
	# If you remove this, the spans interface will switch back to ner_manual
	stream = add_tokens(nlp, stream, use_chars=highlight_chars)

	return {
	"view_id": "classification",
	"dataset": dataset, # Name of dataset to save annotations
	"stream": stream, # Incoming stream of examples
	"exclude": exclude, # List of dataset names to exclude
	}