Skip to content

Instantly share code, notes, and snippets.

@wesslen
Last active March 16, 2023 15:50
Show Gist options
  • Save wesslen/31c44ca0f83242c512772dcfe15a81fc to your computer and use it in GitHub Desktop.
Save wesslen/31c44ca0f83242c512772dcfe15a81fc to your computer and use it in GitHub Desktop.
Text classification (textcat) with pre-annotated overlapping spans. See https://support.prodi.gy/t/textcat-using-span-overlapping-view/6434
{"text":"Biomaterials and medical devices are broadly used in the diagnosis, treatment, repair, replacement or enhancing functions of human tissues or organs. Although the living conditions of human beings have been steadily improved in most parts of the world. ","label":"ID: 27047681","spans":[{ "start": 0, "end": 12, "label": "ORG" },{ "start": 0, "end": 12, "label": "ORG_2" }]}
from typing import Iterable, List, Optional, Union
import prodigy
from prodigy.components.loaders import get_stream
from prodigy.components.preprocess import add_tokens
from prodigy.util import split_string
import spacy
from spacy.language import Language
# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
@prodigy.recipe(
    "textcat.manual.spans",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("Loadable spaCy pipeline for tokenization or blank:lang (e.g. blank:en)", "positional", None, str),
    source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
    loader=("Loader (guessed from file extension if not set)", "option", "lo", str),
    exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string),
    highlight_chars=("Allow highlighting individual characters instead of tokens", "flag", "C", bool),
)
def textcat_manual_spans(
    dataset: str,
    spacy_model: Union[str, Language],
    source: Union[str, Iterable[dict]],
    loader: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    highlight_chars: bool = False,
):
    """Text classification with overlapping spans.

    Builds a stream of tasks from ``source``, adds a "tokens" key to each
    task, and serves them with the "classification" view. No label is set
    here, so any "label" and "spans" shown come from the incoming data.

    Args:
        dataset: Name of the dataset to save annotations to.
        spacy_model: Name/path of a spaCy pipeline (or ``blank:lang``) used
            for tokenization, or an already loaded ``Language`` object.
        source: Data to annotate (file path, ``-`` for stdin, or an iterable
            of task dicts).
        loader: Loader name passed to ``get_stream``.
        exclude: Dataset names whose annotations should be excluded.
        highlight_chars: Tokenize into characters instead of words.

    Returns:
        The dict of recipe components: view_id, dataset, stream and exclude.
    """
    # The signature admits a Language object, which spacy.load() cannot take;
    # use it directly and only load when a name/path string is given.
    if isinstance(spacy_model, Language):
        nlp = spacy_model
    else:
        nlp = spacy.load(spacy_model)

    stream = get_stream(
        source, loader=loader, rehash=True, dedup=True, input_key="text"
    )

    # Add "tokens" key to the tasks, either with words or characters.
    # NOTE(review): the original comment said removing this makes the spans
    # interface fall back to ner_manual; this recipe uses the classification
    # view, so confirm whether tokens are still required for rendering.
    stream = add_tokens(nlp, stream, use_chars=highlight_chars)

    return {
        "view_id": "classification",
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
    }
@wesslen
Copy link
Author

wesslen commented Mar 15, 2023

To run:

python -m prodigy textcat.manual.spans issue-6434 blank:en data/overlapping.jsonl -F scripts/textcat-manual-spans.py

@wesslen
Copy link
Author

wesslen commented Mar 15, 2023

localhost_8080_ (15)

@darrylestrada97
Copy link

I was able to modify it and add the labels.

from typing import Iterable, List, Optional, Union

import prodigy
from prodigy.components.loaders import get_stream
from prodigy.components.preprocess import add_tokens
from prodigy.util import split_string

import spacy
from spacy.language import Language

# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.

# Helper functions for adding user provided labels to annotation tasks.
def add_label_options_to_stream(stream, labels):
    """Attach a multiple-choice "options" list to every task in the stream.

    Each label becomes one option of the form ``{"id": label, "text": label}``.
    Tasks are modified in place and yielded lazily; the same options list
    object is assigned to every task.
    """
    choices = []
    for name in labels:
        choices.append({"id": name, "text": name})

    for example in stream:
        example["options"] = choices
        yield example

def add_labels_to_stream(stream, labels):
    """Set the "label" of every task in the stream to the first given label.

    Tasks are modified in place and yielded lazily. Any "label" value a task
    already carries is overwritten. Only ``labels[0]`` is used.
    """
    for example in stream:
        # Looked up per task, so an empty stream never touches ``labels``.
        example["label"] = labels[0]
        yield example



@prodigy.recipe(
    "textcat.manual.spans",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("Loadable spaCy pipeline for tokenization or blank:lang (e.g. blank:en)", "positional", None, str),
    source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    loader=("Loader (guessed from file extension if not set)", "option", "lo", str),
    exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string),
    highlight_chars=("Allow highlighting individual characters instead of tokens", "flag", "C", bool),
)
def textcat_manual_spans(
    dataset: str,
    spacy_model: Union[str, Language],
    source: Union[str, Iterable[dict]],
    label: Optional[List[str]] = None,
    loader: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    highlight_chars: bool = False,
):
    """Text classification with overlapping spans and optional labels.

    Builds a stream of tasks from ``source``, adds a "tokens" key to each
    task, and then attaches labels:

    * more than one label: every task gets an "options" list and the
      "choice" view is used;
    * exactly one label: every task's "label" is set to it and the
      "classification" view is used;
    * no labels: tasks are left untouched and the "classification" view is
      used, so any "label" shown comes from the incoming data.

    Args:
        dataset: Name of the dataset to save annotations to.
        spacy_model: Name/path of a spaCy pipeline (or ``blank:lang``) used
            for tokenization, or an already loaded ``Language`` object.
        source: Data to annotate (file path, ``-`` for stdin, or an iterable
            of task dicts).
        label: Labels to annotate with; may be omitted.
        loader: Loader name passed to ``get_stream``.
        exclude: Dataset names whose annotations should be excluded.
        highlight_chars: Tokenize into characters instead of words.

    Returns:
        The dict of recipe components: view_id, dataset, stream and exclude.
    """
    # The signature admits a Language object, which spacy.load() cannot take;
    # use it directly and only load when a name/path string is given.
    if isinstance(spacy_model, Language):
        nlp = spacy_model
    else:
        nlp = spacy.load(spacy_model)

    stream = get_stream(
        source, loader=loader, rehash=True, dedup=True, input_key="text"
    )

    # Add "tokens" key to the tasks, either with words or characters.
    stream = add_tokens(nlp, stream, use_chars=highlight_chars)

    # ``label`` defaults to None when --label is not passed; normalise it so
    # the length check below cannot raise TypeError.
    labels = label or []
    has_options = len(labels) > 1
    if has_options:
        stream = add_label_options_to_stream(stream, labels)
    elif labels:
        stream = add_labels_to_stream(stream, labels)
    # With no labels the stream is served as-is, which avoids the IndexError
    # that indexing an empty label list would cause.

    return {
        "view_id": "choice" if has_options else "classification",
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
    }

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment