Last active
March 16, 2023 15:50
-
-
Save wesslen/31c44ca0f83242c512772dcfe15a81fc to your computer and use it in GitHub Desktop.
Textcat classification with pre-annotated overlapping spans, see https://support.prodi.gy/t/textcat-using-span-overlapping-view/6434
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"text":"Biomaterials and medical devices are broadly used in the diagnosis, treatment, repair, replacement or enhancing functions of human tissues or organs. Although the living conditions of human beings have been steadily improved in most parts of the world. ","label":"ID: 27047681","spans":[{ "start": 0, "end": 12, "label": "ORG" },{ "start": 0, "end": 12, "label": "ORG_2" }]}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Iterable, List, Optional, Union

import prodigy
from prodigy.components.loaders import get_stream
from prodigy.components.preprocess import add_tokens
from prodigy.util import split_string
import spacy
from spacy.language import Language


# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
@prodigy.recipe(
    "textcat.manual.spans",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("Loadable spaCy pipeline for tokenization or blank:lang (e.g. blank:en)", "positional", None, str),
    source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
    loader=("Loader (guessed from file extension if not set)", "option", "lo", str),
    exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string),
    highlight_chars=("Allow highlighting individual characters instead of tokens", "flag", "C", bool),
)
def textcat_manual_spans(
    dataset: str,
    spacy_model: Union[str, Language],
    source: Union[str, Iterable[dict]],
    loader: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    highlight_chars: bool = False,
):
    """
    Text classification with pre-annotated overlapping spans.

    Loads the incoming examples, tokenizes them so any "spans" in the task
    render in the UI, and serves them with the classification interface.
    Returns the recipe components dict consumed by the Prodigy server.
    """
    nlp = spacy.load(spacy_model)
    stream = get_stream(
        source, loader=loader, rehash=True, dedup=True, input_key="text"
    )
    # Add "tokens" key to the tasks, either with words or characters.
    # If you remove this, the spans interface will switch back to ner_manual.
    stream = add_tokens(nlp, stream, use_chars=highlight_chars)
    return {
        "view_id": "classification",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
    }
I was able to modify the recipe so it also accepts labels via `--label` and adds them to each task:
from typing import Iterable, List, Optional, Union
import prodigy
from prodigy.components.loaders import get_stream
from prodigy.components.preprocess import add_tokens
from prodigy.util import split_string
import spacy
from spacy.language import Language
# Recipe decorator with argument annotations: (description, argument type,
# shortcut, type / converter function called on value before it's passed to
# the function). Descriptions are also shown when typing --help.
# Helper functions for adding user provided labels to annotation tasks.
def add_label_options_to_stream(stream, labels):
    """Yield each task from *stream* with an "options" list built from *labels*.

    Every task receives the same choice list, one entry per label, which
    switches the annotation UI into multiple-choice mode.
    """
    choices = [dict(id=lbl, text=lbl) for lbl in labels]
    for example in stream:
        example["options"] = choices
        yield example
def add_labels_to_stream(stream, labels):
    """Yield each task from *stream* with its "label" set to the first label.

    Used when only a single label was supplied, so the UI shows a plain
    accept/reject classification card instead of a choice list.
    """
    for task_dict in stream:
        task_dict.update(label=labels[0])
        yield task_dict
@prodigy.recipe(
    "textcat.manual.spans",
    dataset=("The dataset to use", "positional", None, str),
    spacy_model=("Loadable spaCy pipeline for tokenization or blank:lang (e.g. blank:en)", "positional", None, str),
    source=("Data to annotate (file path or '-' to read from standard input)", "positional", None, str),
    label=("One or more comma-separated labels", "option", "l", split_string),
    loader=("Loader (guessed from file extension if not set)", "option", "lo", str),
    exclude=("Comma-separated list of dataset IDs whose annotations to exclude", "option", "e", split_string),
    highlight_chars=("Allow highlighting individual characters instead of tokens", "flag", "C", bool),
)
def textcat_manual_spans(
    dataset: str,
    spacy_model: Union[str, Language],
    source: Union[str, Iterable[dict]],
    label: Optional[List[str]] = None,
    loader: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    highlight_chars: bool = False,
):
    """
    Text classification with pre-annotated overlapping spans.

    Loads the incoming examples, tokenizes them so any "spans" render in the
    UI, attaches the user-supplied label(s), and serves the tasks with the
    "choice" interface (multiple labels) or "classification" interface
    (zero or one label). Returns the recipe components dict consumed by the
    Prodigy server.
    """
    # BUG FIX: `label` defaults to None when --label is omitted, so the
    # original `len(label)` raised TypeError; an empty list would also make
    # add_labels_to_stream fail with IndexError. Normalize to a list and only
    # tag tasks when at least one label was provided.
    label = label or []
    nlp = spacy.load(spacy_model)
    stream = get_stream(
        source, loader=loader, rehash=True, dedup=True, input_key="text"
    )
    # Add "tokens" key to the tasks, either with words or characters.
    # If you remove this, the spans interface will switch back to ner_manual.
    stream = add_tokens(nlp, stream, use_chars=highlight_chars)
    # Add labels to each task in the stream.
    has_options = len(label) > 1
    if has_options:
        stream = add_label_options_to_stream(stream, label)
    elif label:
        stream = add_labels_to_stream(stream, label)
    # With no --label given, tasks are served as-is (e.g. using a
    # pre-annotated "label" field already present in the data).
    return {
        "view_id": "choice" if has_options else "classification",
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
    }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
To run (save the recipe as `recipe.py` and pass it with `-F`):
`prodigy textcat.manual.spans my_dataset blank:en ./data.jsonl --label LABEL1,LABEL2 -F recipe.py`