Utilities to split long Prodigy-annotated documents into snippets and back.
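The two scripts below are the merge and split halves of the same round trip, and both import the util module at the end. A sketch of the intended workflow, assuming the scripts are saved as make_snippets.py and merge_snippets.py and the annotations live in a Prodigy dataset named annotated_snippets (all three names are assumptions for illustration):

# 1. Split long articles into annotation-sized snippets:
#     python make_snippets.py articles.jsonl snippets.jsonl
# 2. Annotate the snippets, e.g. with ner.manual, then export the dataset:
#     prodigy db-out annotated_snippets > annotated_snippets.jsonl
# 3. Merge the annotated snippets back into full articles:
#     python merge_snippets.py annotated_snippets.jsonl articles_merged.jsonl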
from pathlib import Path
from collections import defaultdict

import srsly
import spacy
import typer

from util import Snippet, Document


def main(snippets_path: Path, articles_path: Path) -> None:
    """
    Merge annotated snippets back into articles.

    Args:
        snippets_path (Path): Path to the annotated snippets in JSONL format.
        articles_path (Path): Path to save the merged articles in JSONL format.
    """
    nlp = spacy.blank("en")
    articles_path.parent.mkdir(parents=True, exist_ok=True)
    dataset = srsly.read_jsonl(snippets_path)
    prodigy_articles = []
    # Group the snippets by the ID of the article they came from.
    doc_id_to_snippets = defaultdict(list)
    for example in dataset:
        snippet = Snippet.from_prodigy(nlp, example)
        doc_id_to_snippets[snippet.doc_id].append(snippet)
    # Reassemble each group of snippets into a single article.
    for snippets in doc_id_to_snippets.values():
        document = Document.from_snippets(snippets)
        prodigy_articles.append(document.to_prodigy())
    srsly.write_jsonl(articles_path, prodigy_articles)


if __name__ == "__main__":
    typer.run(main)
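For reference, here is a minimal example of the kind of snippet record this script consumes, hand-written to match the fields Snippet.from_prodigy reads below (the values are made up; real records exported from Prodigy carry additional keys such as _view_id and the task hashes):

{
    "text": "ACME Inc. was founded.",
    "meta": {"article_id": "doc-1", "part": 0},
    "tokens": [
        {"text": "ACME", "start": 0, "end": 4, "id": 0, "ws": True},
        {"text": "Inc.", "start": 5, "end": 9, "id": 1, "ws": True},
        {"text": "was", "start": 10, "end": 13, "id": 2, "ws": True},
        {"text": "founded", "start": 14, "end": 21, "id": 3, "ws": False},
        {"text": ".", "start": 21, "end": 22, "id": 4, "ws": False},
    ],
    "spans": [{"start": 0, "end": 9, "label": "ORG", "token_start": 0, "token_end": 1}],
    "answer": "accept",
}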
from pathlib import Path

import srsly
import spacy
import typer

from util import Document


def main(articles_path: Path, snippets_path: Path) -> None:
    """
    Split articles into snippets, respecting span annotations if available.

    Args:
        articles_path (Path): Path to the articles in JSONL format.
        snippets_path (Path): Path to save the Prodigy snippets.
    """
    nlp = spacy.blank("en")
    snippets_path.parent.mkdir(parents=True, exist_ok=True)
    dataset = srsly.read_jsonl(articles_path)
    prodigy_snippets = []
    for example in dataset:
        # Convert the Prodigy annotation into a spaCy doc.
        document = Document.from_prodigy(nlp, example)
        # Split into snippets based on custom logic.
        snippets = document.make_snippets()
        # Convert the snippets back to Prodigy's task format.
        prodigy_snippets.extend([snippet.to_prodigy() for snippet in snippets])
    srsly.write_jsonl(snippets_path, prodigy_snippets)


if __name__ == "__main__":
    typer.run(main)
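Both scripts expect their input to already be in Prodigy's task format, with a "tokens" list and an "article_id" in "meta", since Document.from_prodigy rebuilds the spaCy doc from those fields. A minimal sketch of producing such article records from raw text (the texts and output file name are made up for illustration):

import spacy
import srsly
from prodigy.components.preprocess import get_token

nlp = spacy.blank("en")
texts = {"doc-1": "ACME was founded in 1999.\n\nIt is based in Berlin."}
records = []
for article_id, text in texts.items():
    doc = nlp(text)
    records.append(
        {
            "text": text,
            "meta": {"article_id": article_id},
            "tokens": [get_token(t, t.i) for t in doc],
            "spans": [],  # pre-annotated spans with character offsets could go here
        }
    )
srsly.write_jsonl("articles.jsonl", records)

The util module that both scripts import follows.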
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from spacy.tokens import Doc, Span
from spacy.language import Language
from spacy.vocab import Vocab

from prodigy.components.preprocess import get_token
from prodigy.util import BINARY_ATTR, set_hashes
@dataclass
class Snippet:
    """A snippet of a document to be used in the ner.manual interface."""

    doc_id: str
    index: int
    doc: Doc
    is_final: bool = False
    _view_id: Optional[str] = None
    answer: Optional[str] = None
    _annotator_id: Optional[str] = None
    _session_id: Optional[str] = None
    _timestamp: Optional[str] = None

    def __repr__(self) -> str:
        return f"Snippet({self.doc_id}, {self.index}): {self.doc.text}"

    @classmethod
    def from_prodigy(cls, nlp: Language, task: dict) -> "Snippet":
        doc = prodigy2spacy_ner(nlp.vocab, task)
        return cls(
            # This (or a similar) property should exist in the task's meta data.
            task["meta"]["article_id"],
            task["meta"]["part"],
            doc,
            _view_id=task.get("_view_id"),
            answer=task.get("answer"),
            _annotator_id=task.get("_annotator_id"),
            _session_id=task.get("_session_id"),
            _timestamp=task.get("_timestamp"),
        )

    def to_prodigy(self) -> dict:
        return spacy2prodigy_ner_snippet(self)
@dataclass
class Document:
    """Document that can be divided into snippets and put back together."""

    doc_id: str
    doc: Doc
    _view_id: Optional[str] = None
    answer: Optional[str] = None
    _annotator_id: Optional[str] = None
    _session_id: Optional[str] = None
    _timestamp: Optional[str] = None

    @classmethod
    def from_prodigy(cls, nlp: Language, task: dict) -> "Document":
        doc = prodigy2spacy_ner(nlp.vocab, task)
        return cls(
            # This (or a similar) property should exist in the task's meta data.
            task["meta"]["article_id"],
            doc,
            task.get("_view_id"),
            task.get("answer"),
            task.get("_annotator_id"),
            task.get("_session_id"),
            task.get("_timestamp"),
        )

    @classmethod
    def from_snippets(cls, snippets: List[Snippet]) -> "Document":
        """Join a sequence of snippets back up into a single document."""
        doc_ids = set()
        view_ids = set()
        answers = set()
        annotator_ids = set()
        session_ids = set()
        timestamps = set()
        for snippet in snippets:
            doc_ids.add(snippet.doc_id)
            view_ids.add(snippet._view_id)
            answers.add(snippet.answer)
            annotator_ids.add(snippet._annotator_id)
            session_ids.add(snippet._session_id)
            timestamps.add(snippet._timestamp)
        for attr in [doc_ids, view_ids]:
            # Ignore the "review" view ID set by Prodigy's review interface.
            if "review" in attr:
                attr.remove("review")
            if len(attr) != 1:
                raise ValueError(f"All snippets must have the same value, got: {attr}")
        # We don't just use .sort here, as it's impolite to permute the input.
        snippets = sorted(snippets, key=lambda s: s.index)
        doc = Doc.from_docs([s.doc for s in snippets])
        # For the remaining attributes, an arbitrary value is taken.
        return cls(
            list(doc_ids)[0],
            doc,
            list(view_ids)[0],
            list(answers)[0],
            list(annotator_ids)[0],
            list(session_ids)[0],
            list(timestamps)[0],
        )
    def make_snippets(self) -> List[Snippet]:
        spans = []
        start = 0
        for token in self.doc:
            # This logic splits on whitespace tokens that start with a newline
            # and span at least two characters, i.e. blank lines. Substitute it
            # with whatever logic splits the doc into snippets that fit your
            # purpose.
            if token.is_space and token.text[0] == "\n" and len(token) >= 2:
                spans.append(self.doc[start : token.i + 1])
                start = token.i + 1
        if start < len(self.doc):
            spans.append(self.doc[start:])
        return [
            Snippet(
                self.doc_id,
                i,
                doc=span.as_doc(),
                is_final=i == (len(spans) - 1),
                _view_id=self._view_id,
                answer=self.answer,
                _annotator_id=self._annotator_id,
                _session_id=self._session_id,
                _timestamp=self._timestamp,
            )
            for i, span in enumerate(spans)
        ]

    def to_prodigy(self) -> dict:
        return spacy2prodigy_ner_document(self)
def spacy2prodigy_ner_snippet(snippet: Snippet, *, source: Optional[str] = None) -> dict:
    task: Dict[str, Any] = {
        "text": snippet.doc.text,
        "meta": {"article_id": snippet.doc_id, "part": snippet.index},
        "tokens": [get_token(t, t.i) for t in snippet.doc],
        "spans": [get_ent(ent, source=source) for ent in snippet.doc.ents],
        BINARY_ATTR: False,
        "_view_id": snippet._view_id,
        "answer": snippet.answer,
        "_timestamp": snippet._timestamp,
        "_annotator_id": snippet._annotator_id,
        "_session_id": snippet._session_id,
    }
    task = set_hashes(task)
    return task


def spacy2prodigy_ner_document(document: Document, *, source: Optional[str] = None) -> dict:
    task: Dict[str, Any] = {
        "text": document.doc.text,
        "meta": {"article_id": document.doc_id},
        "tokens": [get_token(t, t.i) for t in document.doc],
        "spans": [get_ent(ent, source=source) for ent in document.doc.ents],
        BINARY_ATTR: False,
        "_view_id": document._view_id,
        "answer": document.answer,
        "_timestamp": document._timestamp,
        "_annotator_id": document._annotator_id,
        "_session_id": document._session_id,
    }
    task = set_hashes(task)
    return task
def prodigy2spacy_ner(vocab: Vocab, task: dict) -> Doc:
    """Convert Prodigy annotations to a spaCy doc."""
    tokens = task.get("tokens", [])
    words = [token["text"] for token in tokens]
    spaces = [token["ws"] for token in tokens]
    doc = Doc(vocab, words=words, spaces=spaces)
    spans = []
    for span in task.get("spans", []):
        # The character offsets must align with the token boundaries above.
        spans.append(doc.char_span(span["start"], span["end"], span["label"]))
    doc.set_ents(spans)
    return doc
def get_ent(ent: Span, *, source: Optional[str]) -> dict:
    return {
        "token_start": ent.start,
        "token_end": ent.end - 1,
        "start": ent.start_char,
        "end": ent.end_char,
        "text": ent.text,
        "label": ent.label_,
        "source": source,
    }
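Finally, a minimal round-trip sketch of the helpers above (the text and label are made up; it assumes spaCy and Prodigy are installed):

import spacy
from prodigy.components.preprocess import get_token
from util import Document

nlp = spacy.blank("en")
text = "ACME was founded in 1999.\n\nIt is based in Berlin."
doc = nlp(text)
# Build a task in the format prodigy2spacy_ner expects.
task = {
    "text": text,
    "meta": {"article_id": "doc-1"},
    "tokens": [get_token(t, t.i) for t in doc],
    "spans": [{"start": 0, "end": 4, "label": "ORG"}],
}
document = Document.from_prodigy(nlp, task)
snippets = document.make_snippets()  # splits on the blank line
assert len(snippets) == 2
merged = Document.from_snippets(snippets)
assert merged.doc.text == text
assert [ent.label_ for ent in merged.doc.ents] == ["ORG"]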