Utilities to split long Prodigy-annotated documents into snippets and merge them back.
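Both scripts below read and write Prodigy NER tasks as JSONL, one JSON object per line. The following is a minimal sketch of the task shape the code relies on; the values are illustrative, and real tasks may carry extra keys such as _view_id or _timestamp:

# An illustrative Prodigy ner.manual task, inferred from the code below;
# the "part" key is only present on snippet tasks.
example_task = {
    "text": "Apple opened.",
    "meta": {"article_id": "article-1", "part": 0},
    "tokens": [
        {"text": "Apple", "start": 0, "end": 5, "id": 0, "ws": True},
        {"text": "opened", "start": 6, "end": 12, "id": 1, "ws": False},
        {"text": ".", "start": 12, "end": 13, "id": 2, "ws": False},
    ],
    "spans": [
        {"start": 0, "end": 5, "token_start": 0, "token_end": 0, "label": "ORG"}
    ],
    "answer": "accept",
}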
# Script 1: merge annotated snippets back into full articles.
from collections import defaultdict
from pathlib import Path

import spacy
import srsly
import typer

from util import Document, Snippet


def main(snippets_path: Path, articles_path: Path) -> None:
    """
    Merge annotated snippets into articles.

    Args:
        snippets_path (Path): Path to the snippets in JSONL format.
        articles_path (Path): Path to save articles in JSONL format.
    """
    nlp = spacy.blank("en")
    articles_path.parent.mkdir(parents=True, exist_ok=True)
    dataset = srsly.read_jsonl(snippets_path)
    prodigy_articles = []
    # group the snippets by the id of the article they came from
    doc_id_to_snippets = defaultdict(list)
    for example in dataset:
        snippet = Snippet.from_prodigy(nlp, example)
        doc_id_to_snippets[snippet.doc_id].append(snippet)
    # join each group of snippets back into a single article
    for doc_id, snippets in doc_id_to_snippets.items():
        document = Document.from_snippets(snippets)
        prodigy_articles.append(document.to_prodigy())
    srsly.write_jsonl(articles_path, prodigy_articles)


if __name__ == "__main__":
    typer.run(main)
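Assuming the merge script above is saved as merge_snippets.py (a hypothetical filename; the gist's original filenames are not preserved here), Typer exposes the two arguments as a CLI:

    python merge_snippets.py annotated_snippets.jsonl merged_articles.jsonl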
# Script 2: split full articles into snippets for annotation.
from pathlib import Path

import spacy
import srsly
import typer

from util import Document


def main(articles_path: Path, snippets_path: Path) -> None:
    """
    Split articles into snippets, respecting span annotations if available.

    Args:
        articles_path (Path): Path to the articles in JSONL format.
        snippets_path (Path): Path to save the Prodigy snippets.
    """
    nlp = spacy.blank("en")
    snippets_path.parent.mkdir(parents=True, exist_ok=True)
    dataset = srsly.read_jsonl(articles_path)
    prodigy_snippets = []
    for example in dataset:
        # convert the Prodigy annotation into a spaCy doc
        document = Document.from_prodigy(nlp, example)
        # split into snippets based on custom logic
        snippets = document.make_snippets()
        # convert the snippets back to the Prodigy task format
        prodigy_snippets.extend([snippet.to_prodigy() for snippet in snippets])
    srsly.write_jsonl(snippets_path, prodigy_snippets)


if __name__ == "__main__":
    typer.run(main)
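Likewise, assuming the split script is saved as split_articles.py (also a hypothetical filename):

    python split_articles.py articles.jsonl snippets.jsonl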
# util.py: shared Snippet and Document helpers used by both scripts above.
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from spacy.language import Language
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab

from prodigy.components.preprocess import get_token
from prodigy.util import BINARY_ATTR, set_hashes


@dataclass
class Snippet:
    """A snippet of a document to be used in the ner.manual interface."""

    doc_id: str
    index: int
    doc: Doc
    _view_id: Optional[str] = None
    answer: Optional[str] = None
    _annotator_id: Optional[str] = None
    _session_id: Optional[str] = None
    _timestamp: Optional[str] = None
    # marks the last snippet of a document (set by Document.make_snippets)
    is_final: bool = False

    def __repr__(self) -> str:
        return f"Snippet({self.doc_id}, {self.index}): {self.doc.text}"

    @classmethod
    def from_prodigy(cls, nlp: Language, task: dict) -> "Snippet":
        doc = prodigy2spacy_ner(nlp.vocab, task)
        return cls(
            task["meta"]["article_id"],  # this (or a similar) property should exist in the metadata
            task["meta"]["part"],
            doc,
            task.get("_view_id"),
            task.get("answer"),
            task.get("_annotator_id"),
            task.get("_session_id"),
            task.get("_timestamp"),
        )

    def to_prodigy(self) -> dict:
        return spacy2prodigy_ner_snippet(self)
@dataclass
class Document:
    """Document that can be divided into snippets and put back together."""

    doc_id: str
    doc: Doc
    _view_id: Optional[str] = None
    answer: Optional[str] = None
    _annotator_id: Optional[str] = None
    _session_id: Optional[str] = None
    _timestamp: Optional[str] = None

    @classmethod
    def from_prodigy(cls, nlp: Language, task: dict) -> "Document":
        doc = prodigy2spacy_ner(nlp.vocab, task)
        return cls(
            task["meta"]["article_id"],  # this (or a similar) property should exist in the metadata
            doc,
            task.get("_view_id"),
            task.get("answer"),
            task.get("_annotator_id"),
            task.get("_session_id"),
            task.get("_timestamp"),
        )

    @classmethod
    def from_snippets(cls, snippets: List[Snippet]) -> "Document":
        """Join a sequence of snippets back up into a single document."""
        doc_ids = set()
        view_ids = set()
        answers = set()
        annotator_ids = set()
        session_ids = set()
        timestamps = set()
        for snippet in snippets:
            doc_ids.add(snippet.doc_id)
            view_ids.add(snippet._view_id)
            answers.add(snippet.answer)
            annotator_ids.add(snippet._annotator_id)
            session_ids.add(snippet._session_id)
            timestamps.add(snippet._timestamp)
        for attr in [doc_ids, view_ids]:
            # drop the "review" value (contributed by Prodigy's review flow)
            # before checking uniqueness
            if "review" in attr:
                attr.remove("review")
            if len(attr) != 1:
                raise ValueError(f"All snippets must share a single value, got: {attr}")
        # We don't use .sort here, as it's impolite to permute the input
        snippets = sorted(snippets, key=lambda s: s.index)
        doc = Doc.from_docs([s.doc for s in snippets])
        return cls(
            list(doc_ids)[0],
            doc,
            list(view_ids)[0],
            list(answers)[0],
            list(annotator_ids)[0],
            list(session_ids)[0],
            list(timestamps)[0],
        )
    def make_snippets(self) -> List[Snippet]:
        spans = []
        start = 0
        for token in self.doc:
            # this logic splits on blank lines, i.e. whitespace tokens that
            # start with a newline and span at least two characters ("\n\n");
            # substitute it with whatever snippet boundary fits your purpose
            if token.is_space and token.text[0] == "\n" and len(token) >= 2:
                spans.append(self.doc[start : token.i + 1])
                start = token.i + 1
        if start < len(self.doc):
            spans.append(self.doc[start:])
        return [
            Snippet(
                self.doc_id,
                i,
                doc=span.as_doc(),
                is_final=i == (len(spans) - 1),
                _view_id=self._view_id,
                answer=self.answer,
                _annotator_id=self._annotator_id,
                _session_id=self._session_id,
                _timestamp=self._timestamp,
            )
            for i, span in enumerate(spans)
        ]

    def to_prodigy(self) -> dict:
        return spacy2prodigy_ner_document(self)
def spacy2prodigy_ner_snippet(
    snippet: Snippet, *, source: Optional[str] = None
) -> dict:
    task: Dict[str, Any] = {
        "text": snippet.doc.text,
        "meta": {"article_id": snippet.doc_id, "part": snippet.index},
        "tokens": [get_token(t, t.i) for t in snippet.doc],
        "spans": [get_ent(ent, source=source) for ent in snippet.doc.ents],
        BINARY_ATTR: False,
        "_view_id": snippet._view_id,
        "answer": snippet.answer,
        "_timestamp": snippet._timestamp,
        "_annotator_id": snippet._annotator_id,
        "_session_id": snippet._session_id,
    }
    task = set_hashes(task)
    return task


def spacy2prodigy_ner_document(
    document: Document, *, source: Optional[str] = None
) -> dict:
    task: Dict[str, Any] = {
        "text": document.doc.text,
        "meta": {"article_id": document.doc_id},
        "tokens": [get_token(t, t.i) for t in document.doc],
        "spans": [get_ent(ent, source=source) for ent in document.doc.ents],
        BINARY_ATTR: False,
        "_view_id": document._view_id,
        "answer": document.answer,
        "_timestamp": document._timestamp,
        "_annotator_id": document._annotator_id,
        "_session_id": document._session_id,
    }
    task = set_hashes(task)
    return task
def prodigy2spacy_ner(vocab: Vocab, task: dict) -> Doc:
    """Convert Prodigy annotations to a spaCy doc."""
    tokens = task.get("tokens", [])
    words = [token["text"] for token in tokens]
    spaces = [token["ws"] for token in tokens]
    doc = Doc(vocab, words=words, spaces=spaces)
    spans = []
    for span in task.get("spans", []):
        spans.append(doc.char_span(span["start"], span["end"], span["label"]))
    doc.set_ents(spans)
    return doc


def get_ent(ent: Span, *, source: Optional[str]) -> dict:
    return {
        "token_start": ent.start,
        "token_end": ent.end - 1,
        "start": ent.start_char,
        "end": ent.end_char,
        "text": ent.text,
        "label": ent.label_,
        "source": source,
    }
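A minimal round-trip sketch of the utilities above, assuming util.py is importable and reusing the illustrative example_task from the top of this page:

import spacy

from util import Document

nlp = spacy.blank("en")
document = Document.from_prodigy(nlp, example_task)  # full article as a Document
snippets = document.make_snippets()                  # split on blank-line tokens
rebuilt = Document.from_snippets(snippets)           # join the snippets back up
assert rebuilt.doc.text == document.doc.text         # the text should survive the round trip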