Utilities to split long Prodigy-annotated documents into snippets and back.
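The two scripts below are the merge and split halves of the same round trip, and both import the util module at the end. A sketch of the intended workflow, assuming the scripts are saved as make_snippets.py and merge_snippets.py and the annotations live in a Prodigy dataset named annotated_snippets (all three names are assumptions for illustration):

# 1. Split long articles into annotation-sized snippets:
#     python make_snippets.py articles.jsonl snippets.jsonl
# 2. Annotate the snippets, e.g. with ner.manual, then export the dataset:
#     prodigy db-out annotated_snippets > annotated_snippets.jsonl
# 3. Merge the annotated snippets back into full articles:
#     python merge_snippets.py annotated_snippets.jsonl articles_merged.jsonl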
from pathlib import Path
from collections import defaultdict

import srsly
import spacy
import typer

from util import Snippet, Document


def main(snippets_path: Path, articles_path: Path) -> None:
    """
    Merge annotated snippets back into articles.

    Args:
        snippets_path (Path): Path to the annotated snippets in JSONL format.
        articles_path (Path): Path to save the merged articles in JSONL format.
    """
    nlp = spacy.blank("en")
    articles_path.parent.mkdir(parents=True, exist_ok=True)
    dataset = srsly.read_jsonl(snippets_path)
    prodigy_articles = []
    # Group the snippets by the ID of the article they came from.
    doc_id_to_snippets = defaultdict(list)
    for example in dataset:
        snippet = Snippet.from_prodigy(nlp, example)
        doc_id_to_snippets[snippet.doc_id].append(snippet)
    # Reassemble each group of snippets into a single article.
    for snippets in doc_id_to_snippets.values():
        document = Document.from_snippets(snippets)
        prodigy_articles.append(document.to_prodigy())
    srsly.write_jsonl(articles_path, prodigy_articles)


if __name__ == "__main__":
    typer.run(main)
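For reference, here is a minimal example of the kind of snippet record this script consumes, hand-written to match the fields Snippet.from_prodigy reads below (the values are made up; real records exported from Prodigy carry additional keys such as _view_id and the task hashes):

{
    "text": "ACME Inc. was founded.",
    "meta": {"article_id": "doc-1", "part": 0},
    "tokens": [
        {"text": "ACME", "start": 0, "end": 4, "id": 0, "ws": True},
        {"text": "Inc.", "start": 5, "end": 9, "id": 1, "ws": True},
        {"text": "was", "start": 10, "end": 13, "id": 2, "ws": True},
        {"text": "founded", "start": 14, "end": 21, "id": 3, "ws": False},
        {"text": ".", "start": 21, "end": 22, "id": 4, "ws": False},
    ],
    "spans": [{"start": 0, "end": 9, "label": "ORG", "token_start": 0, "token_end": 1}],
    "answer": "accept",
}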
from pathlib import Path

import srsly
import spacy
import typer

from util import Document


def main(articles_path: Path, snippets_path: Path) -> None:
    """
    Split articles into snippets, respecting span annotations if available.

    Args:
        articles_path (Path): Path to the articles in JSONL format.
        snippets_path (Path): Path to save the Prodigy snippets.
    """
    nlp = spacy.blank("en")
    snippets_path.parent.mkdir(parents=True, exist_ok=True)
    dataset = srsly.read_jsonl(articles_path)
    prodigy_snippets = []
    for example in dataset:
        # Convert the Prodigy annotation into a spaCy doc.
        document = Document.from_prodigy(nlp, example)
        # Split into snippets based on custom logic.
        snippets = document.make_snippets()
        # Convert the snippets back to Prodigy's task format.
        prodigy_snippets.extend([snippet.to_prodigy() for snippet in snippets])
    srsly.write_jsonl(snippets_path, prodigy_snippets)


if __name__ == "__main__":
    typer.run(main)
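Both scripts expect their input to already be in Prodigy's task format, with a "tokens" list and an "article_id" in "meta", since Document.from_prodigy rebuilds the spaCy doc from those fields. A minimal sketch of producing such article records from raw text (the texts and output file name are made up for illustration):

import spacy
import srsly
from prodigy.components.preprocess import get_token

nlp = spacy.blank("en")
texts = {"doc-1": "ACME was founded in 1999.\n\nIt is based in Berlin."}
records = []
for article_id, text in texts.items():
    doc = nlp(text)
    records.append(
        {
            "text": text,
            "meta": {"article_id": article_id},
            "tokens": [get_token(t, t.i) for t in doc],
            "spans": [],  # pre-annotated spans with character offsets could go here
        }
    )
srsly.write_jsonl("articles.jsonl", records)

The util module that both scripts import follows.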
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

from spacy.tokens import Doc, Span
from spacy.language import Language
from spacy.vocab import Vocab

from prodigy.components.preprocess import get_token
from prodigy.util import BINARY_ATTR, set_hashes
@dataclass
class Snippet:
    """A snippet of a document to be used in the ner.manual interface."""

    doc_id: str
    index: int
    doc: Doc
    is_final: bool = False
    _view_id: Optional[str] = None
    answer: Optional[str] = None
    _annotator_id: Optional[str] = None
    _session_id: Optional[str] = None
    _timestamp: Optional[str] = None

    def __repr__(self) -> str:
        return f"Snippet({self.doc_id}, {self.index}): {self.doc.text}"

    @classmethod
    def from_prodigy(cls, nlp: Language, task: dict) -> "Snippet":
        doc = prodigy2spacy_ner(nlp.vocab, task)
        return cls(
            # This (or a similar) property should exist in the task's meta data.
            task["meta"]["article_id"],
            task["meta"]["part"],
            doc,
            _view_id=task.get("_view_id"),
            answer=task.get("answer"),
            _annotator_id=task.get("_annotator_id"),
            _session_id=task.get("_session_id"),
            _timestamp=task.get("_timestamp"),
        )

    def to_prodigy(self) -> dict:
        return spacy2prodigy_ner_snippet(self)
@dataclass
class Document:
    """Document that can be divided into snippets and put back together."""

    doc_id: str
    doc: Doc
    _view_id: Optional[str] = None
    answer: Optional[str] = None
    _annotator_id: Optional[str] = None
    _session_id: Optional[str] = None
    _timestamp: Optional[str] = None

    @classmethod
    def from_prodigy(cls, nlp: Language, task: dict) -> "Document":
        doc = prodigy2spacy_ner(nlp.vocab, task)
        return cls(
            # This (or a similar) property should exist in the task's meta data.
            task["meta"]["article_id"],
            doc,
            task.get("_view_id"),
            task.get("answer"),
            task.get("_annotator_id"),
            task.get("_session_id"),
            task.get("_timestamp"),
        )

    @classmethod
    def from_snippets(cls, snippets: List[Snippet]) -> "Document":
        """Join a sequence of snippets back up into a single document."""
        doc_ids = set()
        view_ids = set()
        answers = set()
        annotator_ids = set()
        session_ids = set()
        timestamps = set()
        for snippet in snippets:
            doc_ids.add(snippet.doc_id)
            view_ids.add(snippet._view_id)
            answers.add(snippet.answer)
            annotator_ids.add(snippet._annotator_id)
            session_ids.add(snippet._session_id)
            timestamps.add(snippet._timestamp)
        for attr in [doc_ids, view_ids]:
            # Ignore the "review" view ID set by Prodigy's review interface.
            if "review" in attr:
                attr.remove("review")
            if len(attr) != 1:
                raise ValueError(f"All snippets must have the same value, got: {attr}")
        # We don't just use .sort here, as it's impolite to permute the input.
        snippets = sorted(snippets, key=lambda s: s.index)
        doc = Doc.from_docs([s.doc for s in snippets])
        # For the remaining attributes, an arbitrary value is taken.
        return cls(
            list(doc_ids)[0],
            doc,
            list(view_ids)[0],
            list(answers)[0],
            list(annotator_ids)[0],
            list(session_ids)[0],
            list(timestamps)[0],
        )
    def make_snippets(self) -> List[Snippet]:
        spans = []
        start = 0
        for token in self.doc:
            # This logic splits on whitespace tokens that start with a newline
            # and span at least two characters, i.e. blank lines. Substitute it
            # with whatever logic splits the doc into snippets that fit your
            # purpose.
            if token.is_space and token.text[0] == "\n" and len(token) >= 2:
                spans.append(self.doc[start : token.i + 1])
                start = token.i + 1
        if start < len(self.doc):
            spans.append(self.doc[start:])
        return [
            Snippet(
                self.doc_id,
                i,
                doc=span.as_doc(),
                is_final=i == (len(spans) - 1),
                _view_id=self._view_id,
                answer=self.answer,
                _annotator_id=self._annotator_id,
                _session_id=self._session_id,
                _timestamp=self._timestamp,
            )
            for i, span in enumerate(spans)
        ]

    def to_prodigy(self) -> dict:
        return spacy2prodigy_ner_document(self)
def spacy2prodigy_ner_snippet(snippet: Snippet, *, source: Optional[str] = None) -> dict:
    task: Dict[str, Any] = {
        "text": snippet.doc.text,
        "meta": {"article_id": snippet.doc_id, "part": snippet.index},
        "tokens": [get_token(t, t.i) for t in snippet.doc],
        "spans": [get_ent(ent, source=source) for ent in snippet.doc.ents],
        BINARY_ATTR: False,
        "_view_id": snippet._view_id,
        "answer": snippet.answer,
        "_timestamp": snippet._timestamp,
        "_annotator_id": snippet._annotator_id,
        "_session_id": snippet._session_id,
    }
    task = set_hashes(task)
    return task


def spacy2prodigy_ner_document(document: Document, *, source: Optional[str] = None) -> dict:
    task: Dict[str, Any] = {
        "text": document.doc.text,
        "meta": {"article_id": document.doc_id},
        "tokens": [get_token(t, t.i) for t in document.doc],
        "spans": [get_ent(ent, source=source) for ent in document.doc.ents],
        BINARY_ATTR: False,
        "_view_id": document._view_id,
        "answer": document.answer,
        "_timestamp": document._timestamp,
        "_annotator_id": document._annotator_id,
        "_session_id": document._session_id,
    }
    task = set_hashes(task)
    return task
def prodigy2spacy_ner(vocab: Vocab, task: dict) -> Doc:
    """Convert Prodigy annotations to a spaCy doc."""
    tokens = task.get("tokens", [])
    words = [token["text"] for token in tokens]
    spaces = [token["ws"] for token in tokens]
    doc = Doc(vocab, words=words, spaces=spaces)
    spans = []
    for span in task.get("spans", []):
        # The character offsets must align with the token boundaries above.
        spans.append(doc.char_span(span["start"], span["end"], span["label"]))
    doc.set_ents(spans)
    return doc
def get_ent(ent: Span, *, source: Optional[str]) -> dict:
    return {
        "token_start": ent.start,
        "token_end": ent.end - 1,
        "start": ent.start_char,
        "end": ent.end_char,
        "text": ent.text,
        "label": ent.label_,
        "source": source,
    }
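Finally, a minimal round-trip sketch of the helpers above (the text and label are made up; it assumes spaCy and Prodigy are installed):

import spacy
from prodigy.components.preprocess import get_token
from util import Document

nlp = spacy.blank("en")
text = "ACME was founded in 1999.\n\nIt is based in Berlin."
doc = nlp(text)
# Build a task in the format prodigy2spacy_ner expects.
task = {
    "text": text,
    "meta": {"article_id": "doc-1"},
    "tokens": [get_token(t, t.i) for t in doc],
    "spans": [{"start": 0, "end": 4, "label": "ORG"}],
}
document = Document.from_prodigy(nlp, task)
snippets = document.make_snippets()  # splits on the blank line
assert len(snippets) == 2
merged = Document.from_snippets(snippets)
assert merged.doc.text == text
assert [ent.label_ for ent in merged.doc.ents] == ["ORG"]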