# konverner/spans_to_conll.py

## spans_to_conll.py
import json
from typing import Any, Dict, List

import spacy
from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
from tqdm import tqdm


def spans_to_conll(
        samples: List[Dict[str, Any]],
        output_path: str,
        spacy_model: str
    ) -> None:
    """
    Converts character-offset span annotations into a CoNLL-style IOB file.

    Every sample is processed with the given spaCy pipeline, its spans are
    mapped onto spaCy tokens, and one ``token<TAB>tag`` line is written per
    token. A blank line is written after each sample.

    Args:
        samples : data samples in the json format; each sample is a dict with
            a "text" string and a "spans" list whose items provide "start",
            "end" (character offsets into the text) and "label"
        output_path: path of the text file to write (opened in 'w' mode, so an
            existing file is overwritten)
        spacy_model: name or path of the spaCy pipeline to load, see
            https://spacy.io/usage/models

    Returns:
        None. The result is written to ``output_path``.
    """
    nlp = spacy.load(spacy_model)
    with open(output_path, 'w', encoding='utf-8') as f:
        for sample in tqdm(samples):

            # Process the text with spaCy
            doc = nlp(sample["text"])

            # Map every annotated character span onto spaCy tokens
            entities = []
            for span in sample["spans"]:
                spacy_span = doc.char_span(span["start"], span["end"], label=span["label"])
                if spacy_span is None:
                    # char_span gives None when the offsets cannot be mapped
                    # onto whole tokens; such annotations cannot be tagged.
                    print(f"Skipping entity {span['label']} in the '{sample['text'][span['start']:span['end']]}'")
                else:
                    entities.append(spacy_span)

            # Assigning overlapping or duplicated spans to doc.ents raises a
            # ValueError, so keep only a non-overlapping subset (filter_spans
            # prefers the longest span, then the earliest one).
            non_overlapping = spacy.util.filter_spans(entities)
            dropped = len(entities) - len(non_overlapping)
            if dropped:
                print(f"Skipping {dropped} overlapping entities in the sample")
            doc.ents = non_overlapping

            # BILUO tags derived from doc.ents, converted to the IOB scheme
            iob_tags = biluo_to_iob(doc_to_biluo_tags(doc))
            for token, iob_tag in zip(doc, iob_tags):
                # Whitespace tokens (e.g. "\n") would produce empty or broken
                # lines that CoNLL readers take for a sample separator.
                if token.is_space:
                    continue
                f.write(f"{token.text}\t{iob_tag}\n")
            f.write('\n')
    print(f"Samples are saved as {output_path}")

# Annotated dataset in JSON Lines format: one JSON document per line.
test_dataset_path = "../data/annotated_ner_dataset_2901.jsonl"

# Decode every line of the file into a Python object.
with open(test_dataset_path, mode='r', encoding='utf-8') as json_file:
    test_dataset = list(map(json.loads, json_file))

# Convert the loaded samples to a CoNLL file using the small French pipeline.
result = spans_to_conll(
    samples=test_dataset,
    output_path="annotated_ner_dataset_2901.txt",
    spacy_model="fr_core_news_sm",
)
	import json
	from typing import Any, Dict, List

	import spacy
	from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
	from tqdm import tqdm


	def spans_to_conll(
	        samples: List[Dict[str, Any]],
	        output_path: str,
	        spacy_model: str
	    ) -> None:
	    """
	    Converts character-offset span annotations into a CoNLL-style IOB file.

	    Every sample is processed with the given spaCy pipeline, its spans are
	    mapped onto spaCy tokens, and one ``token<TAB>tag`` line is written per
	    token. A blank line is written after each sample.

	    Args:
	        samples : data samples in the json format; each sample is a dict with
	            a "text" string and a "spans" list whose items provide "start",
	            "end" (character offsets into the text) and "label"
	        output_path: path of the text file to write (opened in 'w' mode, so an
	            existing file is overwritten)
	        spacy_model: name or path of the spaCy pipeline to load, see
	            https://spacy.io/usage/models

	    Returns:
	        None. The result is written to ``output_path``.
	    """
	    nlp = spacy.load(spacy_model)
	    with open(output_path, 'w', encoding='utf-8') as f:
	        for sample in tqdm(samples):

	            # Process the text with spaCy
	            doc = nlp(sample["text"])

	            # Map every annotated character span onto spaCy tokens
	            entities = []
	            for span in sample["spans"]:
	                spacy_span = doc.char_span(span["start"], span["end"], label=span["label"])
	                if spacy_span is None:
	                    # char_span gives None when the offsets cannot be mapped
	                    # onto whole tokens; such annotations cannot be tagged.
	                    print(f"Skipping entity {span['label']} in the '{sample['text'][span['start']:span['end']]}'")
	                else:
	                    entities.append(spacy_span)

	            # Assigning overlapping or duplicated spans to doc.ents raises a
	            # ValueError, so keep only a non-overlapping subset (filter_spans
	            # prefers the longest span, then the earliest one).
	            non_overlapping = spacy.util.filter_spans(entities)
	            dropped = len(entities) - len(non_overlapping)
	            if dropped:
	                print(f"Skipping {dropped} overlapping entities in the sample")
	            doc.ents = non_overlapping

	            # BILUO tags derived from doc.ents, converted to the IOB scheme
	            iob_tags = biluo_to_iob(doc_to_biluo_tags(doc))
	            for token, iob_tag in zip(doc, iob_tags):
	                # Whitespace tokens (e.g. "\n") would produce empty or broken
	                # lines that CoNLL readers take for a sample separator.
	                if token.is_space:
	                    continue
	                f.write(f"{token.text}\t{iob_tag}\n")
	            f.write('\n')
	    print(f"Samples are saved as {output_path}")

	# Annotated dataset in JSON Lines format: one JSON document per line.
	test_dataset_path = "../data/annotated_ner_dataset_2901.jsonl"

	# Decode every line of the file into a Python object.
	with open(test_dataset_path, mode='r', encoding='utf-8') as json_file:
	    test_dataset = list(map(json.loads, json_file))

	# Convert the loaded samples to a CoNLL file using the small French pipeline.
	result = spans_to_conll(
	    samples=test_dataset,
	    output_path="annotated_ner_dataset_2901.txt",
	    spacy_model="fr_core_news_sm",
	)