Last active
February 5, 2024 12:47
-
-
Save konverner/fb9321df63449557b41fe4fa96300f16 to your computer and use it in GitHub Desktop.
Convert span-based NER annotations to the CoNLL BIO format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from typing import Any, Dict, List | |
import spacy | |
from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags | |
from tqdm import tqdm | |
def spans_to_conll(
    samples: List[Dict[str, Any]],
    output_path: str,
    spacy_model: str
):
    """
    Convert span-annotated NER samples to a CoNLL-style BIO file.

    Each sample is tokenized with spaCy, its character-offset spans are
    aligned to token boundaries, and one ``token<TAB>tag`` line is written
    per token. Samples are separated by an empty line.

    Spans that do not align with token boundaries are reported on stdout
    and skipped. When spans overlap, only the longest one is kept (the
    first one wins on ties), because spaCy refuses overlapping entities.
    Whitespace-only tokens are not written, since a token such as a
    newline would otherwise look like a sample separator in the output.

    Args:
        samples : data samples in the json format; each one is read as
            ``sample["text"]`` plus ``sample["spans"]``, a list of dicts
            with ``"start"``, ``"end"`` and ``"label"`` keys
        output_path: path of the UTF-8 text file to write (overwritten)
        spacy_model: name of the spaCy pipeline to load,
            see https://spacy.io/usage/models
    """
    nlp = spacy.load(spacy_model)

    with open(output_path, 'w', encoding='utf-8') as f:
        for sample in tqdm(samples):
            # Process the text with spaCy
            doc = nlp(sample["text"])

            # Align the character-offset annotations to spaCy tokens
            entities = []
            for span in sample["spans"]:
                spacy_span = doc.char_span(span["start"], span["end"], label=span["label"])
                if spacy_span is None:
                    print(f"Skipping entity {span['label']} in the '{sample['text'][span['start']:span['end']]}'")
                else:
                    entities.append(spacy_span)

            # Setting doc.ents raises ValueError on overlapping or duplicated
            # spans, so resolve conflicts first instead of aborting the export.
            doc.ents = spacy.util.filter_spans(entities)

            # Derive BIO tags from the entities now stored on the doc
            iob_tags = biluo_to_iob(doc_to_biluo_tags(doc))

            # True when a "B-" tag sat on a dropped whitespace token and has
            # to be moved onto the next written token of the same entity.
            promote_next = False
            for token, iob_tag in zip(doc, iob_tags):
                if token.is_space:
                    if iob_tag.startswith("B-"):
                        promote_next = True
                    continue
                if promote_next and iob_tag.startswith("I-"):
                    iob_tag = "B-" + iob_tag[2:]
                promote_next = False
                f.write(f"{token.text}\t{iob_tag}\n")

            # Empty line marks the end of the sample
            f.write('\n')

    print(f"Samples are saved as {output_path}")
# Input: a JSON Lines file, i.e. one JSON document per line.
test_dataset_path = "../data/annotated_ner_dataset_2901.jsonl"

with open(test_dataset_path, 'r', encoding='utf-8') as json_file:
    # Decode every line of the file into its own sample
    test_dataset = list(map(json.loads, json_file))

# Write the converted dataset, tokenizing with the small French spaCy pipeline.
result = spans_to_conll(
    test_dataset,
    "annotated_ner_dataset_2901.txt",
    "fr_core_news_sm",
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment