Skip to content

Instantly share code, notes, and snippets.

@konverner
Last active February 5, 2024 12:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save konverner/fb9321df63449557b41fe4fa96300f16 to your computer and use it in GitHub Desktop.
Save konverner/fb9321df63449557b41fe4fa96300f16 to your computer and use it in GitHub Desktop.
Convert span-based NER annotations to the CoNLL BIO format
import json
from typing import Any, Dict, List
import spacy
from spacy.training.iob_utils import biluo_to_iob, doc_to_biluo_tags
from tqdm import tqdm
def spans_to_conll(
    samples: List[Dict[str, Any]],
    output_path: str,
    spacy_model: str
):
    """
    Convert span-annotated NER samples to a CoNLL-style BIO file.

    Every sample is processed with the given spaCy pipeline, its character
    spans are aligned to the resulting tokens, and one ``token<TAB>tag`` line
    is written per token, followed by a blank line after each sample.

    Spans that cannot be aligned to token boundaries are skipped with a
    printed message. Spans overlapping another span are also skipped (the
    longest span wins, the earliest on ties), because ``doc.ents`` refuses
    overlapping entities and would otherwise abort the whole conversion.

    Args:
        samples : data samples in the json format; each sample has a "text"
            string and, optionally, a "spans" list whose items provide
            "start", "end" (character offsets into the text) and "label".
            A missing or null "spans" value means "no entities".
        output_path: path of the text file to write (opened in 'w' mode,
            so an existing file is overwritten).
        spacy_model: name of the spaCy pipeline to load,
            see https://spacy.io/usage/models
    """
    nlp = spacy.load(spacy_model)
    with open(output_path, 'w', encoding='utf-8') as f:
        for sample in tqdm(samples):
            text = sample["text"]
            # Process the text with spaCy
            doc = nlp(text)

            # Align every annotated character span with the spaCy tokens.
            entities = []
            for span in sample.get("spans") or []:
                spacy_span = doc.char_span(span["start"], span["end"], label=span["label"])
                if spacy_span is None:
                    # char_span returns None when the offsets do not fall on
                    # token boundaries.
                    print(f"Skipping entity {span['label']} in the '{text[span['start']:span['end']]}'")
                else:
                    entities.append(spacy_span)

            # Assigning overlapping spans to doc.ents raises a ValueError, so
            # keep only a non-overlapping subset and report what was dropped.
            kept = spacy.util.filter_spans(entities)
            kept_keys = {(ent.start, ent.end, ent.label_) for ent in kept}
            for ent in entities:
                if (ent.start, ent.end, ent.label_) not in kept_keys:
                    print(f"Skipping overlapping entity {ent.label_} in the '{ent.text}'")
            doc.ents = kept

            # Derive BILUO tags from doc.ents and convert them to IOB.
            # NOTE(review): whitespace-only tokens (e.g. "\n") are written
            # as-is and may break the line-based layout - confirm whether
            # they should be filtered out.
            for token, iob_tag in zip(doc, biluo_to_iob(doc_to_biluo_tags(doc))):
                f.write(f"{token.text}\t{iob_tag}\n")
            # Blank line separates consecutive samples.
            f.write('\n')
    print(f"Samples are saved as {output_path}")
# Location of the annotated samples, stored as JSON Lines (one object per line).
test_dataset_path = "../data/annotated_ner_dataset_2901.jsonl"

# Parse every line of the file into its own sample dictionary.
with open(test_dataset_path, 'r', encoding='utf-8') as json_file:
    test_dataset = list(map(json.loads, json_file))

# Convert the loaded samples and write them next to the script.
result = spans_to_conll(
    samples=test_dataset,
    output_path="annotated_ner_dataset_2901.txt",
    spacy_model="fr_core_news_sm",
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment