Skip to content

Instantly share code, notes, and snippets.

@thatbudakguy
Created January 6, 2024 23:21
Show Gist options
  • Save thatbudakguy/9abd74d89cbc1af7cc40a2a9bcad9712 to your computer and use it in GitHub Desktop.
Save thatbudakguy/9abd74d89cbc1af7cc40a2a9bcad9712 to your computer and use it in GitHub Desktop.
CoNLL-2002 and CoNLL-U generators (spaCy)
"""Auto-generate CoNLL-2002 (IOB) entities by tagging a text file."""
from pathlib import Path
from typing import Optional
from typing_extensions import Annotated
import spacy
from spacy.training import offsets_to_biluo_tags
from spacy.training import biluo_to_iob
import typer
def generate(
    input_path: Path,
    model_name: Annotated[Optional[str], typer.Argument()] = "en_core_web_lg"
) -> None:
    """Tag a text file with spaCy and print its entities in CoNLL-2002 format.

    The text is split into sentences by the loaded pipeline. Each token is
    written on its own line as ``<token> <IOB tag>`` and sentences are
    separated by a single blank line. The result is written to stdout.

    Args:
        input_path: Path to the plain-text file to tag.
        model_name: Name of the spaCy pipeline passed to ``spacy.load``.

    Raises:
        typer.BadParameter: If ``input_path`` is not an existing file.
    """
    # validate explicitly: an ``assert`` would be stripped under ``python -O``
    if not input_path.is_file():
        raise typer.BadParameter(f"not a file: {input_path}")
    # decode as UTF-8 instead of the platform's locale-dependent default
    input_text = input_path.read_text(encoding="utf-8").strip()

    # load the model and parse the text
    nlp = spacy.load(model_name)
    doc = nlp(input_text)

    # convert to conll-2002 (IOB) format, one chunk of lines per sentence
    sentence_chunks = []
    for sentence in doc.sents:
        # separate name so the parsed document is not shadowed mid-iteration
        sentence_doc = sentence.as_doc()
        ents = [
            (ent.start_char, ent.end_char, ent.label_)
            for ent in sentence_doc.ents
        ]
        iob_tags = biluo_to_iob(offsets_to_biluo_tags(sentence_doc, ents))
        # internal sanity check: exactly one tag per token of the sentence
        assert len(sentence) == len(iob_tags)
        # whitespace-only tokens (e.g. line breaks in the input) would break
        # the one-token-per-line layout, so they are left out of the output
        lines = [
            f"{token.text} {tag}"
            for token, tag in zip(sentence, iob_tags)
            if not token.is_space
        ]
        if lines:
            sentence_chunks.append("\n".join(lines))

    # write to stdout, with a blank line between sentences
    typer.echo("\n\n".join(sentence_chunks))
# Run the command-line interface only when this file is executed as a script;
# typer builds the CLI from the signature of ``generate``.
if __name__ == "__main__":
    typer.run(generate)
"""Auto-generate CoNLL-U output by tagging a text file."""
from pathlib import Path
from typing import Optional
from typing_extensions import Annotated
import spacy
import spacy_conll
import typer
def generate(
    input_path: Path,
    model_name: Annotated[Optional[str], typer.Argument()] = "en_core_web_lg",
) -> None:
    """Tag a text file with spaCy and print the result as CoNLL-U.

    The named pipeline is loaded, the ``conll_formatter`` component is
    appended to it with headers enabled, and the formatted string exposed as
    ``doc._.conll_str`` is written to stdout.

    Args:
        input_path: Path to the plain-text file to tag.
        model_name: Name of the spaCy pipeline passed to ``spacy.load``.

    Raises:
        typer.BadParameter: If ``input_path`` is not an existing file.
    """
    # validate explicitly: an ``assert`` would be stripped under ``python -O``
    if not input_path.is_file():
        raise typer.BadParameter(f"not a file: {input_path}")
    # decode as UTF-8 instead of the platform's locale-dependent default
    input_text = input_path.read_text(encoding="utf-8").strip()

    # load the model and add the output formatter as the final pipeline step
    # NOTE(review): the factory is presumably registered by importing
    # ``spacy_conll`` at the top of the file; confirm before removing it
    nlp = spacy.load(model_name)
    nlp.add_pipe("conll_formatter", last=True, config={"include_headers": True})

    # parse the doc and write to stdout
    doc = nlp(input_text)
    typer.echo(doc._.conll_str.strip())
# Run the command-line interface only when this file is executed as a script;
# typer builds the CLI from the signature of ``generate``.
if __name__ == "__main__":
    typer.run(generate)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment