Skip to content

Instantly share code, notes, and snippets.

@shreyasrye
Last active January 2, 2021 19:09
Show Gist options
  • Save shreyasrye/5c28676f5ca76a42d15bc31934d304c6 to your computer and use it in GitHub Desktop.
Save shreyasrye/5c28676f5ca76a42d15bc31934d304c6 to your computer and use it in GitHub Desktop.
gold2spacy --> Create spacy-acceptable training data from prodigy-style annotated data (file)
import json
def gold2spacy(prodigy_file: str, empty_train_data: list) -> list:
    """Convert Prodigy-annotated JSONL data into spaCy's NER training format.

    Every non-blank line of ``prodigy_file`` is parsed as one JSON object.
    For each record that was not rejected by the annotator, a tuple of the
    form ``(text, {"entities": [(start, end, label), ...]})`` is appended to
    ``empty_train_data``.

    Args:
        prodigy_file (str): path of the JSONL file holding the annotated
            records (one JSON object per line).
        empty_train_data (list): list the formatted examples are appended to;
            it is modified in place and is normally passed in empty.

    Returns:
        list: the same list object that was passed as ``empty_train_data``.

    Raises:
        OSError: if ``prodigy_file`` cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # The context manager guarantees the file handle is closed, and the
    # explicit encoding keeps offsets/text independent of the OS locale.
    with open(prodigy_file, "r", encoding="utf-8") as annotations:
        for line in annotations:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue
            record = json.loads(line)
            # Skip the whole record when the annotator did not accept it.
            # Records without an "answer" key are kept.
            if record.get("answer", "accept") != "accept":
                continue
            # Build each entity from its own span only, so a span with a
            # missing key never inherits a value from the previous span.
            entities = [
                (span.get("start"), span.get("end"), span.get("label", ""))
                for span in record.get("spans") or []
            ]
            # spaCy's training format - text with its list of entities
            empty_train_data.append(
                (record.get("text", ""), {"entities": entities})
            )
    return empty_train_data
if __name__ == "__main__":
    # Build the training data; the resulting list can be used to
    # train/update a spaCy named entity recognizer
    TRAIN_DATA = gold2spacy("annotations.jsonl", [])
    # Print each (text, entity-dict) tuple on its own line
    for example in TRAIN_DATA:
        print(example)
"""
Example Output:
('Who is Shaka Khan?', {'entities': [(7, 17, 'PERSON')]})
('I like London and Berlin.', {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]})
.
.
.
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment