Last active
January 2, 2021 19:09
-
-
Save shreyasrye/5c28676f5ca76a42d15bc31934d304c6 to your computer and use it in GitHub Desktop.
gold2spacy --> Create spacy-acceptable training data from prodigy-style annotated data (file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
def gold2spacy(prodigy_file: str, empty_train_data: list):
    """
    Convert Prodigy-annotated JSONL data into spaCy's NER training format.

    Each non-blank line of ``prodigy_file`` is parsed as one JSON object.
    Records whose "answer" field is present and not equal to "accept" are
    skipped entirely; every other record becomes one
    ``(text, {"entities": [(start, end, label), ...]})`` tuple.

    Args:
        prodigy_file (str): path of the JSONL file holding the annotations
            (one JSON object per line).
        empty_train_data (list): list to which the spaCy-formatted examples
            are appended; it is mutated in place.

    Returns:
        list: the same list object that was passed as ``empty_train_data``,
        now extended with the converted examples.

    Raises:
        OSError: if ``prodigy_file`` cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # The context manager closes the file even if a line fails to parse.
    # UTF-8 is requested explicitly so the character offsets in "spans" line
    # up with the decoded text regardless of the platform default encoding.
    with open(prodigy_file, 'r', encoding="utf-8") as annotations:
        for line in annotations:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            record = json.loads(line)

            # Skip the whole record if the annotator did not accept it.
            # This check must happen at record level: a `continue` inside a
            # loop over the record's keys would only skip to the next key.
            if "answer" in record and record["answer"] != "accept":
                continue

            # Build each entity triple from its own span only, so a span with
            # a missing field never inherits a value from the previous span.
            # `or []` also covers records where "spans" is explicitly null.
            ent_list = [
                (span.get("start"), span.get("end"), span.get("label", ""))
                for span in (record.get("spans") or [])
            ]

            # spaCy's training format: text with its list of entities.
            empty_train_data.append(
                (record.get("text", ""), {"entities": ent_list})
            )

    return empty_train_data
if __name__ == "__main__":
    # Convert the Prodigy export into a list that can be used to train or
    # update a spaCy named entity recognizer.
    TRAIN_DATA = gold2spacy(
        prodigy_file="annotations.jsonl",
        empty_train_data=[],
    )

    # Print each (text, entity-dict) tuple on its own line.
    for example in TRAIN_DATA:
        print(example)

    # Example Output:
    # ('Who is Shaka Khan?', {'entities': [(7, 17, 'PERSON')]})
    # ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]})
    # .
    # .
    # .
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment