shreyasrye/gold2spacy.py

## gold2spacy.py
import json


def gold2spacy(prodigy_file: str, empty_train_data: list):
    """
    Convert gold-standard annotations exported from Prodigy into spaCy's training format.

    The file is read as JSON Lines (one JSON object per line). For every record
    that was not rejected, a ``(text, {"entities": [(start, end, label), ...]})``
    tuple is appended to ``empty_train_data``.

    Args:
        prodigy_file (str): path of the JSONL file containing the annotated data from Prodigy
        empty_train_data (list): list (normally empty) that the spaCy-formatted examples
            are appended to; it is modified in place

    Returns:
        list: the same ``empty_train_data`` object, now holding the converted examples

    Raises:
        OSError: if ``prodigy_file`` cannot be opened
        json.JSONDecodeError: if a non-blank line is not valid JSON
    """

    # The context manager guarantees the file handle is closed, even if a line fails to parse.
    # JSON text is UTF-8, so decode explicitly instead of relying on the platform default.
    with open(prodigy_file, 'r', encoding="utf-8") as annotations:
        for line in annotations:

            # Tolerate blank lines (e.g. a trailing newline at the end of the file)
            if not line.strip():
                continue

            record = json.loads(line)

            # Skip the whole record unless the annotator accepted it. This is checked up
            # front so it does not depend on where "answer" sits among the record's keys.
            # Records without an "answer" key are kept.
            if record.get("answer", "accept") != "accept":
                continue

            # Build each triple from that span's own fields, so a span that lacks a field
            # gets None / "" rather than inheriting the value of the previous span.
            ent_list = [
                (span.get("start"), span.get("end"), span.get("label", ""))
                for span in record.get("spans") or []
            ]

            # spaCy's training format - text with its corresponding list of entities
            empty_train_data.append((record.get("text", ""), {"entities": ent_list}))

    # Return the list
    return empty_train_data

if __name__ == "__main__":
    # Convert the Prodigy annotations in "annotations.jsonl"; the resulting list
    # can be used to train/update a spaCy named entity recognizer.
    TRAIN_DATA = gold2spacy("annotations.jsonl", [])

    # Print every (text, {"entities": [...]}) tuple on its own line
    for example in TRAIN_DATA:
        print(example)

"""
Example Output:
 ('Who is Shaka Khan?', {'entities': [(7, 17, 'PERSON')]})
 ('I like London and Berlin.', {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]})
 .
 .
 .
"""
	import json


	def gold2spacy(prodigy_file: str, empty_train_data: list):
	    """
	    Convert gold-standard annotations exported from Prodigy into spaCy's training format.

	    The file is read as JSON Lines (one JSON object per line). For every record
	    that was not rejected, a ``(text, {"entities": [(start, end, label), ...]})``
	    tuple is appended to ``empty_train_data``.

	    Args:
	        prodigy_file (str): path of the JSONL file containing the annotated data from Prodigy
	        empty_train_data (list): list (normally empty) that the spaCy-formatted examples
	            are appended to; it is modified in place

	    Returns:
	        list: the same ``empty_train_data`` object, now holding the converted examples

	    Raises:
	        OSError: if ``prodigy_file`` cannot be opened
	        json.JSONDecodeError: if a non-blank line is not valid JSON
	    """
	    # The context manager guarantees the file handle is closed, even if a line fails to parse.
	    # JSON text is UTF-8, so decode explicitly instead of relying on the platform default.
	    with open(prodigy_file, 'r', encoding="utf-8") as annotations:
	        for line in annotations:
	            # Tolerate blank lines (e.g. a trailing newline at the end of the file)
	            if not line.strip():
	                continue
	            record = json.loads(line)
	            # Skip the whole record unless the annotator accepted it. This is checked up
	            # front so it does not depend on where "answer" sits among the record's keys.
	            # Records without an "answer" key are kept.
	            if record.get("answer", "accept") != "accept":
	                continue
	            # Build each triple from that span's own fields, so a span that lacks a field
	            # gets None / "" rather than inheriting the value of the previous span.
	            ent_list = [
	                (span.get("start"), span.get("end"), span.get("label", ""))
	                for span in record.get("spans") or []
	            ]
	            # spaCy's training format - text with its corresponding list of entities
	            empty_train_data.append((record.get("text", ""), {"entities": ent_list}))
	    # Return the list
	    return empty_train_data

	if __name__ == "__main__":
	    # Convert the Prodigy annotations in "annotations.jsonl"; the resulting list
	    # can be used to train/update a spaCy named entity recognizer.
	    TRAIN_DATA = gold2spacy("annotations.jsonl", [])
	    # Print every (text, {"entities": [...]}) tuple on its own line
	    for ent in TRAIN_DATA:
	        print(ent)

	"""
	Example Output:
	('Who is Shaka Khan?', {'entities': [(7, 17, 'PERSON')]})
	('I like London and Berlin.', {'entities': [(7, 13, 'LOC'), (18, 24, 'LOC')]})
	.
	.
	.
	"""