# This gist shows how to integrate spaCy with tagtog. The accompanying article, which explains the context of this code, is at https://tagtog.medium.com/integrating-tagtog-and-spacy-16fb0addeea1
import json
import os

import requests
import spacy
def get_class_id(label):
    """
    Translate a spaCy entity label into the matching tagtog entity type id.

    - label: spaCy label id (e.g. 'PERSON')

    Returns the tagtog entity type id, or None when the label has no
    entry in the mapping below.
    """
    # Only these three spaCy labels are mapped to tagtog entity types;
    # every other label yields None.
    choices = {'PERSON': 'e_1', 'ORG': 'e_2', 'MONEY': 'e_3'}
    return choices.get(label)
def get_entities(spans, pipeline):
    """
    Translate a tuple of named entity Span objects (https://spacy.io/api/span) into a
    list of tagtog entities (https://docs.tagtog.net/anndoc.html#ann-json). Each entity is
    defined by the entity type ID (classId), the part name where the annotation is (part),
    the entity offsets and the confidence (annotation status, who created it and probability).

    - spans: the named entities in the spaCy doc
    - pipeline: trained pipeline name

    Returns a list of dicts, one per span whose label is mapped by
    get_class_id; spans with an unmapped label are left out.
    """
    default_prob = 1
    default_part_id = 's1v1'
    default_state = 'pre-added'
    tagtog_entities = []
    for span in spans:
        class_id = get_class_id(span.label_)
        if class_id is None:
            # No tagtog entity type for this spaCy label: skip the span.
            continue
        tagtog_entities.append({
            'classId': class_id,
            'part': default_part_id,
            'offsets': [{'start': span.start_char, 'text': span.text}],
            # The pipeline name is recorded as the author of the annotation.
            'confidence': {
                'state': default_state,
                'who': ['ml:' + pipeline],
                'prob': default_prob,
            },
            'fields': {},
            # this is related to the kb_id (knowledge base ID) field from the Span spaCy object
            'normalizations': {},
        })
    return tagtog_entities
# Set the credentials at tagtog and project name.
# Both environment variables must be set; a missing one raises KeyError here.
MY_USERNAME = os.environ['MY_TAGTOG_USERNAME']
MY_PASSWORD = os.environ['MY_TAGTOG_PASSWORD']
MY_PROJECT = 'demo-spaCy'

# API authentication
tagtogAPIUrl = "https://www.tagtog.net/-api/documents/v1"
auth = requests.auth.HTTPBasicAuth(username=MY_USERNAME, password=MY_PASSWORD)

text = "Paypal Holdings Inc (PYPL) President and CEO Daniel Schulman Sold $2.7 million of Shares"

# Load the spaCy trained pipeline (https://spacy.io/models/en#en_core_web_sm) and apply it to text
pipeline = 'en_core_web_sm'
nlp = spacy.load(pipeline)
doc = nlp(text)

# Initialize ann.json (specification: https://docs.tagtog.net/anndoc.html#ann-json)
annjson = {}
# Set the document as not confirmed, an annotator will manually confirm whether the annotations are correct
annjson['anncomplete'] = False
annjson['metas'] = {}
annjson['relations'] = []
# Transform the spaCy entities into tagtog entities
annjson['entities'] = get_entities(doc.ents, pipeline)

# Parameters for the API call
# see https://docs.tagtog.net/API_documents_v1.html#examples-import-pre-annotated-plain-text-file
params = {'owner': MY_USERNAME, 'project': MY_PROJECT, 'output': 'null', 'format': 'default-plus-annjson'}

# Pre-annotated document composed of the content and the annotations
files = [('doc1.txt', text), ('doc1.ann.json', json.dumps(annjson))]

# POST request to send the pre-annotated document.
# The timeout (in seconds) keeps an unresponsive server from hanging the
# script forever; requests waits indefinitely when no timeout is given.
response = requests.post(tagtogAPIUrl, params=params, auth=auth, files=files, timeout=30)
print(response.text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment