Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
This gist shows how to easily integrate spaCy and tagtog. The accompanying article, which explains the context of this gist, is at https://tagtog.medium.com/integrating-tagtog-and-spacy-16fb0addeea1
import spacy
import json
import requests
import os
def get_class_id(label):
    """
    Translates the spaCy label id into the tagtog entity type id.

    - label: spaCy label id (the mapped ones are 'PERSON', 'ORG' and 'MONEY')

    Returns the tagtog entity type id ('e_1', 'e_2' or 'e_3'), or None when the
    label has no tagtog entity type mapped to it.
    """
    choices = {'PERSON': 'e_1', 'ORG': 'e_2', 'MONEY': 'e_3'}
    # dict.get already yields None for labels that are not in the mapping
    return choices.get(label)
def get_entities(spans, pipeline):
    """
    Translates a tuple of named entity Span objects (https://spacy.io/api/span) into a
    list of tagtog entities (https://docs.tagtog.net/anndoc.html#ann-json). Each entity is
    defined by the entity type ID (classId), the part name where the annotation is (part),
    the entity offsets and the confidence (annotation status, who created it and probability).

    - spans: the named entities in the spaCy doc
    - pipeline: trained pipeline name

    Returns a list with one dict per span whose label maps to a tagtog entity type;
    spans with an unmapped label are skipped. An empty input gives an empty list.
    """
    default_prob = 1
    default_part_id = 's1v1'
    default_state = 'pre-added'

    tagtog_entities = []
    for span in spans:
        class_id = get_class_id(span.label_)
        if class_id is None:
            # No tagtog entity type for this spaCy label: leave the span out
            continue
        tagtog_entities.append({
            'classId': class_id,
            'part': default_part_id,
            # Character offset of the span in the text plus the covered text
            'offsets': [{'start': span.start_char, 'text': span.text}],
            # The annotation is attributed to the pipeline as an 'ml:' member
            'confidence': {
                'state': default_state,
                'who': ['ml:' + pipeline],
                'prob': default_prob,
            },
            'fields': {},
            # this is related to the kb_id (knowledge base ID) field from the Span spaCy object
            'normalizations': {},
        })
    return tagtog_entities
# Set the credentials at tagtog and project name.
# Both credentials come from the environment; a missing variable raises KeyError.
MY_USERNAME = os.environ['MY_TAGTOG_USERNAME']
MY_PASSWORD = os.environ['MY_TAGTOG_PASSWORD']
MY_PROJECT = 'demo-spaCy'

# API authentication
tagtogAPIUrl = "https://www.tagtog.net/-api/documents/v1"
auth = requests.auth.HTTPBasicAuth(username=MY_USERNAME, password=MY_PASSWORD)

# Seconds to wait for the tagtog API; without a timeout requests can block forever
REQUEST_TIMEOUT_SECONDS = 30

text = "Paypal Holdings Inc (PYPL) President and CEO Daniel Schulman Sold $2.7 million of Shares"

# Load the spaCy trained pipeline (https://spacy.io/models/en#en_core_web_sm) and apply it to text
pipeline = 'en_core_web_sm'
nlp = spacy.load(pipeline)
doc = nlp(text)

# Initialize ann.json (specification: https://docs.tagtog.net/anndoc.html#ann-json)
annjson = {}
# Set the document as not confirmed, an annotator will manually confirm whether the annotations are correct
annjson['anncomplete'] = False
annjson['metas'] = {}
annjson['relations'] = []
# Transform the spaCy entities into tagtog entities
annjson['entities'] = get_entities(doc.ents, pipeline)

# Parameters for the API call
# see https://docs.tagtog.net/API_documents_v1.html#examples-import-pre-annotated-plain-text-file
params = {'owner': MY_USERNAME, 'project': MY_PROJECT, 'output': 'null', 'format': 'default-plus-annjson'}

# Pre-annotated document composed of the content and the annotations
files = [('doc1.txt', text), ('doc1.ann.json', json.dumps(annjson))]

# POST request to send the pre-annotated document
response = requests.post(
    tagtogAPIUrl,
    params=params,
    auth=auth,
    files=files,
    timeout=REQUEST_TIMEOUT_SECONDS,
)
print(response.text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment