Last active
May 25, 2021 09:09
-
-
Save tagtog-gists/05ca8cc4d2cb12b786aa5270814e65e4 to your computer and use it in GitHub Desktop.
This is a gist to explain how to easily integrate spaCy and tagtog. Find the article that goes through this gist's context at https://tagtog.medium.com/integrating-tagtog-and-spacy-16fb0addeea1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import spacy | |
import json | |
import requests | |
import os | |
def get_class_id(label):
    """
    Translates the spaCy label id into the tagtog entity type id.

    - label: spaCy label id (e.g. 'PERSON')

    Returns the tagtog entity type id, or None when the label has no
    mapping in the table below.
    """
    # spaCy label -> tagtog entity type id; labels not listed here are unmapped
    choices = {'PERSON': 'e_1', 'ORG': 'e_2', 'MONEY': 'e_3'}
    return choices.get(label)
def get_entities(spans, pipeline):
    """
    Translates a tuple of named entity Span objects (https://spacy.io/api/span) into a
    list of tagtog entities (https://docs.tagtog.net/anndoc.html#ann-json). Each entity is
    defined by the entity type ID (classId), the part name where the annotation is (part),
    the entity offsets and the confidence (annotation status, who created it and probability).

    - spans: the named entities in the spaCy doc
    - pipeline: trained pipeline name

    Returns a list of dicts, one per span whose label maps to a tagtog entity
    type; spans with an unmapped label are skipped.
    """
    default_prob = 1
    default_part_id = 's1v1'
    default_state = 'pre-added'

    tagtog_entities = []
    for span in spans:
        class_id = get_class_id(span.label_)
        if class_id is None:
            # No tagtog entity type for this spaCy label: leave it out.
            continue
        tagtog_entities.append({
            'classId': class_id,
            'part': default_part_id,
            'offsets': [{'start': span.start_char, 'text': span.text}],
            # 'who' records the pipeline name, prefixed with 'ml:'
            'confidence': {
                'state': default_state,
                'who': ['ml:' + pipeline],
                'prob': default_prob,
            },
            'fields': {},
            # this is related to the kb_id (knowledge base ID) field from the Span spaCy object
            'normalizations': {},
        })
    return tagtog_entities
# Set the credentials at tagtog and project name.
# Both environment variables must be set; a missing one raises KeyError here.
MY_USERNAME = os.environ['MY_TAGTOG_USERNAME']
MY_PASSWORD = os.environ['MY_TAGTOG_PASSWORD']
MY_PROJECT = 'demo-spaCy'

# API authentication
tagtogAPIUrl = "https://www.tagtog.net/-api/documents/v1"
auth = requests.auth.HTTPBasicAuth(username=MY_USERNAME, password=MY_PASSWORD)

text = "Paypal Holdings Inc (PYPL) President and CEO Daniel Schulman Sold $2.7 million of Shares"

# Load the spaCy trained pipeline (https://spacy.io/models/en#en_core_web_sm) and apply it to text
pipeline = 'en_core_web_sm'
nlp = spacy.load(pipeline)
doc = nlp(text)

# Initialize ann.json (specification: https://docs.tagtog.net/anndoc.html#ann-json)
annjson = {}
# Set the document as not confirmed, an annotator will manually confirm whether the annotations are correct
annjson['anncomplete'] = False
annjson['metas'] = {}
annjson['relations'] = []
# Transform the spaCy entities into tagtog entities
annjson['entities'] = get_entities(doc.ents, pipeline)

# Parameters for the API call
# see https://docs.tagtog.net/API_documents_v1.html#examples-import-pre-annotated-plain-text-file
params = {'owner': MY_USERNAME, 'project': MY_PROJECT, 'output': 'null', 'format': 'default-plus-annjson'}

# Pre-annotated document composed of the content and the annotations
files = [('doc1.txt', text), ('doc1.ann.json', json.dumps(annjson))]

# POST request to send the pre-annotated document.
# An explicit timeout keeps the script from hanging forever if the server
# never answers; requests applies no timeout by default.
response = requests.post(tagtogAPIUrl, params=params, auth=auth, files=files, timeout=60)
print(response.text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The fully-working repository is here 👉: https://github.com/tagtog/demo-webhooks