-
-
Save lucafrost/2e4d1e185bf467fe0dabb456c0791d5f to your computer and use it in GitHub Desktop.
Entity Linking
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class EntityLinkingNode:
    """
    EntityLinkingNode performs Named Entity Recognition (NER) and Entity
    Linking (EL) through DBpedia [1].

    This node replaces all entity mentions in the full-text of an article with
    the "universal" name of the entity to ensure entities are referenced in a
    consistent manner, and reduce issues with the entailment model.

    Notes
    -----
    **How it works**

    First, the DBpedia Spotlight API [2] is used to annotate the text for
    named entities and their respective knowledge graph URIs. Second, a SPARQL
    query is performed to fetch the "proper" name of each entity; the query
    returns the URI next to each name so names are matched to entities by URI
    rather than by result order. Finally, the text is modified in a single
    pass to replace instances of the original entity with its proper name to
    universalise references.

    **Internal Methods**

    1. self._annotate_entities()
        Calls the DBpedia Spotlight API to annotate entities in the full-text.
    2. self._build_sparql_payload()
        Creates a SPARQL query using the URIs of entities returned by the
        DBpedia Spotlight API.
    3. self._call_dbpedia()
        Executes the SPARQL query.
    4. self._extract_names()
        Parses the JSON output from the SPARQL endpoint.
    5. self._disambiguate_text()
        Replaces all original entity mentions with the proper names returned
        from the SPARQL query.

    References
    ----------
    [1] DBpedia : https://www.dbpedia.org/about/
    [2] DBpedia Spotlight API : https://www.dbpedia.org/resources/spotlight/
    """

    # Default per-request timeout in seconds; __init__ overrides it per instance.
    request_timeout: float = 30.0

    # Characters that may not appear inside a SPARQL IRIREF (``<...>``), in
    # addition to whitespace and control characters (code points <= 0x20).
    _IRI_FORBIDDEN = frozenset('<>"{}|^`\\')

    def __init__(
        self,
        dbpedia_endpoint: Optional[
            str
        ] = "https://api.dbpedia-spotlight.org/en/annotate",
        sparql_endpoint: Optional[str] = "https://dbpedia.org/sparql",
        boto_session: Optional[boto3.Session] = None,
        request_timeout: Optional[float] = 30.0,
    ) -> None:
        """
        Initialise a new EntityLinkingNode object.

        Parameters
        ----------
        dbpedia_endpoint : str
            URL of a DBpedia Spotlight API. See notes.
        sparql_endpoint : str
            URL of a DBpedia SPARQL endpoint. See notes.
        boto_session : boto3.Session
            Boto3 session used for AWS authentication. This parameter is not
            used by this class, however, it was included to ensure uniformity
            across all APIs.
        request_timeout : float
            Timeout, in seconds, applied to every HTTP request made by this
            node. Defaults to 30. Pass None to wait indefinitely.

        Notes
        -----
        The DBpedia Spotlight and SPARQL endpoints can be self-hosted, and we
        may wish to host our own endpoints dependent on latency and other
        considerations. More information available here [1] [2].

        References
        ----------
        [1] Self-hosted Spotlight : https://www.dbpedia.org/resources/spotlight/
        [2] Self-hosted SPARQL API : https://www.dbpedia.org/resources/sparql/
        """
        self.endpoint = dbpedia_endpoint
        self.sparql_endpoint = sparql_endpoint
        # included to conform to API norms
        self.boto_session = boto_session
        self.request_timeout = request_timeout

    def run(
        self,
        doc: DocumentType,
        entity_confidence: Optional[float] = 0.8,
        target: Literal["default", "coref"] = "default",
    ) -> DocumentType:
        """
        Perform entity linking and harmonization on a DocumentType.

        Harmonization involves replacing all entity mentions with their
        "proper" name located in DBpedia.

        Parameters
        ----------
        doc : DocumentType
            The article to perform entity linking (EL) on.
        entity_confidence : float
            Confidence threshold for the DBpedia Spotlight API for NER.
            Defaults to 0.8
        target : Literal["default", "coref"]
            Specifies which field to perform entity linking on. If "default",
            then doc.fulltext.content is used. If "coref", then
            doc.resolved_text is used.

        Returns
        -------
        DocumentType with .entities attribute populated.

        Raises
        ------
        ValueError
            If `target` is neither "default" nor "coref".
        """
        # [0.] HANDLING DIFFERENT TARGETS
        if target == "default":
            text = doc.fulltext.content
        elif target == "coref":
            text = doc.resolved_text
        else:
            raise ValueError(f"Target `{target}` is unsupported!")

        # [1.] MAIN LOGIC
        ents = self._annotate_entities(text=text, confidence=entity_confidence)
        if ents:
            # Only query the SPARQL endpoint when there is something to look up.
            query = self._build_sparql_payload(entities=ents)
            resp = self._call_dbpedia(query)
            ents = self._extract_names(dbpedia_resp=resp, entities=ents)
        disambig_text = self._disambiguate_text(text=text, entities=ents)
        doc.entities = LinkedEntities(resolved_text=disambig_text, entities=ents)
        return doc

    def _annotate_entities(
        self, text: str, confidence: Optional[float] = 0.8
    ) -> List[EntityType]:
        """
        [INTERNAL] Performs Named Entity Recognition (NER) via the DBpedia
        Spotlight API. Spotlight returns the DBpedia URI of each entity
        discovered.

        Parameters
        ----------
        text : str
            Text to analyse.
        confidence : float
            Confidence threshold for entity extraction. Defaults to 0.8.

        Returns
        -------
        List[EntityType] : List of EntityType objects, one per unique URI (in
            order of first appearance), each holding the URI and the distinct
            surface forms ("mentions") that Spotlight linked to it. The list
            is empty when Spotlight reports no resources.

        Raises
        ------
        requests.HTTPError
            If the Spotlight endpoint answers with an error status.
        """
        # [1.] CALL SPOTLIGHT
        # The text goes in the request body (form-encoded POST) rather than
        # the query string so full-length articles do not hit URL size limits.
        response = requests.post(
            self.endpoint,
            data={"text": text, "confidence": str(confidence)},
            headers={"accept": "application/json"},
            timeout=self.request_timeout,
        )
        response.raise_for_status()
        # "Resources" may be missing when nothing was annotated.
        resources = response.json().get("Resources") or []

        # [2.] GROUP MENTIONS BY ENTITY URI
        # see: https://github.com/whispAI/whispy/pull/88#issuecomment-1722229202
        # A dict keyed by URI keeps first-seen order and avoids rescanning the
        # entity list for every mention.
        by_uri = {}
        for resource in resources:
            uri = resource["@URI"]
            entity = by_uri.get(uri)
            if entity is None:
                entity = EntityType(uri=uri, dbp_text=None, mentions=[])
                by_uri[uri] = entity
            surface_form = resource["@surfaceForm"]
            if surface_form not in entity.mentions:
                entity.mentions.append(surface_form)
        return list(by_uri.values())

    @staticmethod
    def _is_safe_iri(uri: str) -> bool:
        """
        [INTERNAL] Return True when `uri` can be embedded between ``<`` and
        ``>`` in a SPARQL query without terminating or corrupting the IRI.
        """
        if not isinstance(uri, str) or not uri:
            return False
        forbidden = EntityLinkingNode._IRI_FORBIDDEN
        return not any(ch in forbidden or ch <= " " for ch in uri)

    def _build_sparql_payload(
        self, entities: List[EntityType], name_property: Optional[str] = "rdfs:label"
    ) -> str:
        """
        [INTERNAL] Creates a SPARQL payload to query a desired property (see:
        name_property param) for the URI of each EntityType.

        Parameters
        ----------
        entities : List[EntityType]
            List of identified entities.
        name_property : str
            Name of the DBpedia / KG property desired. Defaults to
            "rdfs:label" which is the standard entity name property.

        Returns
        -------
        str : SPARQL query selecting ``?uri`` and ``?name`` (English values
            only) for every entity URI. URIs containing characters that are
            not legal inside a SPARQL IRI are left out of the query.
        """
        uris = []
        for entity in entities:
            # URIs come from an external service; never splice an unchecked
            # value into the query text.
            if self._is_safe_iri(entity.uri) and entity.uri not in uris:
                uris.append(entity.uri)

        values = "\n".join(f"        <{uri}>" for uri in uris)
        # ?uri is returned alongside ?name so results can be matched back to
        # entities by URI instead of relying on result order.
        return (
            "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n"
            "SELECT ?uri ?name\n"
            "WHERE {\n"
            "    VALUES ?uri {\n"
            f"{values}\n"
            "    }\n"
            f"    ?uri {name_property} ?name .\n"
            '    FILTER(LANGMATCHES(LANG(?name), "en"))\n'
            "}"
        )

    def _call_dbpedia(self, sparql_query: str) -> dict:
        """
        [INTERNAL] Provides the SPARQL query to DBpedia and returns a JSON
        representation of the response.

        Parameters
        ----------
        sparql_query : str
            Generated SPARQL query.

        Returns
        -------
        dict : JSON response from DBpedia.

        Raises
        ------
        requests.HTTPError
            If the SPARQL endpoint answers with an error status.
        """
        response = requests.get(
            self.sparql_endpoint,
            params={"format": "json", "query": sparql_query},
            timeout=self.request_timeout,
        )
        response.raise_for_status()
        return response.json()

    def _extract_names(
        self, dbpedia_resp: dict, entities: List[EntityType]
    ) -> List[EntityType]:
        """
        [INTERNAL] Processes the JSON response from the SPARQL query to extract
        the "proper" name for each entity. These names are then used to
        populate each EntityType.dbp_text value.

        Parameters
        ----------
        dbpedia_resp : dict
            JSON response from SPARQL query.
        entities : List[EntityType]
            List of entities.

        Returns
        -------
        List[EntityType] : the same objects, with .dbp_text populated for
            every entity whose URI has a name in the response. Entities
            without a match keep their existing .dbp_text.
        """
        bindings = dbpedia_resp["results"]["bindings"]

        if any("uri" in binding for binding in bindings):
            # Match by URI: result order is not guaranteed and an entity may
            # have zero or several labels. The first label per URI wins.
            names = {}
            for binding in bindings:
                uri = binding.get("uri", {}).get("value")
                name = binding.get("name", {}).get("value")
                if uri is not None and name is not None:
                    names.setdefault(uri, name)
            for ent in entities:
                if ent.uri in names:
                    ent.dbp_text = names[ent.uri]
        else:
            # Legacy response shape (only ?name selected): fall back to
            # pairing entities and bindings by position.
            for ent, binding in zip(entities, bindings):
                ent.dbp_text = binding["name"]["value"]
        return entities

    def _disambiguate_text(self, text: str, entities: List[EntityType]) -> str:
        """
        [INTERNAL] Replaces entity mentions with their "proper" name from
        DBpedia in a single pass over the text.

        Mentions are matched on word boundaries, longest first, so a short
        mention is never rewritten inside a longer word or inside a longer
        mention, and text produced by one replacement is never rewritten by
        another. Entities without a proper name are left untouched.

        Parameters
        ----------
        text : str
            Text to disambiguate.
        entities : List[EntityType]
            List of entities.

        Returns
        -------
        str : disambiguated text.
        """
        import re  # local import: the file's import block is outside this class

        replacements = {}
        for entity in entities:
            if not entity.dbp_text:
                continue  # no proper name was found; nothing to substitute
            for mention in entity.mentions:
                if mention:
                    # first entity to claim a surface form keeps it
                    replacements.setdefault(mention, entity.dbp_text)
        if not replacements:
            return text

        # Shield occurrences of the proper names themselves so that e.g.
        # "Obama" is not expanded inside an existing "Barack Obama".
        for proper_name in list(replacements.values()):
            replacements.setdefault(proper_name, proper_name)

        longest_first = sorted(replacements, key=len, reverse=True)
        pattern = re.compile(
            r"(?<!\w)(?:"
            + "|".join(re.escape(surface) for surface in longest_first)
            + r")(?!\w)"
        )
        return pattern.sub(lambda match: replacements[match.group(0)], text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment