Skip to content

Instantly share code, notes, and snippets.

@lucafrost
Created November 2, 2023 09:57
Show Gist options
  • Save lucafrost/2e4d1e185bf467fe0dabb456c0791d5f to your computer and use it in GitHub Desktop.
Save lucafrost/2e4d1e185bf467fe0dabb456c0791d5f to your computer and use it in GitHub Desktop.
Entity Linking
class EntityLinkingNode:
    """
    EntityLinkingNode performs Named Entity Extraction (NER) and Entity
    Linking (EL) through DBpedia [1].

    This node replaces all entity mentions in the full-text of an article with
    the "universal" name of the entity to ensure entities are referenced in a
    consistent manner, and reduce issues with the entailment model.

    Notes
    -----
    **How it works**

    First, the DBpedia Spotlight API [2] is used to annotate the text for
    named entities and their respective knowledge graph URIs. Second, a SPARQL
    query is performed to fetch the "proper" name of each entity. Finally, the
    text is modified to replace instances of the original entity with its
    proper name to universalise references. Entities for which no name could
    be fetched keep ``dbp_text = None`` and their mentions are left untouched.

    **Internal Methods**

    1. self._annotate_entities()
        Calls the DBpedia Spotlight API to annotate entities in the full-text.
    2. self._build_sparql_payload()
        Creates a SPARQL query using the URIs of entities returned by the
        DBpedia Spotlight API.
    3. self._call_dbpedia()
        Executes the SPARQL query.
    4. self._extract_names()
        Parses the JSON output from the SPARQL endpoint.
    5. self._disambiguate_text()
        Replaces all original entity mentions with the proper names returned
        from the SPARQL query.

    References
    ----------
    [1] DBpedia : https://www.dbpedia.org/about/
    [2] DBpedia Spotlight API : https://www.dbpedia.org/resources/spotlight/
    """

    # Seconds to wait on either DBpedia service before giving up, so that a
    # stalled endpoint cannot block the caller indefinitely.
    REQUEST_TIMEOUT = 30

    # NOTE: annotations that name third-party / project types are quoted so
    # that they are not evaluated when the class is defined.
    def __init__(
        self,
        dbpedia_endpoint: Optional[
            str
        ] = "https://api.dbpedia-spotlight.org/en/annotate",
        sparql_endpoint: Optional[str] = "https://dbpedia.org/sparql",
        boto_session: "Optional[boto3.Session]" = None,
    ) -> None:
        """
        Initialise a new EntityLinkingNode object.

        Parameters
        ----------
        dbpedia_endpoint : str
            URL of a DBpedia Spotlight API. See notes.
        sparql_endpoint : str
            URL of a DBpedia SPARQL endpoint. See notes.
        boto_session : boto3.Session
            Boto3 session used for AWS authentication. This parameter is not
            used by this class, however, it was included to ensure uniformity
            across all APIs.

        Notes
        -----
        The DBpedia Spotlight and SPARQL endpoints can be self-hosted, and we
        may wish to host our own endpoints dependent on latency and other
        considerations. More information available here [1] [2].

        References
        ----------
        [1] Self-hosted Spotlight : https://www.dbpedia.org/resources/spotlight/
        [2] Self-hosted SPARQL API : https://www.dbpedia.org/resources/sparql/
        """
        self.endpoint = dbpedia_endpoint
        self.sparql_endpoint = sparql_endpoint
        # included to conform to API norms
        self.boto_session = boto_session

    def run(
        self,
        doc: "DocumentType",
        entity_confidence: Optional[float] = 0.8,
        target: Literal["default", "coref"] = "default",
    ) -> "DocumentType":
        """
        Perform entity linking and harmonization on a DocumentType.

        Harmonization involves replacing all entity mentions with their
        "proper" name located in DBpedia.

        Parameters
        ----------
        doc : DocumentType
            The article to perform entity linking (EL) on.
        entity_confidence : float
            Confidence threshold for the DBpedia Spotlight API for NER.
            Defaults to 0.8
        target : Literal["default", "coref"]
            Specifies which field to perform entity linking on. If "default",
            then doc.fulltext.content is used. If "coref", then
            doc.resolved_text is used.

        Returns
        -------
        DocumentType with .entities attribute populated.

        Raises
        ------
        ValueError
            If `target` is neither "default" nor "coref".
        """
        # [0.] HANDLING DIFFERENT TARGETS
        if target == "default":
            text = doc.fulltext.content
        elif target == "coref":
            text = doc.resolved_text
        else:
            raise ValueError(f"Target `{target}` is unsupported!")

        # [1.] MAIN LOGIC
        ents = self._annotate_entities(text=text, confidence=entity_confidence)
        if ents:
            # only query DBpedia when there is something to resolve
            query = self._build_sparql_payload(entities=ents)
            resp = self._call_dbpedia(query)
            ents = self._extract_names(dbpedia_resp=resp, entities=ents)
        disambig_text = self._disambiguate_text(text=text, entities=ents)
        doc.entities = LinkedEntities(resolved_text=disambig_text, entities=ents)
        return doc

    def _annotate_entities(
        self, text: str, confidence: Optional[float] = 0.8
    ) -> "List[EntityType]":
        """
        [INTERNAL] Performs Named Entity Recognition (NER) via the DBpedia
        Spotlight API. Spotlight returns the DBpedia URI of each entity
        discovered.

        Parameters
        ----------
        text : str
            Text to analyse.
        confidence : float
            Confidence threshold for entity extraction. Defaults to 0.8.

        Returns
        -------
        List[EntityType] : One EntityType per unique URI, in order of first
            appearance, holding the URI and the distinct surface forms
            (mentions) linked to it. ``dbp_text`` is left as None. The list is
            empty when Spotlight reports no entities.

        Raises
        ------
        requests.HTTPError
            If Spotlight answers with an error status code.
        """
        # [1.] CALL SPOTLIGHT
        # NOTE(review): the whole text travels as a GET query parameter; very
        # long articles may exceed URL length limits - confirm against the
        # endpoint in use.
        resp = requests.get(
            self.endpoint,
            params={"text": text, "confidence": str(confidence)},
            headers={"accept": "application/json"},
            timeout=self.REQUEST_TIMEOUT,
        )
        # fail with a clear HTTP error rather than a JSON decoding error
        resp.raise_for_status()
        # Spotlight may omit "Resources" when nothing is annotated, so a
        # missing key is treated as "no entities" instead of raising KeyError.
        resources = resp.json().get("Resources", [])

        # [2.] GROUP *UNIQUE* MENTIONS UNDER THEIR *UNIQUE* ENTITY URI
        # see: https://github.com/whispAI/whispy/pull/88#issuecomment-1722229202
        # dicts keep insertion order, so entities stay in first-seen order
        mentions_by_uri = {}
        for resource in resources:
            mentions = mentions_by_uri.setdefault(resource["@URI"], [])
            surface_form = resource["@surfaceForm"]
            if surface_form not in mentions:
                mentions.append(surface_form)

        return [
            EntityType(uri=uri, dbp_text=None, mentions=mentions)
            for uri, mentions in mentions_by_uri.items()
        ]

    def _build_sparql_payload(
        self, entities: "List[EntityType]", name_property: Optional[str] = "rdfs:label"
    ) -> str:
        """
        [INTERNAL] Creates a SPARQL payload to query a desired property (see:
        name_property param) for the URI of each EntityType.

        The query selects ``?uri`` alongside ``?name`` so that each returned
        name can be matched back to its entity by URI rather than by the
        position of the row in the result set.

        Parameters
        ----------
        entities : List[EntityType]
            List of identified entities.
        name_property : str
            Name of the DBpedia / KG property desired. Defaults to
            "rdfs:label" which is the standard entity name property.

        Returns
        -------
        str : SPARQL query
        """
        uri_terms = " ".join(f"<{entity.uri}>" for entity in entities)
        return (
            "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n"
            "SELECT ?uri ?name\n"
            "WHERE {\n"
            f"    VALUES ?uri {{ {uri_terms} }}\n"
            f"    ?uri {name_property} ?name .\n"
            '    FILTER(LANGMATCHES(LANG(?name), "en"))\n'
            "}"
        )

    def _call_dbpedia(self, sparql_query: str) -> dict:
        """
        [INTERNAL] Provides the SPARQL query to DBpedia and returns a JSON
        representation of the response.

        Parameters
        ----------
        sparql_query : str
            Generated SPARQL query.

        Returns
        -------
        dict : JSON response from DBpedia.

        Raises
        ------
        requests.HTTPError
            If the SPARQL endpoint answers with an error status code.
        """
        resp = requests.get(
            self.sparql_endpoint,
            params={"format": "json", "query": sparql_query},
            timeout=self.REQUEST_TIMEOUT,
        )
        resp.raise_for_status()
        return resp.json()

    def _extract_names(
        self, dbpedia_resp: dict, entities: "List[EntityType]"
    ) -> "List[EntityType]":
        """
        [INTERNAL] Processes the JSON response from the SPARQL query to extract
        the "proper" name for each entity. These names are then used to
        populate each EntityType.dbp_text value.

        Names are matched to entities by URI. When several names come back
        for one URI the first one is kept; entities with no returned name keep
        their current ``dbp_text``.

        Parameters
        ----------
        dbpedia_resp : dict
            JSON response from SPARQL query. Each binding is expected to carry
            both ``uri`` and ``name`` (as produced by the query built in
            ``_build_sparql_payload``).
        entities : List[EntityType]
            List of entities.

        Returns
        -------
        List[EntityType] : with .dbp_text value populated.
        """
        names_by_uri = {}
        for binding in dbpedia_resp["results"]["bindings"]:
            # first name returned for a URI wins
            names_by_uri.setdefault(binding["uri"]["value"], binding["name"]["value"])

        for entity in entities:
            if entity.uri in names_by_uri:
                entity.dbp_text = names_by_uri[entity.uri]
        return entities

    def _disambiguate_text(self, text: str, entities: "List[EntityType]") -> str:
        """
        [INTERNAL] Replaces entity mentions with their "proper" name from
        DBpedia.

        All mentions are substituted in a single pass over the text, trying
        longer mentions first, so that text which has already been replaced is
        never rewritten again (e.g. mentions "Obama" and "Barack Obama" both
        become "Barack Obama", never "Barack Barack Obama"). Entities whose
        ``dbp_text`` is None are skipped.

        Parameters
        ----------
        text : str
            Text to disambiguate.
        entities : List[EntityType]
            List of entities.

        Returns
        -------
        str : disambiguated text.
        """
        # local import: this file shows no top-level import block to extend
        import re

        replacements = {}
        for entity in entities:
            if entity.dbp_text is None:
                # no proper name was resolved; leave these mentions as-is
                continue
            for mention in entity.mentions:
                if mention:
                    # if two entities share a mention, the first entity wins
                    replacements.setdefault(mention, entity.dbp_text)

        if not replacements:
            return text

        # longest alternatives first so "Barack Obama" is preferred to "Obama"
        ordered_mentions = sorted(replacements, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(m) for m in ordered_mentions))
        # a function replacement avoids backslash/group expansion in names
        return pattern.sub(lambda match: replacements[match.group(0)], text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment