# lucafrost/entity_linking.py Secret

## entity_linking.py
class EntityLinkingNode:
    """
    EntityLinkingNode performs Named Entity Recognition (NER) and Entity
    Linking (EL) through DBpedia [1].

    This node replaces all entity mentions in the full-text of an article with
    the "universal" name of the entity to ensure entities are referenced in a
    consistent manner, and reduce issues with the entailment model.

    Notes
    -----
    **How it works**

    First, the DBpedia Spotlight API [2] is used to annotate the text for
    named entities and their respective knowledge graph URIs. Second, a SPARQL
    query is performed to fetch the "proper" name of each entity. Finally, the
    text is modified to replace instances of the original entity with its
    proper name to universalise references.

    **Internal Methods**

    1. self._annotate_entities()
        Calls the DBpedia Spotlight API to annotate entities in the full-text.
    2. self._build_sparql_payload()
        Creates a SPARQL query using the URIs of entities returned by the
        DBpedia Spotlight API.
    3. self._call_dbpedia()
        Executes the SPARQL query.
    4. self._extract_names()
        Parses the JSON output from the SPARQL endpoint and matches each name
        to its entity by URI.
    5. self._disambiguate_text()
        Replaces all original entity mentions with the proper names returned
        from the SPARQL query.


    References
    ----------
    [1] DBpedia : https://www.dbpedia.org/about/
    [2] DBpedia Spotlight API : https://www.dbpedia.org/resources/spotlight/
    """

    def __init__(
        self,
        dbpedia_endpoint: Optional[
            str
        ] = "https://api.dbpedia-spotlight.org/en/annotate",
        sparql_endpoint: Optional[str] = "https://dbpedia.org/sparql",
        boto_session: Optional[boto3.Session] = None,
    ) -> None:
        """
        Initialise a new EntityLinkingNode object.

        Parameters
        ----------
        dbpedia_endpoint : str
            URL of a DBpedia Spotlight API. See notes.
        sparql_endpoint : str
            URL of a DBpedia SPARQL endpoint. See notes.
        boto_session : boto3.Session
            Boto3 session used for AWS authentication. This parameter is not
            used by this class, however, it was included to ensure uniformity
            across all APIs.

        Notes
        -----
        The DBpedia Spotlight and SPARQL endpoints can be self-hosted, and we
        may wish to host our own endpoints dependent on latency and other
        considerations. More information available here [1] [2].

        References
        ----------
        [1] Self-hosted Spotlight : https://www.dbpedia.org/resources/spotlight/
        [2] Self-hosted SPARQL API : https://www.dbpedia.org/resources/sparql/
        """
        self.endpoint = dbpedia_endpoint
        self.sparql_endpoint = sparql_endpoint
        # included to conform to API norms
        self.boto_session = boto_session

    def run(
        self,
        doc: DocumentType,
        entity_confidence: Optional[float] = 0.8,
        target: Literal["default", "coref"] = "default",
    ) -> DocumentType:
        """
        Perform entity linking and harmonization on a DocumentType.

        Harmonization involves replacing all entity mentions with their
        "proper" name located in DBpedia.

        Parameters
        ----------
        doc : DocumentType
            The article to perform entity linking (EL) on.
        entity_confidence : float
            Confidence threshold for the DBpedia Spotlight API for NER.
            Defaults to 0.8
        target : Literal["default", "coref"]
            Specifies which field to perform entity linking on. If "default",
            then doc.fulltext.content is used. If "coref", then
            doc.resolved_text is used.

        Returns
        -------
        DocumentType with .entities attribute populated.

        Raises
        ------
        ValueError
            If `target` is neither "default" nor "coref".
        """
        # [0.] HANDLING DIFFERENT TARGETS
        if target == "default":
            text = doc.fulltext.content
        elif target == "coref":
            text = doc.resolved_text
        else:
            raise ValueError(f"Target `{target}` is unsupported!")
        # [1.] MAIN LOGIC
        ents = self._annotate_entities(text=text, confidence=entity_confidence)
        if ents:
            # Only query the SPARQL endpoint when there is something to look up.
            query = self._build_sparql_payload(entities=ents)
            resp = self._call_dbpedia(query)
            ents = self._extract_names(dbpedia_resp=resp, entities=ents)
        disambig_text = self._disambiguate_text(text=text, entities=ents)
        doc.entities = LinkedEntities(resolved_text=disambig_text, entities=ents)
        return doc

    def _annotate_entities(
        self, text: str, confidence: Optional[float] = 0.8
    ) -> List[EntityType]:
        """
        [INTERNAL] Performs Named Entity Recognition (NER) via the DBpedia
        Spotlight API. Spotlight returns the DBpedia URI of each entity
        discovered.

        Parameters
        ----------
        text : str
            Text to analyse.
        confidence : float
            Confidence threshold for entity extraction. Defaults to 0.8.

        Returns
        -------
        List[EntityType] : One EntityType per distinct URI, in order of first
        appearance, with .uri set, .dbp_text left as None and .mentions
        holding the distinct surface forms found for that URI. The list is
        empty when the response contains no "Resources".

        Raises
        ------
        requests.HTTPError
            If the Spotlight endpoint answers with an error status.
        """
        # [1.] CALL SPOTLIGHT
        # NOTE(review): the text travels in the query string; very long
        # articles may exceed URL length limits -- consider POST if that occurs.
        http_resp = requests.get(
            self.endpoint,
            params={"text": text, "confidence": str(confidence)},
            headers={"accept": "application/json"},
        )
        http_resp.raise_for_status()
        resp = http_resp.json()
        # [2.] GROUP MENTIONS BY ENTITY URI
        # see: https://github.com/whispAI/whispy/pull/88#issuecomment-1722229202
        # A missing "Resources" key is treated as "no entities found".
        by_uri = {}
        for resource in resp.get("Resources", []):
            uri = resource["@URI"]
            ent = by_uri.get(uri)
            if ent is None:
                ent = EntityType(uri=uri, dbp_text=None, mentions=[])
                by_uri[uri] = ent
            surface_form = resource["@surfaceForm"]
            if surface_form not in ent.mentions:
                ent.mentions.append(surface_form)
        return list(by_uri.values())

    def _build_sparql_payload(
        self, entities: List[EntityType], name_property: Optional[str] = "rdfs:label"
    ) -> str:
        """
        [INTERNAL] Creates a SPARQL payload to query a desired property (see:
        name_property param) for the URI of each EntityType.

        The query selects both ?uri and ?name so that every returned name can
        be matched back to its entity; result rows carry no guaranteed order
        and an entity may yield zero or several rows.

        Parameters
        ----------
        entities : List[EntityType]
            List of identified entities.
        name_property : str
            Name of the DBpedia / KG property desired. Defaults to
            "rdfs:label" which is the standard entity name property.

        Returns
        -------
        str : SPARQL query
        """
        uri_terms = " ".join(f"<{entity.uri}>" for entity in entities)
        return "\n".join(
            [
                "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>",
                "SELECT ?uri ?name",
                "WHERE {",
                f"    VALUES ?uri {{ {uri_terms} }}",
                f"    ?uri {name_property} ?name .",
                '    FILTER(LANGMATCHES(LANG(?name), "en"))',
                "}",
            ]
        )

    def _call_dbpedia(self, sparql_query: str) -> dict:
        """
        [INTERNAL] Provides the SPARQL query to DBpedia and returns a JSON
        representation of the response.

        Parameters
        ----------
        sparql_query : str
            Generated SPARQL query.

        Returns
        -------
        dict : JSON response from DBpedia.

        Raises
        ------
        requests.HTTPError
            If the SPARQL endpoint answers with an error status.
        """
        http_resp = requests.get(
            self.sparql_endpoint, params={"format": "json", "query": sparql_query}
        )
        http_resp.raise_for_status()
        return http_resp.json()

    def _extract_names(
        self, dbpedia_resp: dict, entities: List[EntityType]
    ) -> List[EntityType]:
        """
        [INTERNAL] Processes the JSON response from the SPARQL query to extract
        the "proper" name for each entity. These names are then used to
        populate each EntityType.dbp_text value.

        Names are matched to entities through the ?uri binding. When an entity
        has several names the first one returned is kept; entities without a
        name keep their current .dbp_text. If no row carries a ?uri binding
        (legacy query shape that selects only ?name), rows are paired with
        entities by position instead.

        Parameters
        ----------
        dbpedia_resp : dict
            JSON response from SPARQL query.
        entities : List[EntityType]
            List of entities.

        Returns
        -------
        List[EntityType] : with .dbp_text value populated.
        """
        bindings = dbpedia_resp["results"]["bindings"]
        names_by_uri = {}
        for row in bindings:
            if "uri" in row and "name" in row:
                names_by_uri.setdefault(row["uri"]["value"], row["name"]["value"])
        if names_by_uri:
            for ent in entities:
                if ent.uri in names_by_uri:
                    ent.dbp_text = names_by_uri[ent.uri]
            return entities
        # Legacy fallback: no ?uri column available, pair rows positionally.
        for ent, row in zip(entities, bindings):
            if "name" in row:
                ent.dbp_text = row["name"]["value"]
        return entities

    def _disambiguate_text(self, text: str, entities: List[EntityType]) -> str:
        """
        [INTERNAL] Replaces entity mentions with their "proper" name from
        DBpedia in a single pass over the text.

        A single pass avoids re-replacing text that was just inserted (e.g.
        "Obama" inside a freshly written "Barack Obama"). Longer mentions are
        tried first, and a mention only matches when it is not embedded in a
        longer word. Entities whose .dbp_text is None are left untouched. If
        two entities share a mention, the first entity's name is used.

        Parameters
        ----------
        text : str
            Text to disambiguate.
        entities : List[EntityType]
            List of entities.

        Returns
        -------
        str : disambiguated text.
        """
        import re

        replacements = {}
        for entity in entities:
            if entity.dbp_text is None:
                continue  # no proper name was found for this entity
            for mention in entity.mentions:
                if mention:
                    replacements.setdefault(mention, entity.dbp_text)
        if not replacements:
            return text
        # Longest-first so "Barack Obama" wins over "Obama" at the same spot.
        ordered = sorted(replacements, key=len, reverse=True)
        pattern = re.compile(
            "|".join(r"(?<!\w)" + re.escape(m) + r"(?!\w)" for m in ordered)
        )
        return pattern.sub(lambda match: replacements[match.group(0)], text)
	class EntityLinkingNode:
		"""
		EntityLinkingNode performs Named Entity Recognition (NER) and Entity
		Linking (EL) through DBpedia [1].

		This node replaces all entity mentions in the full-text of an article with
		the "universal" name of the entity to ensure entities are referenced in a
		consistent manner, and reduce issues with the entailment model.

		Notes
		-----
		How it works

		First, the DBpedia Spotlight API [2] is used to annotate the text for
		named entities and their respective knowledge graph URIs. Second, a SPARQL
		query is performed to fetch the "proper" name of each entity. Finally, the
		text is modified to replace instances of the original entity with its
		proper name to universalise references.

		Internal Methods

		1. self._annotate_entities()
			Calls the DBpedia Spotlight API to annotate entities in the full-text.
		2. self._build_sparql_payload()
			Creates a SPARQL query using the URIs of entities returned by the
			DBpedia Spotlight API.
		3. self._call_dbpedia()
			Executes the SPARQL query.
		4. self._extract_names()
			Parses the JSON output from the SPARQL endpoint and matches each name
			to its entity by URI.
		5. self._disambiguate_text()
			Replaces all original entity mentions with the proper names returned
			from the SPARQL query.


		References
		----------
		[1] DBpedia : https://www.dbpedia.org/about/
		[2] DBpedia Spotlight API : https://www.dbpedia.org/resources/spotlight/
		"""

		def __init__(
			self,
			dbpedia_endpoint: Optional[
				str
			] = "https://api.dbpedia-spotlight.org/en/annotate",
			sparql_endpoint: Optional[str] = "https://dbpedia.org/sparql",
			boto_session: Optional[boto3.Session] = None,
		) -> None:
			"""
			Initialise a new EntityLinkingNode object.

			Parameters
			----------
			dbpedia_endpoint : str
				URL of a DBpedia Spotlight API. See notes.
			sparql_endpoint : str
				URL of a DBpedia SPARQL endpoint. See notes.
			boto_session : boto3.Session
				Boto3 session used for AWS authentication. This parameter is not
				used by this class, however, it was included to ensure uniformity
				across all APIs.

			Notes
			-----
			The DBpedia Spotlight and SPARQL endpoints can be self-hosted, and we
			may wish to host our own endpoints dependent on latency and other
			considerations. More information available here [1] [2].

			References
			----------
			[1] Self-hosted Spotlight : https://www.dbpedia.org/resources/spotlight/
			[2] Self-hosted SPARQL API : https://www.dbpedia.org/resources/sparql/
			"""
			self.endpoint = dbpedia_endpoint
			self.sparql_endpoint = sparql_endpoint
			# included to conform to API norms
			self.boto_session = boto_session

		def run(
			self,
			doc: DocumentType,
			entity_confidence: Optional[float] = 0.8,
			target: Literal["default", "coref"] = "default",
		) -> DocumentType:
			"""
			Perform entity linking and harmonization on a DocumentType.

			Harmonization involves replacing all entity mentions with their
			"proper" name located in DBpedia.

			Parameters
			----------
			doc : DocumentType
				The article to perform entity linking (EL) on.
			entity_confidence : float
				Confidence threshold for the DBpedia Spotlight API for NER.
				Defaults to 0.8
			target : Literal["default", "coref"]
				Specifies which field to perform entity linking on. If "default",
				then doc.fulltext.content is used. If "coref", then
				doc.resolved_text is used.

			Returns
			-------
			DocumentType with .entities attribute populated.

			Raises
			------
			ValueError
				If `target` is neither "default" nor "coref".
			"""
			# [0.] HANDLING DIFFERENT TARGETS
			if target == "default":
				text = doc.fulltext.content
			elif target == "coref":
				text = doc.resolved_text
			else:
				raise ValueError(f"Target `{target}` is unsupported!")
			# [1.] MAIN LOGIC
			ents = self._annotate_entities(text=text, confidence=entity_confidence)
			if ents:
				# Only query the SPARQL endpoint when there is something to look up.
				query = self._build_sparql_payload(entities=ents)
				resp = self._call_dbpedia(query)
				ents = self._extract_names(dbpedia_resp=resp, entities=ents)
			disambig_text = self._disambiguate_text(text=text, entities=ents)
			doc.entities = LinkedEntities(resolved_text=disambig_text, entities=ents)
			return doc

		def _annotate_entities(
			self, text: str, confidence: Optional[float] = 0.8
		) -> List[EntityType]:
			"""
			[INTERNAL] Performs Named Entity Recognition (NER) via the DBpedia
			Spotlight API. Spotlight returns the DBpedia URI of each entity
			discovered.

			Parameters
			----------
			text : str
				Text to analyse.
			confidence : float
				Confidence threshold for entity extraction. Defaults to 0.8.

			Returns
			-------
			List[EntityType] : One EntityType per distinct URI, in order of first
			appearance, with .uri set, .dbp_text left as None and .mentions
			holding the distinct surface forms found for that URI. The list is
			empty when the response contains no "Resources".

			Raises
			------
			requests.HTTPError
				If the Spotlight endpoint answers with an error status.
			"""
			# [1.] CALL SPOTLIGHT
			# NOTE(review): the text travels in the query string; very long
			# articles may exceed URL length limits -- consider POST if that occurs.
			http_resp = requests.get(
				self.endpoint,
				params={"text": text, "confidence": str(confidence)},
				headers={"accept": "application/json"},
			)
			http_resp.raise_for_status()
			resp = http_resp.json()
			# [2.] GROUP MENTIONS BY ENTITY URI
			# see: https://github.com/whispAI/whispy/pull/88#issuecomment-1722229202
			# A missing "Resources" key is treated as "no entities found".
			by_uri = {}
			for resource in resp.get("Resources", []):
				uri = resource["@URI"]
				ent = by_uri.get(uri)
				if ent is None:
					ent = EntityType(uri=uri, dbp_text=None, mentions=[])
					by_uri[uri] = ent
				surface_form = resource["@surfaceForm"]
				if surface_form not in ent.mentions:
					ent.mentions.append(surface_form)
			return list(by_uri.values())

		def _build_sparql_payload(
			self, entities: List[EntityType], name_property: Optional[str] = "rdfs:label"
		) -> str:
			"""
			[INTERNAL] Creates a SPARQL payload to query a desired property (see:
			name_property param) for the URI of each EntityType.

			The query selects both ?uri and ?name so that every returned name can
			be matched back to its entity; result rows carry no guaranteed order
			and an entity may yield zero or several rows.

			Parameters
			----------
			entities : List[EntityType]
				List of identified entities.
			name_property : str
				Name of the DBpedia / KG property desired. Defaults to
				"rdfs:label" which is the standard entity name property.

			Returns
			-------
			str : SPARQL query
			"""
			uri_terms = " ".join(f"<{entity.uri}>" for entity in entities)
			return "\n".join(
				[
					"PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>",
					"SELECT ?uri ?name",
					"WHERE {",
					f"    VALUES ?uri {{ {uri_terms} }}",
					f"    ?uri {name_property} ?name .",
					'    FILTER(LANGMATCHES(LANG(?name), "en"))',
					"}",
				]
			)

		def _call_dbpedia(self, sparql_query: str) -> dict:
			"""
			[INTERNAL] Provides the SPARQL query to DBpedia and returns a JSON
			representation of the response.

			Parameters
			----------
			sparql_query : str
				Generated SPARQL query.

			Returns
			-------
			dict : JSON response from DBpedia.

			Raises
			------
			requests.HTTPError
				If the SPARQL endpoint answers with an error status.
			"""
			http_resp = requests.get(
				self.sparql_endpoint, params={"format": "json", "query": sparql_query}
			)
			http_resp.raise_for_status()
			return http_resp.json()

		def _extract_names(
			self, dbpedia_resp: dict, entities: List[EntityType]
		) -> List[EntityType]:
			"""
			[INTERNAL] Processes the JSON response from the SPARQL query to extract
			the "proper" name for each entity. These names are then used to
			populate each EntityType.dbp_text value.

			Names are matched to entities through the ?uri binding. When an entity
			has several names the first one returned is kept; entities without a
			name keep their current .dbp_text. If no row carries a ?uri binding
			(legacy query shape that selects only ?name), rows are paired with
			entities by position instead.

			Parameters
			----------
			dbpedia_resp : dict
				JSON response from SPARQL query.
			entities : List[EntityType]
				List of entities.

			Returns
			-------
			List[EntityType] : with .dbp_text value populated.
			"""
			bindings = dbpedia_resp["results"]["bindings"]
			names_by_uri = {}
			for row in bindings:
				if "uri" in row and "name" in row:
					names_by_uri.setdefault(row["uri"]["value"], row["name"]["value"])
			if names_by_uri:
				for ent in entities:
					if ent.uri in names_by_uri:
						ent.dbp_text = names_by_uri[ent.uri]
				return entities
			# Legacy fallback: no ?uri column available, pair rows positionally.
			for ent, row in zip(entities, bindings):
				if "name" in row:
					ent.dbp_text = row["name"]["value"]
			return entities

		def _disambiguate_text(self, text: str, entities: List[EntityType]) -> str:
			"""
			[INTERNAL] Replaces entity mentions with their "proper" name from
			DBpedia in a single pass over the text.

			A single pass avoids re-replacing text that was just inserted (e.g.
			"Obama" inside a freshly written "Barack Obama"). Longer mentions are
			tried first, and a mention only matches when it is not embedded in a
			longer word. Entities whose .dbp_text is None are left untouched. If
			two entities share a mention, the first entity's name is used.

			Parameters
			----------
			text : str
				Text to disambiguate.
			entities : List[EntityType]
				List of entities.

			Returns
			-------
			str : disambiguated text.
			"""
			import re

			replacements = {}
			for entity in entities:
				if entity.dbp_text is None:
					continue  # no proper name was found for this entity
				for mention in entity.mentions:
					if mention:
						replacements.setdefault(mention, entity.dbp_text)
			if not replacements:
				return text
			# Longest-first so "Barack Obama" wins over "Obama" at the same spot.
			ordered = sorted(replacements, key=len, reverse=True)
			pattern = re.compile(
				"|".join(r"(?<!\w)" + re.escape(m) + r"(?!\w)" for m in ordered)
			)
			return pattern.sub(lambda match: replacements[match.group(0)], text)