Last active
August 26, 2025 22:34
-
-
Save CodeWithOz/0b01d558adac703673aceca48471dca7 to your computer and use it in GitHub Desktop.
Agent for cleaning up named entities in YouTube video transcripts.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ... | |
class TavilySearchResult(TypedDict):
    """A single web search result, reduced to the two text fields used here."""

    # Title text of the search result.
    title: str
    # Body text of the search result.
    content: str
# Schema holding only the canonical name of an entity.
#
# NOTE: documented with comments instead of a class docstring on purpose.
# pydantic copies a model's docstring into its generated JSON schema, and this
# model is handed to `with_structured_output`, so a docstring would alter the
# schema the chat model receives.
class BaseVerifiedEntity(BaseModel):
    # Required field; the description string is part of the generated schema.
    canonical_name: str = Field(
        description="The canonical name of the entity."
    )
# Pairs the name as it was extracted from the video with the canonical name
# inherited from BaseVerifiedEntity.
#
# NOTE: comments are used instead of a class docstring because pydantic copies
# a model's docstring into its generated JSON schema.
class VerifiedEntity(BaseVerifiedEntity):
    # Required field; the name exactly as it was extracted.
    extracted_name: str = Field(
        description="The name extracted from the video."
    )
| ... | |
class DemoEnrichmentAgent:
    """Agent for cleaning up named entities in YouTube video transcripts.

    The part of the class shown here resolves an extracted entity name to its
    canonical name by asking a chat model to read web search results about
    that entity.
    """

    ...  # placeholder carried over unchanged from the original listing

    # Chat model configured to return ``BaseVerifiedEntity`` structured output.
    entity_verifier_llm: Runnable

    def __init__(self) -> None:
        """Build the structured-output chat model used to verify entities."""
        ...  # placeholder carried over unchanged from the original listing
        self.entity_verifier_llm = init_chat_model(
            "gpt-4o-mini", model_provider="openai"
        ).with_structured_output(BaseVerifiedEntity)

    async def get_canonical_name(
        self, entity: NamedEntity, search_results: list[TavilySearchResult]
    ) -> VerifiedEntity:
        """Resolve the canonical name of ``entity`` from web search results.

        Args:
            entity: The extracted entity; only its ``entity_name`` is read.
            search_results: Search results about the entity. Each result's
                ``title`` and ``content`` are passed to the model.

        Returns:
            A ``VerifiedEntity`` whose ``extracted_name`` is the original
            entity name. ``canonical_name`` is the model's answer, or the
            original entity name when there are no search results or the
            model yields no usable name.

        Exceptions raised by the model call are not caught here.
        """
        extracted_name = entity.entity_name

        if not search_results:
            # Nothing to verify against: treat the extracted name as canonical.
            return VerifiedEntity(
                extracted_name=extracted_name,
                canonical_name=extracted_name,
            )

        # One "title\ncontent" paragraph per result, separated by blank lines.
        results_txt = "\n\n".join(
            f'{result.get("title", "")}\n{result.get("content", "")}'
            for result in search_results
        )
        human_message = f"EXTRACTED_ENTITY_NAME:\n{extracted_name}\n\nSEARCH_RESULTS_TEXT:\n{results_txt}"
        print(
            f"Getting canonical name from search results for entity {extracted_name!r}"
        )

        verifier_res: BaseVerifiedEntity | None = (
            await self.entity_verifier_llm.ainvoke(
                [
                    SystemMessage(
                        content=(
                            "You are a perceptive and highly skilled data extractor. "
                            "You will receive 2 inputs: "
                            "(1) an entity name that was extracted from a video transcript; "
                            "(2) a series of newline-separated search results from a web "
                            "search about that entity. "
                            "Your task is to extract the canonical name of the entity from "
                            "the search results. Your output MUST ONLY be a JSON object with ONLY "
                            "one key `canonical_name`. "
                            'For example, `{"canonical_name": "<confirmed-canonical-name>"}.`'
                        )
                    ),
                    HumanMessage(content=human_message),
                ]
            )
        )

        # Guard against a missing result or a blank name so that a bad model
        # response cannot erase the entity; mirror the no-results branch and
        # keep the extracted name instead.
        canonical_name = (
            getattr(verifier_res, "canonical_name", None) or ""
        ).strip()
        if not canonical_name:
            print(
                f"LLM returned no canonical name for entity {extracted_name!r}; "
                "keeping the extracted name"
            )
            canonical_name = extracted_name
        else:
            print(
                f"LLM verified entity {extracted_name!r} as {canonical_name!r}"
            )

        return VerifiedEntity(
            extracted_name=extracted_name,
            canonical_name=canonical_name,
        )
| ... |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment