Skip to content

Instantly share code, notes, and snippets.

@dhuynh95
Created May 19, 2024 16:22
Show Gist options
  • Save dhuynh95/0764daf8050efa0dbcb14f980c731154 to your computer and use it in GitHub Desktop.
Save dhuynh95/0764daf8050efa0dbcb14f980c731154 to your computer and use it in GitHub Desktop.
Summarize the content of an HTML page using LlamaIndex
from llama_index.core import Document, VectorStoreIndex
from llama_index.core import Settings
import trafilatura
class PageSummarizer:
    """Summarize the main text of an HTML page with a LlamaIndex query engine.

    The page text is extracted with ``trafilatura``, indexed in an in-memory
    ``VectorStoreIndex``, and then queried with a fixed summarization prompt.
    """

    def __init__(self, llm, embed_model):
        """Store the models used by :meth:`summarize`.

        Args:
            llm: Language model assigned to ``Settings.llm`` when summarizing.
            embed_model: Embedding model assigned to ``Settings.embed_model``
                when summarizing.
        """
        self.llm = llm
        self.embed_model = embed_model

    def summarize(self, html: str) -> str:
        """Return a detailed summary of the readable content of *html*.

        Args:
            html: Raw HTML markup of the page to summarize.

        Returns:
            The response text produced by the query engine.

        Raises:
            ValueError: If *html* is empty, or if no main text content can be
                extracted from it.
        """
        # Reject unusable input up front, before touching any global state.
        if html is None or (isinstance(html, (str, bytes)) and not html.strip()):
            raise ValueError("Cannot summarize an empty HTML document")

        # NOTE: these assignments mutate the process-wide LlamaIndex Settings
        # object, so they affect every other LlamaIndex user in the process.
        Settings.llm = self.llm
        Settings.embed_model = self.embed_model

        # trafilatura.extract() yields None when it finds no main content;
        # passing that to Document() would fail with an opaque validation
        # error, so surface a clear message instead.
        page_content = trafilatura.extract(html)
        if not page_content:
            raise ValueError(
                "No main text content could be extracted from the HTML"
            )

        documents = [Document(text=page_content)]
        index = VectorStoreIndex.from_documents(documents)
        query_engine = index.as_query_engine()
        instruction = "Provide a detailled summary of this text"
        return query_engine.query(instruction).response
from llama_index.llms.groq import Groq
# Example wiring: build a summarizer backed by a Groq-hosted Llama 3 model.
model = "llama3-8b-8192"
llm = Groq(
    temperature=0.1,
    model=model,
)
# NOTE(review): `context` is not defined anywhere in this snippet; it is
# presumably created earlier by the caller and exposes an `embedding`
# attribute. Confirm before running this file on its own.
embed_model = context.embedding
page_summarizer = PageSummarizer(llm=llm, embed_model=embed_model)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment