Skip to content

Instantly share code, notes, and snippets.

@bogdansolga
Forked from cpursley/ai_web_search.ex
Created July 10, 2024 05:04
Show Gist options
  • Save bogdansolga/0ace828fe9f8428d2b16814bb937e2df to your computer and use it in GitHub Desktop.
AI Web Search
# You will need to install https://github.com/cpursley/html2markdown
defmodule Webpage do
  @moduledoc false

  # Summarized representation of a single fetched web page. All fields
  # default to nil; they are filled in by WebSearch.summarize_webpage/2.
  defstruct url: nil, title: nil, description: nil, summary: nil, page_age: nil
end
defmodule WebSearch do
  @moduledoc """
  Web search summarization chain.

  Fetches web results for a query via Brave search, summarizes each page
  individually with an LLM, then combines the per-page summaries into a
  single answer plus a list of `%{url, title}` sources.
  """

  alias LangChain.Chains.LLMChain
  alias LangChain.Message
  # Fix: the original called `Brave.search/2` without bringing the module
  # into scope; the search client is defined as `Requests.Brave`.
  alias Requests.Brave

  defstruct [:query, :summary, :sources]

  # Seconds in one day, used to convert page-age diffs to whole days.
  @seconds_per_day 86_400

  @doc """
  Summarizes the search results for a given query and returns a structured
  summary of the web pages and their sources.

  Returns `{:ok, %WebSearch{}}` on success, `{:error, reason}` otherwise.

  TODO:
  - Shorten-up prompt instructions
  - More robust relevance and ranking
  - Make max_token_length configurable
  - Don't trim long context, recursively split it into sections
  - Consider allowing PDF web result parsing
  - Extract out to Langchain Chain with config that allows different llm(s) to be passed in
  """
  def summarize(search_query, limit \\ 8, timeout \\ 10_000) do
    with %{"web" => %{"results" => results}} <- Brave.search(search_query, limit),
         summarized when is_list(summarized) <-
           summarize_search_results(search_query, results, timeout),
         summary when is_binary(summary) <- webpages_summarizer(search_query, summarized),
         sources when is_list(sources) <- map_sources(summarized) do
      {:ok,
       %WebSearch{
         query: search_query,
         summary: summary,
         sources: sources
       }}
    else
      {:error, error} ->
        {:error, error}

      _ ->
        {:error, "Failed to summarize search results"}
    end
  end

  @doc """
  Fetches and summarizes a single Brave search result map (must contain a
  binary `"url"`). Returns a `%Webpage{}` or `{:error, reason}`.
  """
  def summarize_webpage(search_query, %{"url" => url} = result) when is_binary(url) do
    with content when is_binary(content) <- request_content(url),
         parsed_content when is_binary(parsed_content) <- preprocess_webpage_content(content),
         summary when is_binary(summary) <- webpage_summarizer(search_query, parsed_content) do
      %Webpage{
        url: url,
        title: result["title"],
        description: Html2Markdown.convert(result["description"]),
        summary: summary,
        page_age: cast_page_age(result["page_age"])
      }
    else
      {:error, error} ->
        {:error, error}

      _ ->
        {:error, "Failed to summarize webpage"}
    end
  end

  # Summarizes each result concurrently; tasks that time out are killed and
  # silently dropped (on_timeout: :kill_task yields {:exit, :timeout} entries).
  defp summarize_search_results(search_query, results, timeout) do
    results
    |> Enum.map(&Map.take(&1, ["title", "description", "url", "page_age"]))
    |> Task.async_stream(&summarize_webpage(search_query, &1),
      timeout: timeout,
      on_timeout: :kill_task
    )
    # Fix: single flat_map replaces the original filter-then-map pair, which
    # relied on a truthy struct return inside Enum.filter.
    |> Enum.flat_map(fn
      {:ok, %Webpage{} = webpage} -> [webpage]
      _failed_or_timed_out -> []
    end)
  end

  # Combines the per-page summaries into one answer using the larger model.
  defp webpages_summarizer(search_query, results) do
    llm = Models.llama_v3_8b_instruct()

    system_message = """
    You are a helpful web search results summarizer. Your task is to deliver a concise and accurate response to a user's query, drawing from the provided search result summaries.
    Please combine the following web page summaries into a single, comprehensive summary. The individual summaries have been generated by an LLM and cover different aspects or sections of a larger topic.
    Before combining the summaries, consider the following:
    - Assess the relevance of each individual summary to the original user query.
    - Give higher preference to summaries with a newer page age when the topic of the user's query is time-sensitive.
    - Filter out summaries that are not strongly relevant to the user's query or do not contribute significantly to answering the question.
    - Ignore any summaries that are empty, have no content, or contain only whitespace characters.
    When creating the combined summary, consider the following:
    - Identify common themes, topics, or ideas across the relevant individual summaries.
    - Organize the information in a logical and coherent manner, ensuring a smooth flow between the different sections.
    - Synthesize the key points and main takeaways from each relevant summary, while avoiding repetition or redundancy.
    - Maintain the accuracy and integrity of the information presented in the original summaries.
    - Use clear and concise language to convey the combined summary effectively using an unbiased and journalistic tone.
    - If there are any contradictory or conflicting points across the summaries, try to reconcile them or present them objectively.
    - Don't use phrases like "here is", "this article", "this webpage", "the page", "the content" or other hedging language.
    - Focus on providing information that directly addresses the user's query and helps answer their question comprehensively.
    Combining data:
    - If you encounter similar or overlapping data lists or tables across multiple summaries, merge them into a single, comprehensive list or table.
    - Identify the common fields or properties present in the overlapping data lists.
    - Merge the data from the overlapping lists and table, ensuring that each unique entry is represented only once in the combined list or table.
    - If there are conflicting values for the same entry across different lists, use your best judgment to determine the most accurate or relevant value to include in the combined list.
    Formatting:
    - Use appropriate headings, subheadings, or bullet points to organize the information.
    - If the data lends itself well to a tabular format (e.g., comparisons, lists with multiple properties), consider presenting it in a markdown table.
    - If a table is not suitable, use other appropriate markdown formatting such as lists, code blocks, or blockquotes to present the information effectively.
    - Do not trim or remove any relevant data from the tables or lists and don't use placeholders
    - Do not list your sources and never write URLs or links!
    Please provide your response in well structured markdown format. But don't mention "markdown" in your response.
    """

    user_message = """
    User query: #{search_query}
    ## Individual web page summaries:
    #{map_summaries(results)}
    """

    messages = [
      Message.new_system!(system_message),
      Message.new_user!(user_message)
    ]

    run_chain(llm, messages)
  end

  # Summarizes a single page's content with the smaller long-context model.
  defp webpage_summarizer(search_query, content) do
    llm = Models.phi_3_mini_128k_instruct()

    system_message = """
    You are a helpful web page data extractor and summarizer.
    Please analyze the following web page content and extract the key meanings into a summary without losing any important information and extract the structured data without modifying its format.
    Before summarizing the content, consider the following:
    - Assess the relevance of the content to the original user query.
    - Filter out content that are not strongly relevant to the user's query or do not contribute significantly to answering the question.
    - Ignore any content that is empty, or contain only whitespace characters.
    Summary:
    - Identify the main themes, topics, or ideas discussed in the content.
    - Recognize important facts, figures, or examples that support the main points.
    - Capture any essential context or background information necessary for understanding the content.
    - Avoid repetition and eliminate any redundant or less critical information.
    - Organize the summary by grouping related meanings together under relevant headings or sections.
    - Don't return any promotional or irrelevant information.
    - Use clear and concise language to convey the content effectively using an unbiased and journalistic tone.
    - Don't use phrases like "this article", "this webpage", "the page", "the content" or other hedging language.
    - Focus on providing information that directly addresses the user's query and helps answer their question comprehensively.
    Data:
    - Identify and extract tables, lists, code snippets, or any other formatted data present in the content.
    - Maintain the original structure and formatting of the extracted data.
    - Ensure that no information is lost or altered during the extraction process.
    - If there are multiple instances of structured data, extract each instance separately.
    Please provide your response in well structured markdown format. But don't mention "markdown" in your response.
    """

    user_message = """
    User query: #{search_query}
    ## Web page content to summarize:
    ```html
    #{content}
    ```
    """

    messages = [
      Message.new_system!(system_message),
      Message.new_user!(user_message)
    ]

    run_chain(llm, messages)
  end

  # Runs the chain and unwraps the final message content; any non-ok result
  # is passed through unchanged so callers' `with` clauses fall to `else`.
  defp run_chain(llm, messages) do
    %{llm: llm, verbose: false}
    |> LLMChain.new!()
    |> LLMChain.add_messages(messages)
    |> LLMChain.run(mode: :while_needs_response)
    |> case do
      {:ok, _chain, %{content: content}} -> content
      error -> error
    end
  end

  defp map_sources(summarized_webpages) do
    Enum.map(summarized_webpages, fn summarized_webpage ->
      %{
        url: summarized_webpage.url,
        title: summarized_webpage.title
      }
    end)
  end

  # Renders each summary as a numbered section, then trims the whole thing
  # to the combining model's context budget.
  defp map_summaries(results) do
    # Llama 3 estimated token length (with some wiggle-room): (string length) / 4
    max_token_length = 7_200 * 4

    results
    |> Enum.with_index()
    |> Enum.map_join("\n", fn {result, index} ->
      """
      ### Web Page #{index + 1}:
      Title: #{result.title}
      Description: #{result.description}
      Summary: #{result.summary}
      Page Age: #{calculate_page_age(result.page_age)}
      """
    end)
    |> maybe_trim_to_context_limit(max_token_length)
  end

  # Fetches an https URL and converts it to markdown. Returns nil for
  # non-https URLs, PDFs, and fetch failures, so callers fall through to
  # their error branch instead of crashing.
  defp request_content(url) do
    case URI.new(url) do
      {:ok, %URI{scheme: "https", path: path}} ->
        if pdf_path?(path) do
          nil
        else
          # Fix: the original piped fetch_content/1 straight into
          # Html2Markdown.convert/1, passing {:error, _} tuples to it.
          case fetch_content(url) do
            content when is_binary(content) -> Html2Markdown.convert(content)
            _error -> nil
          end
        end

      _ ->
        nil
    end
  end

  # Fix: URI.path is nil for URLs without a path (e.g. "https://example.com");
  # the original crashed in Path.extname/1 on that case. Also renamed from
  # is_pdf_uri? — the is_ prefix is reserved for guard-safe macros.
  defp pdf_path?(nil), do: false
  defp pdf_path?(path), do: Path.extname(path) == ".pdf"

  defp fetch_content(url) do
    case Req.get(url) do
      {:ok, %Req.Response{status: 200, body: content}} -> content
      {:ok, response} -> {:error, response}
      {:error, error} -> {:error, error}
    end
  end

  defp preprocess_webpage_content(content) do
    # Phi 3 estimated token length (with some wiggle-room): (string length) / 4
    max_token_length = 85_000 * 4
    maybe_trim_to_context_limit(content, max_token_length)
  end

  # Fix: the original's two clauses overlapped at byte_size == max (<= and >=);
  # a single clause makes the boundary unambiguous. Content at or under the
  # limit passes through untouched, longer content is sliced and trimmed.
  defp maybe_trim_to_context_limit(content, max_token_length) when is_binary(content) do
    if byte_size(content) <= max_token_length do
      content
    else
      content
      |> String.slice(0, max_token_length)
      |> String.trim()
    end
  end

  # Parses Brave's ISO-8601 page_age string; nil when absent or unparseable.
  defp cast_page_age(date_string) when is_binary(date_string) do
    case NaiveDateTime.from_iso8601(date_string) do
      {:ok, parsed_date} -> parsed_date
      {:error, _error} -> nil
    end
  end

  defp cast_page_age(_date_string), do: nil

  # Human-readable age bucket (days < 60, months < 365, else years),
  # pluralized with Inflex.
  defp calculate_page_age(nil), do: "Unknown age"

  defp calculate_page_age(%NaiveDateTime{} = page_age) do
    total_days =
      NaiveDateTime.utc_now()
      |> NaiveDateTime.diff(page_age, :second)
      |> div(@seconds_per_day)

    cond do
      total_days < 60 ->
        "#{total_days} " <> Inflex.inflect("day", total_days)

      total_days < 365 ->
        months = div(total_days, 30)
        "#{months} " <> Inflex.inflect("month", months)

      true ->
        years = div(total_days, 365)
        "#{years} " <> Inflex.inflect("year", years)
    end
  end
end
defmodule Requests.Brave do
  @moduledoc """
  Web search using Brave
  Docs: https://api.search.brave.com/app/documentation/web-search/get-started
  """

  @brave_search_url "https://api.search.brave.com/res/v1/web/search"

  @doc """
  Runs a Brave web search for `query`.

  Returns the decoded response body on HTTP 200, `{:error, %Req.Response{}}`
  for any other status, and `{:error, reason}` on transport failure.
  """
  def search(query, count \\ 20, result_filter \\ "query, web") do
    params = %{q: query, result_filter: result_filter, count: count}

    case Req.get(@brave_search_url, headers: headers(), params: params) do
      {:ok, %Req.Response{status: 200, body: body}} ->
        body

      {:ok, %Req.Response{} = response} ->
        {:error, response}

      {:error, reason} ->
        {:error, reason}
    end
  end

  # Fix: the original baked the API key into module attributes via
  # Application.compile_env/2, freezing the secret at compile time (and
  # compiling in nil when the env isn't set at build time). Building the
  # headers at runtime reads the key from the current application env.
  # The unused private get/2 helper was removed (it only triggered a
  # compiler warning).
  defp headers do
    [
      {"Accept", "application/json"},
      {"Accept-Encoding", "gzip"},
      {"X-Subscription-Token", Application.fetch_env!(:your_app, :brave_api_key)}
    ]
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment