Skip to content

Instantly share code, notes, and snippets.

@cpursley
Last active July 10, 2024 09:20
Show Gist options
  • Save cpursley/b4af2ff3b56c912f659bd5300e422790 to your computer and use it in GitHub Desktop.
AI Web Search
# You will need to install https://github.com/cpursley/html2markdown
defmodule Webpage do
@moduledoc false
defstruct [:url, :title, :description, :summary, :page_age]
end
defmodule WebSearch do
  @moduledoc """
  Web search summarization chain

  Fetches web results from Brave Search, summarizes each page concurrently
  with an LLM, then combines the per-page summaries into a single answer
  for the user's query.
  """

  alias LangChain.Chains.LLMChain
  alias LangChain.Message
  # The Brave client is defined later in this file as Requests.Brave; alias it
  # so the `Brave.search/2` call below resolves to that module.
  alias Requests.Brave

  defstruct [:query, :summary, :sources]

  @doc """
  Summarizes the search results for a given query and returns a structured summary of the web pages and their sources.

  Returns `{:ok, %WebSearch{}}` on success, or `{:error, reason}` when the
  search, the per-page summarization, or the final combination step fails.

  TODO:
  - Shorten-up prompt instructions
  - More robust relevance and ranking
  - Make max_token_length configurable
  - Don't trim long context, recursively split it into sections
  - Consider allowing PDF web result parsing
  - Extract out to Langchain Chain with config that allows different llm(s) to be passed in
  """
  def summarize(search_query, limit \\ 8, timeout \\ 10_000) do
    with %{"web" => %{"results" => results}} <- Brave.search(search_query, limit),
         summarized_search_results when is_list(summarized_search_results) <-
           summarize_search_results(search_query, results, timeout),
         summary when is_binary(summary) <-
           webpages_summarizer(search_query, summarized_search_results),
         sources when is_list(sources) <- map_sources(summarized_search_results) do
      {
        :ok,
        %WebSearch{
          query: search_query,
          summary: summary,
          sources: sources
        }
      }
    else
      {:error, error} ->
        {:error, error}

      _ ->
        {:error, "Failed to summarize search results"}
    end
  end

  @doc """
  Fetches, parses and LLM-summarizes a single search result.

  Returns a `%Webpage{}` on success or `{:error, reason}` when the page
  cannot be fetched, parsed, or summarized.
  """
  def summarize_webpage(search_query, %{"url" => url} = result) when is_binary(url) do
    with content when is_binary(content) <- request_content(url),
         parsed_content when is_binary(parsed_content) <- preprocess_webpage_content(content),
         summary when is_binary(summary) <- webpage_summarizer(search_query, parsed_content) do
      %Webpage{
        url: url,
        title: result["title"],
        description: Html2Markdown.convert(result["description"]),
        summary: summary,
        page_age: cast_page_age(result["page_age"])
      }
    else
      {:error, error} ->
        {:error, error}

      _ ->
        {:error, "Failed to summarize webpage"}
    end
  end

  # Summarizes each result concurrently; tasks exceeding `timeout` are killed
  # (on_timeout: :kill_task) and, like failed summaries, silently dropped so
  # one bad page never sinks the whole search.
  defp summarize_search_results(search_query, results, timeout) do
    results
    |> Enum.map(&Map.take(&1, ["title", "description", "url", "page_age"]))
    |> Task.async_stream(&summarize_webpage(search_query, &1),
      timeout: timeout,
      on_timeout: :kill_task
    )
    |> Enum.flat_map(fn
      # Keep only tasks that completed AND produced a successful summary.
      {:ok, %Webpage{} = webpage} -> [webpage]
      _failed_or_timed_out -> []
    end)
  end

  # Combines the individual page summaries into one answer for the query.
  # Returns the LLM's response content (binary) or an error tuple from the chain.
  defp webpages_summarizer(search_query, results) do
    llm = Models.llama_v3_8b_instruct()

    system_message = """
    You are a helpful web search results summarizer. Your task is to deliver a concise and accurate response to a user's query, drawing from the provided search result summaries.
    Please combine the following web page summaries into a single, comprehensive summary. The individual summaries have been generated by an LLM and cover different aspects or sections of a larger topic.
    Before combining the summaries, consider the following:
    - Assess the relevance of each individual summary to the original user query.
    - Give higher preference to summaries with a newer page age when the topic of the user's query is time-sensitive.
    - Filter out summaries that are not strongly relevant to the user's query or do not contribute significantly to answering the question.
    - Ignore any summaries that are empty, have no content, or contain only whitespace characters.
    When creating the combined summary, consider the following:
    - Identify common themes, topics, or ideas across the relevant individual summaries.
    - Organize the information in a logical and coherent manner, ensuring a smooth flow between the different sections.
    - Synthesize the key points and main takeaways from each relevant summary, while avoiding repetition or redundancy.
    - Maintain the accuracy and integrity of the information presented in the original summaries.
    - Use clear and concise language to convey the combined summary effectively using an unbiased and journalistic tone.
    - If there are any contradictory or conflicting points across the summaries, try to reconcile them or present them objectively.
    - Don't use phrases like "here is", "this article", "this webpage", "the page", "the content" or other hedging language.
    - Focus on providing information that directly addresses the user's query and helps answer their question comprehensively.
    Combining data:
    - If you encounter similar or overlapping data lists or tables across multiple summaries, merge them into a single, comprehensive list or table.
    - Identify the common fields or properties present in the overlapping data lists.
    - Merge the data from the overlapping lists and table, ensuring that each unique entry is represented only once in the combined list or table.
    - If there are conflicting values for the same entry across different lists, use your best judgment to determine the most accurate or relevant value to include in the combined list.
    Formatting:
    - Use appropriate headings, subheadings, or bullet points to organize the information.
    - If the data lends itself well to a tabular format (e.g., comparisons, lists with multiple properties), consider presenting it in a markdown table.
    - If a table is not suitable, use other appropriate markdown formatting such as lists, code blocks, or blockquotes to present the information effectively.
    - Do not trim or remove any relevant data from the tables or lists and don't use placeholders
    - Do not list your sources and never write URLs or links!
    Please provide your response in well structured markdown format. But don't mention "markdown" in your response.
    """

    user_message = """
    User query: #{search_query}
    ## Individual web page summaries:
    #{map_summaries(results)}
    """

    messages = [
      Message.new_system!(system_message),
      Message.new_user!(user_message)
    ]

    run_chain(llm, messages)
  end

  # Extracts/summarizes one page's markdown content with a small-context model.
  # Returns the LLM's response content (binary) or an error tuple from the chain.
  defp webpage_summarizer(search_query, content) do
    llm = Models.phi_3_mini_128k_instruct()

    system_message = """
    You are a helpful web page data extractor and summarizer.
    Please analyze the following web page content and extract the key meanings into a summary without losing any important information and extract the structured data without modifying its format.
    Before summarizing the content, consider the following:
    - Assess the relevance of the content to the original user query.
    - Filter out content that are not strongly relevant to the user's query or do not contribute significantly to answering the question.
    - Ignore any content that is empty, or contain only whitespace characters.
    Summary:
    - Identify the main themes, topics, or ideas discussed in the content.
    - Recognize important facts, figures, or examples that support the main points.
    - Capture any essential context or background information necessary for understanding the content.
    - Avoid repetition and eliminate any redundant or less critical information.
    - Organize the summary by grouping related meanings together under relevant headings or sections.
    - Don't return any promotional or irrelevant information.
    - Use clear and concise language to convey the content effectively using an unbiased and journalistic tone.
    - Don't use phrases like "this article", "this webpage", "the page", "the content" or other hedging language.
    - Focus on providing information that directly addresses the user's query and helps answer their question comprehensively.
    Data:
    - Identify and extract tables, lists, code snippets, or any other formatted data present in the content.
    - Maintain the original structure and formatting of the extracted data.
    - Ensure that no information is lost or altered during the extraction process.
    - If there are multiple instances of structured data, extract each instance separately.
    Please provide your response in well structured markdown format. But don't mention "markdown" in your response.
    """

    user_message = """
    User query: #{search_query}
    ## Web page content to summarize:
    ```html
    #{content}
    ```
    """

    messages = [
      Message.new_system!(system_message),
      Message.new_user!(user_message)
    ]

    run_chain(llm, messages)
  end

  # Runs the LLM chain and unwraps the final message content; any non-ok
  # result is returned as-is so callers' `with` clauses fall through.
  defp run_chain(llm, messages) do
    %{llm: llm, verbose: false}
    |> LLMChain.new!()
    |> LLMChain.add_messages(messages)
    |> LLMChain.run(mode: :while_needs_response)
    |> case do
      {:ok, _chain, %{content: content}} ->
        content

      error ->
        error
    end
  end

  # Maps summarized pages to the minimal source citations returned to callers.
  defp map_sources(summarized_webpages) do
    Enum.map(summarized_webpages, fn summarized_webpage ->
      %{
        url: summarized_webpage.url,
        title: summarized_webpage.title
      }
    end)
  end

  # Renders the per-page summaries into one markdown document for the
  # combining prompt, trimmed to fit the combiner model's context window.
  defp map_summaries(results) do
    # Llama 3 estimated token length (with some wiggle-room): (string length) / 4
    max_token_length = 7_200 * 4

    results
    |> Enum.with_index()
    |> Enum.map_join("\n", fn {result, index} ->
      """
      ### Web Page #{index + 1}:
      Title: #{result.title}
      Description: #{result.description}
      Summary: #{result.summary}
      Page Age: #{calculate_page_age(result.page_age)}
      """
    end)
    |> maybe_trim_to_context_limit(max_token_length)
  end

  # Fetches a page and converts it to markdown. Only https URLs are fetched;
  # PDFs are skipped (returns nil, see TODO about PDF parsing). Fetch failures
  # are returned as `{:error, reason}` instead of being piped into the
  # converter (the original piped error tuples into Html2Markdown.convert/1).
  defp request_content(url) do
    case URI.new(url) do
      {:ok, %URI{scheme: "https", path: path}} ->
        if pdf_path?(path) do
          nil
        else
          case fetch_content(url) do
            content when is_binary(content) -> Html2Markdown.convert(content)
            {:error, _reason} = error -> error
          end
        end

      _invalid_or_non_https ->
        nil
    end
  end

  # A URI with no path (e.g. "https://example.com") has no extension and is
  # not a PDF; guarding on nil avoids Path.extname/1 raising.
  defp pdf_path?(nil), do: false
  defp pdf_path?(path), do: Path.extname(path) == ".pdf"

  # Returns the response body on HTTP 200, otherwise an `{:error, _}` tuple
  # wrapping the response or transport error.
  defp fetch_content(url) do
    case Req.get(url) do
      {:ok, %Req.Response{status: 200, body: content}} -> content
      {:ok, response} -> {:error, response}
      {:error, error} -> {:error, error}
    end
  end

  # Trims page markdown to the extractor model's context window.
  defp preprocess_webpage_content(content) do
    # Phi 3 estimated token length (with some wiggle-room): (string length) / 4
    max_token_length = 85_000 * 4
    maybe_trim_to_context_limit(content, max_token_length)
  end

  # Content fits the budget: pass through untouched.
  defp maybe_trim_to_context_limit(content, max_token_length)
       when is_binary(content) and byte_size(content) <= max_token_length do
    content
  end

  # Over budget: keep roughly the first `max_token_length` characters. This is
  # a heuristic cut (graphemes vs. the byte-size guard) — see the TODO about
  # recursively splitting long context instead of truncating it.
  defp maybe_trim_to_context_limit(content, max_token_length) when is_binary(content) do
    content
    |> String.slice(0, max_token_length)
    |> String.trim()
  end

  # Parses Brave's ISO-8601 "page_age" into a NaiveDateTime; nil when absent
  # or unparseable.
  defp cast_page_age(date_string) when is_binary(date_string) do
    case NaiveDateTime.from_iso8601(date_string) do
      {:ok, parsed_date} -> parsed_date
      {:error, _error} -> nil
    end
  end

  defp cast_page_age(_date_string), do: nil

  # Renders a page age as human-readable text ("3 days", "2 months", …) for
  # the combining prompt. Approximate: 30-day months, 365-day years.
  defp calculate_page_age(nil), do: "Unknown age"

  defp calculate_page_age(%NaiveDateTime{} = page_age) do
    total_days =
      NaiveDateTime.utc_now()
      |> NaiveDateTime.diff(page_age, :second)
      |> div(86_400)

    cond do
      total_days < 60 ->
        "#{total_days} " <> Inflex.inflect("day", total_days)

      total_days < 365 ->
        months = div(total_days, 30)
        "#{months} " <> Inflex.inflect("month", months)

      true ->
        years = div(total_days, 365)
        "#{years} " <> Inflex.inflect("year", years)
    end
  end
end
defmodule Requests.Brave do
  @moduledoc """
  Web search using Brave
  Docs: https://api.search.brave.com/app/documentation/web-search/get-started
  """

  @brave_search_url "https://api.search.brave.com/res/v1/web/search"

  # NOTE(review): compile_env/2 freezes the key into the compiled artifact;
  # use Application.fetch_env!/2 at call time if the key must be
  # configurable per release without recompiling.
  @brave_api_key Application.compile_env(:your_app, :brave_api_key)

  @headers [
    {"Accept", "application/json"},
    {"Accept-Encoding", "gzip"},
    {"X-Subscription-Token", @brave_api_key}
  ]

  @doc """
  Runs a Brave web search for `query`.

  Returns the decoded response body on HTTP 200, or `{:error, response}` /
  `{:error, reason}` on a non-200 status or transport failure. `count` caps
  the number of results; `result_filter` selects which response sections
  Brave includes.
  """
  def search(query, count \\ 20, result_filter \\ "query, web") do
    params = %{q: query, result_filter: result_filter, count: count}

    # Route through get/2 (previously defined but unused, which the compiler
    # warns about) so all Brave requests share the same headers.
    case get(@brave_search_url, params) do
      {:ok, %Req.Response{status: 200, body: body}} ->
        body

      {:ok, %Req.Response{} = response} ->
        {:error, response}

      {:error, reason} ->
        {:error, reason}
    end
  end

  # Shared authenticated GET against the Brave API.
  defp get(url, params) do
    Req.get(url, headers: @headers, params: params)
  end
end
@bogdansolga
Copy link

Hello, @cpursley!

I have (accidentally / incidentally) found your Elixir files, and I consider them very useful.

Do you have (or know of) any 'how to' guide for using them, as a non-Elixir user? :-)

@cpursley
Copy link
Author

cpursley commented Jul 10, 2024

Hi @bogdansolga I think the most interesting thing are the prompts. So you could borrow that and use Brave Search API to build your own version. You might even be able to drop this code into ChatGPT or Claude and ask it to rewrite it in your preferred language (just tell it which LLM adapter and http library to use).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment