Skip to content

Instantly share code, notes, and snippets.

@cpursley
Last active July 10, 2024 09:20
Show Gist options
  • Save cpursley/b4af2ff3b56c912f659bd5300e422790 to your computer and use it in GitHub Desktop.
AI Web Search
# You will need to install https://github.com/cpursley/html2markdown
defmodule Webpage do
@moduledoc false
defstruct [:url, :title, :description, :summary, :page_age]
end
defmodule WebSearch do
  @moduledoc """
  Web search summarization chain

  Fetches web results from Brave Search, summarizes each page concurrently
  with an LLM, then combines the per-page summaries into a single answer
  for the user's query.
  """

  alias LangChain.Chains.LLMChain
  alias LangChain.Message
  # The Brave client is defined later in this file as Requests.Brave; alias it
  # so the `Brave.search/2` call below resolves to that module.
  alias Requests.Brave

  defstruct [:query, :summary, :sources]

  @doc """
  Summarizes the search results for a given query and returns a structured summary of the web pages and their sources.

  Returns `{:ok, %WebSearch{}}` on success, or `{:error, reason}` when the
  search, the per-page summarization, or the final combination step fails.

  TODO:
  - Shorten-up prompt instructions
  - More robust relevance and ranking
  - Make max_token_length configurable
  - Don't trim long context, recursively split it into sections
  - Consider allowing PDF web result parsing
  - Extract out to Langchain Chain with config that allows different llm(s) to be passed in
  """
  def summarize(search_query, limit \\ 8, timeout \\ 10_000) do
    with %{"web" => %{"results" => results}} <- Brave.search(search_query, limit),
         summarized_search_results when is_list(summarized_search_results) <-
           summarize_search_results(search_query, results, timeout),
         summary when is_binary(summary) <-
           webpages_summarizer(search_query, summarized_search_results),
         sources when is_list(sources) <- map_sources(summarized_search_results) do
      {
        :ok,
        %WebSearch{
          query: search_query,
          summary: summary,
          sources: sources
        }
      }
    else
      {:error, error} ->
        {:error, error}

      _ ->
        {:error, "Failed to summarize search results"}
    end
  end

  @doc """
  Fetches, parses and LLM-summarizes a single search result.

  Returns a `%Webpage{}` on success or `{:error, reason}` when the page
  cannot be fetched, parsed, or summarized.
  """
  def summarize_webpage(search_query, %{"url" => url} = result) when is_binary(url) do
    with content when is_binary(content) <- request_content(url),
         parsed_content when is_binary(parsed_content) <- preprocess_webpage_content(content),
         summary when is_binary(summary) <- webpage_summarizer(search_query, parsed_content) do
      %Webpage{
        url: url,
        title: result["title"],
        description: Html2Markdown.convert(result["description"]),
        summary: summary,
        page_age: cast_page_age(result["page_age"])
      }
    else
      {:error, error} ->
        {:error, error}

      _ ->
        {:error, "Failed to summarize webpage"}
    end
  end

  # Summarizes each result concurrently; tasks exceeding `timeout` are killed
  # (on_timeout: :kill_task) and, like failed summaries, silently dropped so
  # one bad page never sinks the whole search.
  defp summarize_search_results(search_query, results, timeout) do
    results
    |> Enum.map(&Map.take(&1, ["title", "description", "url", "page_age"]))
    |> Task.async_stream(&summarize_webpage(search_query, &1),
      timeout: timeout,
      on_timeout: :kill_task
    )
    |> Enum.flat_map(fn
      # Keep only tasks that completed AND produced a successful summary.
      {:ok, %Webpage{} = webpage} -> [webpage]
      _failed_or_timed_out -> []
    end)
  end

  # Combines the individual page summaries into one answer for the query.
  # Returns the LLM's response content (binary) or an error tuple from the chain.
  defp webpages_summarizer(search_query, results) do
    llm = Models.llama_v3_8b_instruct()

    system_message = """
    You are a helpful web search results summarizer. Your task is to deliver a concise and accurate response to a user's query, drawing from the provided search result summaries.
    Please combine the following web page summaries into a single, comprehensive summary. The individual summaries have been generated by an LLM and cover different aspects or sections of a larger topic.
    Before combining the summaries, consider the following:
    - Assess the relevance of each individual summary to the original user query.
    - Give higher preference to summaries with a newer page age when the topic of the user's query is time-sensitive.
    - Filter out summaries that are not strongly relevant to the user's query or do not contribute significantly to answering the question.
    - Ignore any summaries that are empty, have no content, or contain only whitespace characters.
    When creating the combined summary, consider the following:
    - Identify common themes, topics, or ideas across the relevant individual summaries.
    - Organize the information in a logical and coherent manner, ensuring a smooth flow between the different sections.
    - Synthesize the key points and main takeaways from each relevant summary, while avoiding repetition or redundancy.
    - Maintain the accuracy and integrity of the information presented in the original summaries.
    - Use clear and concise language to convey the combined summary effectively using an unbiased and journalistic tone.
    - If there are any contradictory or conflicting points across the summaries, try to reconcile them or present them objectively.
    - Don't use phrases like "here is", "this article", "this webpage", "the page", "the content" or other hedging language.
    - Focus on providing information that directly addresses the user's query and helps answer their question comprehensively.
    Combining data:
    - If you encounter similar or overlapping data lists or tables across multiple summaries, merge them into a single, comprehensive list or table.
    - Identify the common fields or properties present in the overlapping data lists.
    - Merge the data from the overlapping lists and table, ensuring that each unique entry is represented only once in the combined list or table.
    - If there are conflicting values for the same entry across different lists, use your best judgment to determine the most accurate or relevant value to include in the combined list.
    Formatting:
    - Use appropriate headings, subheadings, or bullet points to organize the information.
    - If the data lends itself well to a tabular format (e.g., comparisons, lists with multiple properties), consider presenting it in a markdown table.
    - If a table is not suitable, use other appropriate markdown formatting such as lists, code blocks, or blockquotes to present the information effectively.
    - Do not trim or remove any relevant data from the tables or lists and don't use placeholders
    - Do not list your sources and never write URLs or links!
    Please provide your response in well structured markdown format. But don't mention "markdown" in your response.
    """

    user_message = """
    User query: #{search_query}
    ## Individual web page summaries:
    #{map_summaries(results)}
    """

    messages = [
      Message.new_system!(system_message),
      Message.new_user!(user_message)
    ]

    run_chain(llm, messages)
  end

  # Extracts/summarizes one page's markdown content with a small-context model.
  # Returns the LLM's response content (binary) or an error tuple from the chain.
  defp webpage_summarizer(search_query, content) do
    llm = Models.phi_3_mini_128k_instruct()

    system_message = """
    You are a helpful web page data extractor and summarizer.
    Please analyze the following web page content and extract the key meanings into a summary without losing any important information and extract the structured data without modifying its format.
    Before summarizing the content, consider the following:
    - Assess the relevance of the content to the original user query.
    - Filter out content that are not strongly relevant to the user's query or do not contribute significantly to answering the question.
    - Ignore any content that is empty, or contain only whitespace characters.
    Summary:
    - Identify the main themes, topics, or ideas discussed in the content.
    - Recognize important facts, figures, or examples that support the main points.
    - Capture any essential context or background information necessary for understanding the content.
    - Avoid repetition and eliminate any redundant or less critical information.
    - Organize the summary by grouping related meanings together under relevant headings or sections.
    - Don't return any promotional or irrelevant information.
    - Use clear and concise language to convey the content effectively using an unbiased and journalistic tone.
    - Don't use phrases like "this article", "this webpage", "the page", "the content" or other hedging language.
    - Focus on providing information that directly addresses the user's query and helps answer their question comprehensively.
    Data:
    - Identify and extract tables, lists, code snippets, or any other formatted data present in the content.
    - Maintain the original structure and formatting of the extracted data.
    - Ensure that no information is lost or altered during the extraction process.
    - If there are multiple instances of structured data, extract each instance separately.
    Please provide your response in well structured markdown format. But don't mention "markdown" in your response.
    """

    user_message = """
    User query: #{search_query}
    ## Web page content to summarize:
    ```html
    #{content}
    ```
    """

    messages = [
      Message.new_system!(system_message),
      Message.new_user!(user_message)
    ]

    run_chain(llm, messages)
  end

  # Runs the LLM chain and unwraps the final message content; any non-ok
  # result is returned as-is so callers' `with` clauses fall through.
  defp run_chain(llm, messages) do
    %{llm: llm, verbose: false}
    |> LLMChain.new!()
    |> LLMChain.add_messages(messages)
    |> LLMChain.run(mode: :while_needs_response)
    |> case do
      {:ok, _chain, %{content: content}} ->
        content

      error ->
        error
    end
  end

  # Maps summarized pages to the minimal source citations returned to callers.
  defp map_sources(summarized_webpages) do
    Enum.map(summarized_webpages, fn summarized_webpage ->
      %{
        url: summarized_webpage.url,
        title: summarized_webpage.title
      }
    end)
  end

  # Renders the per-page summaries into one markdown document for the
  # combining prompt, trimmed to fit the combiner model's context window.
  defp map_summaries(results) do
    # Llama 3 estimated token length (with some wiggle-room): (string length) / 4
    max_token_length = 7_200 * 4

    results
    |> Enum.with_index()
    |> Enum.map_join("\n", fn {result, index} ->
      """
      ### Web Page #{index + 1}:
      Title: #{result.title}
      Description: #{result.description}
      Summary: #{result.summary}
      Page Age: #{calculate_page_age(result.page_age)}
      """
    end)
    |> maybe_trim_to_context_limit(max_token_length)
  end

  # Fetches a page and converts it to markdown. Only https URLs are fetched;
  # PDFs are skipped (returns nil, see TODO about PDF parsing). Fetch failures
  # are returned as `{:error, reason}` instead of being piped into the
  # converter (the original piped error tuples into Html2Markdown.convert/1).
  defp request_content(url) do
    case URI.new(url) do
      {:ok, %URI{scheme: "https", path: path}} ->
        if pdf_path?(path) do
          nil
        else
          case fetch_content(url) do
            content when is_binary(content) -> Html2Markdown.convert(content)
            {:error, _reason} = error -> error
          end
        end

      _invalid_or_non_https ->
        nil
    end
  end

  # A URI with no path (e.g. "https://example.com") has no extension and is
  # not a PDF; guarding on nil avoids Path.extname/1 raising.
  defp pdf_path?(nil), do: false
  defp pdf_path?(path), do: Path.extname(path) == ".pdf"

  # Returns the response body on HTTP 200, otherwise an `{:error, _}` tuple
  # wrapping the response or transport error.
  defp fetch_content(url) do
    case Req.get(url) do
      {:ok, %Req.Response{status: 200, body: content}} -> content
      {:ok, response} -> {:error, response}
      {:error, error} -> {:error, error}
    end
  end

  # Trims page markdown to the extractor model's context window.
  defp preprocess_webpage_content(content) do
    # Phi 3 estimated token length (with some wiggle-room): (string length) / 4
    max_token_length = 85_000 * 4
    maybe_trim_to_context_limit(content, max_token_length)
  end

  # Content fits the budget: pass through untouched.
  defp maybe_trim_to_context_limit(content, max_token_length)
       when is_binary(content) and byte_size(content) <= max_token_length do
    content
  end

  # Over budget: keep roughly the first `max_token_length` characters. This is
  # a heuristic cut (graphemes vs. the byte-size guard) — see the TODO about
  # recursively splitting long context instead of truncating it.
  defp maybe_trim_to_context_limit(content, max_token_length) when is_binary(content) do
    content
    |> String.slice(0, max_token_length)
    |> String.trim()
  end

  # Parses Brave's ISO-8601 "page_age" into a NaiveDateTime; nil when absent
  # or unparseable.
  defp cast_page_age(date_string) when is_binary(date_string) do
    case NaiveDateTime.from_iso8601(date_string) do
      {:ok, parsed_date} -> parsed_date
      {:error, _error} -> nil
    end
  end

  defp cast_page_age(_date_string), do: nil

  # Renders a page age as human-readable text ("3 days", "2 months", …) for
  # the combining prompt. Approximate: 30-day months, 365-day years.
  defp calculate_page_age(nil), do: "Unknown age"

  defp calculate_page_age(%NaiveDateTime{} = page_age) do
    total_days =
      NaiveDateTime.utc_now()
      |> NaiveDateTime.diff(page_age, :second)
      |> div(86_400)

    cond do
      total_days < 60 ->
        "#{total_days} " <> Inflex.inflect("day", total_days)

      total_days < 365 ->
        months = div(total_days, 30)
        "#{months} " <> Inflex.inflect("month", months)

      true ->
        years = div(total_days, 365)
        "#{years} " <> Inflex.inflect("year", years)
    end
  end
end
defmodule Requests.Brave do
  @moduledoc """
  Web search using Brave
  Docs: https://api.search.brave.com/app/documentation/web-search/get-started
  """

  @brave_search_url "https://api.search.brave.com/res/v1/web/search"

  # NOTE(review): compile_env/2 freezes the key into the compiled artifact;
  # use Application.fetch_env!/2 at call time if the key must be
  # configurable per release without recompiling.
  @brave_api_key Application.compile_env(:your_app, :brave_api_key)

  @headers [
    {"Accept", "application/json"},
    {"Accept-Encoding", "gzip"},
    {"X-Subscription-Token", @brave_api_key}
  ]

  @doc """
  Runs a Brave web search for `query`.

  Returns the decoded response body on HTTP 200, or `{:error, response}` /
  `{:error, reason}` on a non-200 status or transport failure. `count` caps
  the number of results; `result_filter` selects which response sections
  Brave includes.
  """
  def search(query, count \\ 20, result_filter \\ "query, web") do
    params = %{q: query, result_filter: result_filter, count: count}

    # Route through get/2 (previously defined but unused, which the compiler
    # warns about) so all Brave requests share the same headers.
    case get(@brave_search_url, params) do
      {:ok, %Req.Response{status: 200, body: body}} ->
        body

      {:ok, %Req.Response{} = response} ->
        {:error, response}

      {:error, reason} ->
        {:error, reason}
    end
  end

  # Shared authenticated GET against the Brave API.
  defp get(url, params) do
    Req.get(url, headers: @headers, params: params)
  end
end
@bogdansolga
Copy link

Hello, @cpursley!

I have (accidentally / incidentally) found your Elixir files, and I consider them very useful.

Do you have (or know of) any 'how to' guide for using them, as a non-Elixir user? :-)

@cpursley
Copy link
Author

cpursley commented Jul 10, 2024

Hi @bogdansolga I think the most interesting thing are the prompts. So you could borrow that and use Brave Search API to build your own version. You might even be able to drop this code into ChatGPT or Claude and ask it to rewrite it in your preferred language (just tell it which LLM adapter and http library to use).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment