Skip to content

Instantly share code, notes, and snippets.

@jeregrine
Last active May 22, 2023 21:16
Show Gist options
  • Save jeregrine/a5c70dac5e6341d32153657922ea25fa to your computer and use it in GitHub Desktop.
Save jeregrine/a5c70dac5e6341d32153657922ea25fa to your computer and use it in GitHub Desktop.

Xmerl

Mix.install([
  {:tesla, "~> 1.7"},
  {:fast_xml, "~> 1.1"}
])

Data

# Number of feed entries to collect; passed as `n` to the parsers below.
num_posts = 4
# Body of the HTTP response for the feed; handed to the parsers unchanged.
doc = Tesla.get!("https://fly.io/phoenix-files/feed.xml").body

Recursive

defmodule FlyFeedRecursive do
  @moduledoc """
  Extracts entries from a parsed feed using explicit recursion.

  The document is parsed with `:fxml_stream.parse_element/1`, then the children
  of the root `feed` element are walked one at a time. Every `entry` element is
  turned into a map keyed by the names of its child elements.
  """

  @doc """
  Parses `doc` and collects entries until `n` of them (default `4`) were found.

  Entries are prepended while walking the feed, so the returned list is in
  reverse document order.
  """
  def parse(doc, n \\ 4) do
    parse_el(:fxml_stream.parse_element(doc), n)
  end

  @doc """
  Starts the walk over the children of a `feed` root element.
  """
  def parse_el({:xmlel, "feed", _attrs, children}, n), do: parse_el([], children, n)

  @doc """
  Walks the remaining nodes, accumulating converted entries in `acc`.

  Stops as soon as `acc` holds `n` entries or the node list is exhausted.
  Nodes that are not `entry` elements are skipped.
  """
  def parse_el(acc, _rest, n) when length(acc) == n, do: acc
  def parse_el(acc, [], _n), do: acc

  def parse_el(acc, [{:xmlel, "entry", _attrs, fields} | rest], n) do
    # Non-element children (e.g. whitespace cdata) do not match the generator
    # pattern and are dropped.
    converted =
      for {:xmlel, key, attrs, children} <- fields, into: %{} do
        {key, node_to_val(attrs, children)}
      end

    parse_el([converted | acc], rest, n)
  end

  def parse_el(acc, [_other | rest], n), do: parse_el(acc, rest, n)

  @doc """
  Converts an element's attributes and children into a value.

  A single character-data child yields its text. Otherwise the `href`
  attribute is returned when present, falling back to a map holding the raw
  attributes and children.
  """
  def node_to_val(attr, children)
  def node_to_val(_attr, [{:xmlcdata, text}]), do: text

  def node_to_val(attr, children) do
    has_href?(attr) || %{attr: attr, children: children}
  end

  @doc """
  Returns the value of the first `"href"` pair with a truthy value in `list`,
  or `nil` when there is none.
  """
  def has_href?(list) do
    Enum.find_value(list, fn
      {"href", href} -> href
      _other -> nil
    end)
  end
end

# Run the recursive parser on the fetched feed; entries come back newest-collected first.
FlyFeedRecursive.parse(doc, num_posts)

ReduceWhile

defmodule FlyFeed do
  @moduledoc """
  Extracts entries from a parsed feed using `Enum.reduce_while/3`.

  The document is parsed with `:fxml_stream.parse_element/1` and the children
  of the root `feed` element are folded into a list of entry maps.
  """

  @doc """
  Parses `doc` and collects entries until `n` of them (default `4`) were found.

  Entries are prepended as they are found, so the result is in reverse
  document order.
  """
  def parse(doc, n \\ 4) do
    parse_el(:fxml_stream.parse_element(doc), n)
  end

  @doc """
  Folds the children of a `feed` root element into a list of entry maps,
  halting once `n` entries were gathered.
  """
  def parse_el({:xmlel, "feed", _attrs, children}, n) do
    Enum.reduce_while(children, [], &collect(&1, &2, n))
  end

  # One reduction step: halt when enough entries were gathered, convert
  # `entry` elements, and pass over every other node.
  defp collect(_node, entries, n) when length(entries) == n, do: {:halt, entries}

  defp collect({:xmlel, "entry", _attrs, fields}, entries, _n) do
    {:cont, [handle_entry(fields) | entries]}
  end

  defp collect(_node, entries, _n), do: {:cont, entries}

  @doc """
  Builds a map from the child elements of an entry, keyed by element name.

  Children that are not elements are ignored.
  """
  def handle_entry(entry) do
    entry
    |> Enum.flat_map(fn
      {:xmlel, key, attrs, children} -> [{key, node_to_val(attrs, children)}]
      _other -> []
    end)
    |> Map.new()
  end

  @doc """
  Converts an element's attributes and children into a value.

  A single character-data child yields its text. Otherwise the `href`
  attribute is returned when present, falling back to a map holding the raw
  attributes and children.
  """
  def node_to_val(attr, children)
  def node_to_val(_attr, [{:xmlcdata, text}]), do: text

  def node_to_val(attr, children) do
    case has_href?(attr) do
      nil -> %{attr: attr, children: children}
      href -> href
    end
  end

  @doc """
  Returns the value of the first `"href"` pair with a truthy value in `list`,
  or `nil` when there is none.
  """
  def has_href?(list) do
    Enum.find_value(list, fn
      {"href", href} -> href
      _other -> nil
    end)
  end
end

# Same extraction as above, using the Enum.reduce_while/3 based module.
FlyFeed.parse(doc, num_posts)
@nbw
Copy link

nbw commented May 22, 2023

Here's my updated module for now:

defmodule FlyFeed do
  @moduledoc """
  Parses the XML output of Fly's Blogs (https://fly.io/blog/feed.xml)

  Credit to Jason Stiebs for most of the code.
  """

  @doc """
  Parses `doc` and returns the collected entries in document order.

  Collection stops once `n` entries (default `4`) were found. The document is
  parsed with `:fxml_stream.parse_element/1`.
  """
  def parse(doc, n \\ 4) do
    doc
    |> :fxml_stream.parse_element()
    |> parse_el(n)
    |> Enum.reverse()
  end

  @doc """
  Folds the children of a `feed` root element into a list of entry maps.

  Entries are prepended, so the list is in reverse document order; `parse/2`
  reverses it. Nodes other than `entry` elements are skipped.
  """
  def parse_el({:xmlel, "feed", _, doc}, n) do
    Enum.reduce_while(doc, [], fn
      _doc, acc when length(acc) == n ->
        {:halt, acc}

      {:xmlel, "entry", _, entry}, acc ->
        {:cont, [handle_entry(entry) | acc]}

      _, acc ->
        {:cont, acc}
    end)
  end

  @doc """
  Builds a map from the child elements of an entry, keyed by element name.

  Children that are not elements (for example character data between
  elements) are ignored.
  """
  def handle_entry(entry) do
    for {:xmlel, key, attr, child} <- entry, into: %{} do
      {key, node_to_val(attr, child)}
    end
  end

  @doc """
  Converts an element's attributes and children into a value.

  A single character-data child yields its text. Otherwise the result is a map
  holding every attribute except `"rel"`, plus one key per child element that
  has no attributes and exactly one character-data child.
  """
  def node_to_val(attr, children)
  def node_to_val(_, [{:xmlcdata, val}]), do: val

  def node_to_val(attr, children) do
    # ignore rel
    item =
      attr
      |> Enum.reject(fn {key, _val} -> key == "rel" end)
      |> Map.new()

    # Child values are merged on top of the attribute map, so a child element
    # overrides an attribute of the same name.
    for {:xmlel, key, [], [xmlcdata: val]} <- children, into: item do
      {key, val}
    end
  end
end

it reverses the final output, handles author name a bit better, removed the way we handle href, etc.. This is the output.

[
  %{
    "author" => %{"name" => "Mariusz Felisiak"},
    "content" => "[[content omitted]]...",
    "id" => "https://fly.io/blog/dry-template-rendering-with-context-processors/",
    "link" => %{
      "href" => "https://fly.io/blog/dry-template-rendering-with-context-processors/"
    },
    "published" => "2023-05-11T00:00:00+00:00",
    "thumbnail" => "https://fly.io/blog/2023-05-11/dry-template-rendering-with-context-processors-thumb.webp",
    "title" => "DRY: Template Rendering with Context Processors",
    "updated" => "2023-05-16T19:06:30+00:00"
  },
 ....
]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment