Skip to content

Instantly share code, notes, and snippets.

@sax
Last active July 7, 2024 16:21
Show Gist options
  • Save sax/2028fa6f8059cc4e0336fb890380f267 to your computer and use it in GitHub Desktop.
Save sax/2028fa6f8059cc4e0336fb890380f267 to your computer and use it in GitHub Desktop.
Elixir OpenTelemetry LiveView / crash tracking
import Config
## ....
config :logger,
backends: [
:console,
OpenTelemetryCrashLogger
]
config :logger, :open_telemetry_crash_logger, level: :error
defmodule MyApp.OpenTelemetry do
@moduledoc """
Configures OpenTelemetry to work with Phoenix/LiveView.
"""
@dialyzer {:nowarn_function, connection_status: 1}
@dialyzer {:nowarn_function, create_child_span: 3}
@dialyzer {:nowarn_function, create_parent_ctx: 1}
@dialyzer {:nowarn_function, handle_exception: 4}
@dialyzer {:nowarn_function, handle_cast: 2}
@dialyzer {:nowarn_function, get_peer_data: 1}
@dialyzer {:nowarn_function, get_user_agent: 1}
@dialyzer {:nowarn_function, open_child_span: 4}
use GenServer
require Logger
require OpenTelemetry.Tracer
alias OpenTelemetry.Ctx
alias OpenTelemetry.Span
alias OpenTelemetry.Tracer
alias OpentelemetryPhoenix.Reason
def start_link(args) do
GenServer.start_link(__MODULE__, args, name: __MODULE__)
end
def init(_args) do
setup()
{:ok, nil}
end
@doc """
Sets up telemetry attachments. Some bindings are registered via `OpentelemetryPhoenix`
and `OpentelemetryEcto`, but since neither of those handle LiveView we attach our own
bindings for those events.
Some bindings are attached before the library code, so that we can add some normalized
attributes across most events.
"""
def setup do
#### exception attachments need to be registered *before* we setup
#### OpentelemetryPhoenix or OpentelemetryEcto, so we can alter
#### the OT spans before those libraries end any spans
:telemetry.attach_many(
"opentelemetry-error-handler",
[
[:phoenix, :endpoint, :exception],
[:phoenix, :router_dispatch, :exception]
],
&handle_errors/4,
[]
)
OpentelemetryPhoenix.setup()
OpentelemetryEcto.setup([:my_app, :repo], time_unit: :millisecond)
#### start attachments need to be registered *after* we setup
#### OpentelemetryPhoenix or OpentelemetryEcto, so OT spans are already
#### created for the current process.
:telemetry.attach_many(
"opentelemetry-phoenix-handler",
[
[:my_app, :endpoint, :start],
[:phoenix, :endpoint, :start],
[:phoenix, :router_dispatch, :start],
[:phoenix, :error_rendered]
],
&add_ot_span_to_logger/4,
[]
)
#### Our custom telemetry handlers operate outside the scope of the
#### external libraries, so can be attached in any order.
:telemetry.attach_many(
"opentelemetry-exception-handler",
[
[:my_app, :opentelemetry, :exit],
[:phoenix, :live_view, :handle_event, :exception],
[:phoenix, :live_view, :mount, :exception]
],
&handle_exception/4,
[]
)
:telemetry.attach_many(
"opentelemetry-start-handler",
[
[:phoenix, :live_view, :handle_event, :start],
[:phoenix, :live_view, :mount, :start]
],
&open_child_span/4,
[]
)
:telemetry.attach_many(
"opentelemetry-stop-handler",
[
[:phoenix, :live_view, :handle_event, :stop],
[:phoenix, :live_view, :mount, :stop]
],
&handle_success/4,
[]
)
end
@doc """
Given a telemetry event signaling that an action has begun, open a new child span.
Makes sure that a parent span is either created, or attached to, so that events appear
as siblings in a single trace spanning the LiveView session.
Note that child spans pull attributes off of the socket that are only present if the
`:connect_info` is configured to include them in the Endpoint:
socket "/live", Phoenix.LiveView.Socket,
websocket: [
connect_info: [
:peer_data,
:trace_context_headers,
:user_agent,
:x_headers,
session: @session_options
]
]
"""
def open_child_span([:phoenix, :live_view, :handle_event, :start], _payload, meta, _config) do
get_parent_ctx()
|> create_child_span("HANDLE_EVENT #{meta.event}", meta.socket)
end
def open_child_span([:phoenix, :live_view, :mount, :start], _payload, meta, _config) do
meta.socket
|> create_parent_ctx()
|> create_child_span("MOUNT #{to_module(meta.socket.view)}", meta.socket)
end
defp create_child_span(parent_ctx, span_name, socket) do
new_ctx = Tracer.start_span(parent_ctx, span_name, %{kind: :SERVER})
_ = Tracer.set_current_span(new_ctx)
spans =
Logger.metadata()
|> Keyword.get(:ot_spans, [])
|> List.insert_at(0, new_ctx)
Logger.metadata(ot_spans: spans)
peer_data = get_peer_data(socket)
user_agent = get_user_agent(socket)
# peer_ip = Map.get(peer_data, :address)
attributes = [
# "http.client_ip": client_ip(conn),
# "http.flavor": http_flavor(adapter),
"http.host": socket.host_uri.host,
# "http.method": conn.method,
"http.scheme": socket.host_uri.scheme,
# "http.target": conn.request_path,
"http.user_agent": user_agent,
"live_view.connection_status": connection_status(socket),
# "net.host.ip": to_string(:inet_parse.ntoa(conn.remote_ip)),
"net.host.port": socket.host_uri.port,
# "net.peer.ip": to_string(:inet_parse.ntoa(peer_ip)),
"net.peer.port": peer_data.port,
"net.transport": :"IP.TCP"
]
Span.set_attributes(new_ctx, attributes)
end
defp create_parent_ctx(socket) do
parent_span = Tracer.start_span("LIVE #{to_module(socket.view)}", %{kind: :SERVER})
parent_ctx = Tracer.set_current_span(Ctx.new(), parent_span)
_prev_ctx = Ctx.attach(parent_ctx)
Span.end_span(parent_span)
Process.put(:ot_parent_ctx, parent_ctx)
parent_ctx
end
defp get_parent_ctx do
parent_ctx = Process.get(:ot_parent_ctx)
_prev_ctx = Ctx.attach(parent_ctx)
parent_ctx
end
@doc """
Caught exits are cast to ourselves, so that we do not interrupt the logger.
"""
def handle_cast({:caught_exit, _payload, %{spans: spans, reason: reason, stacktrace: stacktrace}}, state) do
crash_attrs =
Keyword.merge(
[
type: :exit,
stacktrace: Exception.format_stacktrace(stacktrace)
],
Reason.normalize(reason)
)
for span_ctx <- spans do
Span.set_attribute(span_ctx, :status, :crash)
Span.add_event(span_ctx, "crash", crash_attrs)
Span.set_status(span_ctx, OpenTelemetry.status(:Error, "Error"))
OpenTelemetry.Span.end_span(span_ctx)
end
{:noreply, state}
end
@doc """
Intended for exceptions that happen during spans that have been opened by OpentelemetryPhoenix or OpentelemetryEcto.
Those libraries do not put a high-level `:status` key in the data, which we do for our spans to normalize across different
event types.
"""
def handle_errors(_event, _payload, _metadata, _config) do
span_ctx = Tracer.current_span_ctx()
Span.set_attribute(span_ctx, :status, :error)
Span.end_span(span_ctx)
end
@doc """
Catch message around exits and exceptions, to make sure that spans are marked with crash/exception status and
closed.
"""
def handle_exception([:my_app, :opentelemetry, :exit], payload, metadata, _config) do
GenServer.cast(__MODULE__, {:caught_exit, payload, metadata})
end
def handle_exception([:phoenix, :live_view, _, :exception], _payload, %{kind: kind, reason: reason, stacktrace: stacktrace}, _config) do
exception_attrs =
Keyword.merge(
[
type: kind,
stacktrace: Exception.format_stacktrace(stacktrace)
],
Reason.normalize(reason)
)
status = OpenTelemetry.status(:Error, "Error")
span_ctx = Tracer.current_span_ctx()
Span.set_attribute(span_ctx, :status, :error)
Span.add_event(span_ctx, "exception", exception_attrs)
Span.set_status(span_ctx, status)
Span.end_span(span_ctx)
pop_span_ctx()
end
@doc """
Set status to `:ok` when a `:stop` event fires, which suggests that whatever happened
was successful, ie did not crash or raise.
"""
def handle_success([:phoenix, :live_view, _, :stop], _payload, _meta, _config) do
span_ctx = Tracer.current_span_ctx()
Span.set_attribute(span_ctx, :status, :ok)
Span.end_span(span_ctx)
pop_span_ctx()
end
@doc """
When crashes or exits occur, rather than exceptions, no telemetry fires to close open
spans. To get around this, we keep a list of open spans in the `Logger.metadata`, and
check for crashes in a `OpenTelemetryCrashLogger`.
If a crash occurs where any `:ot_spans` live in the metadata, a `[:my_app, :opentelemetry, :exit]`
telemetry event is fired, which gets picked up by our `handle_exception` binding.
"""
def add_ot_span_to_logger(event, _payload, _meta, _config) do
Tracer.current_span_ctx()
|> case do
:undefined ->
Logger.warn("[#{__MODULE__}] (#{inspect(event)}) expected current process have an OpenTelemetry span, but does not")
span_ctx ->
spans =
Logger.metadata()
|> Keyword.get(:ot_spans, [])
|> List.insert_at(0, span_ctx)
Logger.metadata(ot_spans: spans)
end
end
#### See if any of this info is present on a socket in the context
#### of GCP / DO.
# defp client_ip(%{remote_ip: remote_ip} = conn) do
# case Plug.Conn.get_req_header(conn, "x-forwarded-for") do
# [] ->
# to_string(:inet_parse.ntoa(remote_ip))
# [client | _] ->
# client
# end
# end
defp pop_span_ctx do
spans =
Logger.metadata()
|> Keyword.get(:ot_spans, [])
|> case do
[_span | rest] -> rest
[] -> []
end
Logger.metadata(ot_spans: spans)
spans
|> case do
[parent | _] -> parent
_ -> :undefined
end
|> Tracer.set_current_span()
end
defp get_peer_data(%{private: %{connect_info: %{peer_data: peer_data}}}), do: peer_data
defp get_peer_data(_), do: %{port: nil, address: nil, ssl_cert: nil}
defp get_user_agent(%{private: %{connect_info: %{user_agent: user_agent}}}), do: user_agent
defp get_user_agent(_), do: ""
defp connection_status(%{connected?: true}), do: "connected"
defp connection_status(%{connected?: false}), do: "disconnected"
defp to_module(module) do
module
|> to_string()
|> case do
"Elixir." <> name -> name
erlang_module -> ":#{erlang_module}"
end
end
end
defmodule OpenTelemetryCrashLogger do
@moduledoc """
Try to capture exits and report them in OpenTelemetry.
References:
* https://github.com/elixir-lang/elixir/blob/v1.11.3/lib/logger/lib/logger/backends/console.ex
"""
@behaviour :gen_event
defstruct level: nil
def init(__MODULE__), do: init({__MODULE__, :open_telemetry_crash_logger})
def init({__MODULE__, config_name}) do
config = Application.get_env(:logger, config_name)
{:ok, new(config, %__MODULE__{})}
end
def handle_call({:configure, _options}, state) do
{:ok, :ok, state}
end
def handle_event({:info, _, _}, state), do: {:ok, state}
def handle_event({:error, _pid, {Logger, _, _timestamp, metadata}}, state) do
case Keyword.get(metadata, :crash_reason) do
nil ->
{:ok, state}
{crash_reason, stacktrace} ->
case Keyword.get(metadata, :ot_spans) do
nil ->
{:ok, state}
spans when is_list(spans) ->
:telemetry.execute([:my_app, :opentelemetry, :exit], %{}, %{spans: spans, reason: crash_reason, stacktrace: stacktrace})
{:ok, state}
end
end
{:ok, state}
end
def handle_event({level, _gl, _thing} = _event, state) do
%{level: log_level} = state
if meet_level?(level, log_level) do
{:ok, state}
else
{:ok, state}
end
end
def handle_event(:flush, state) do
{:ok, flush(state)}
end
def handle_event(_msg, state) do
{:ok, state}
end
def handle_info({:io_reply, _ref, _msg}, state), do: {:ok, state}
def handle_info({:EXIT, _pid, _reason}, state), do: {:ok, state}
defp flush(state), do: state
defp meet_level?(_lvl, nil), do: true
defp meet_level?(lvl, min), do: Logger.compare_levels(lvl, min) != :lt
defp new(config, state) do
level = Keyword.get(config, :level, :error)
%{
state
| level: level
}
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment