@garthk
Last active July 7, 2020 23:42
Using telemetry to capture OpenCensus trace span in Elixir
defmodule TelemetryTracingExperimentTest do
use ExUnit.Case, async: true
@doc """
Dump a variable to standard output, ANSI formatted, and pretty.
"""
def dump(v) do
opts = IEx.Config.inspect_opts()
[:reset, "\n", get_in(opts, [:syntax_colors, :reset]) || :reset, inspect(v, opts), :reset]
|> IO.ANSI.format()
|> IO.puts()
end
@doc """
Dump a string back out, formatted as the function call.
"""
def explain(s) do
opts = IEx.Config.inspect_opts()
[
:reset,
"\nexplain ",
get_in(opts, [:syntax_colors, :string]) || :reset,
"\"\"\"",
for line <- s |> String.trim_trailing() |> String.split("\n") do
["\n", line]
end,
"\n\"\"\"",
:reset
]
|> IO.ANSI.format()
|> IO.puts()
end
@doc """
Dump some code to standard output, then run it, and `dump/1` its result also.
"""
defmacro loudly(do: block) do
code =
Macro.to_string(
quote generated: true do
loudly do
unquote(block)
end
end
)
quote do
IO.puts(["\n", unquote(code)])
dump(unquote(block))
end
end
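# Example: `loudly do: Enum.sum(1..3)` prints the quoted call, then dumps its result, 6.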
@tag :experiments
test "the whole thing" do
explain("""
How long is a second, in :timer native units? Let's measure a tenth of a second:
""")
loudly do
{d, :ok} = :timer.tc(fn -> 1 |> :timer.seconds() |> div(10) |> :timer.sleep() end)
d * 10
end
explain("""
1_000_000 and change, right? Looks like microseconds. Let's shift it to milliseconds.
If we get 100, we're good:
""")
loudly do
System.convert_time_unit(d, :microsecond, :millisecond)
end
explain("""
How many times can we call :telemetry.span/3 in a second if there are no handlers?
First, let's enforce our assumptions:
""")
loudly do
event_prefix = [:d597a6960262870e]
for %{id: id} <- :telemetry.list_handlers(event_prefix), do: :telemetry.detach(id)
end
explain("""
If we separate out, as much as possible:
* The code being measured (`{:ok, arg}`)
* The code instrumenting it with `:telemetry.span/3` (the rest of `measure_me`)
* The benchmark (the call to `:timer.tc/1`)
... and measure 10K results, we'll get a rough idea of the overhead:
""")
loudly do
measure_me = fn arg ->
ref = make_ref()
start_and_exception_metadata = %{arg: arg, ref: ref}
:telemetry.span(event_prefix, start_and_exception_metadata, fn ->
# THIS IS OUR ONLY LINE OF BUSINESS LOGIC CODE:
result = {:ok, arg}
# Back to the overhead:
stop_metadata = %{arg: arg, ref: ref, result: result}
{result, stop_metadata}
end)
end
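# Each measure_me call makes :telemetry.span/3 emit (telemetry ~> 0.4.2, where span/3 was added):
#   event_prefix ++ [:start]      measurements: %{system_time: ...}
#   event_prefix ++ [:stop]       measurements: %{duration: ...} (native units)
#   event_prefix ++ [:exception]  same, plus kind/reason/stacktrace in the metadata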
count = 10_000
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
assert length(result) == count
assert hd(result) == {:ok, 1}
assert result |> Enum.reverse() |> hd() == {:ok, count}
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain("""
~900K per second on a server CPU? Library vendors should feel safe sprinkling `:telemetry`
about. Anyone attaching handlers should think of the extra impact, though. What if there's a
handler attached to :start?
""")
loudly do
handler_id = make_ref()
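# A do-nothing handler: it only returns an atom, so we're mostly measuring dispatch overhead.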
handler = fn _n, _m10s, _m6a, _c -> :... end
:telemetry.attach(handler_id, event_prefix ++ [:start], handler, nil)
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
:telemetry.detach(handler_id)
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain("""
~800K calls per second isn't bad, either.
How about three handlers, one of which is almost never called?
""")
loudly do
handler_id = make_ref()
handler = fn _n, _m10s, _m6a, _c -> :... end
# attach ids must be unique, so key each handler by event
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handler, nil)
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handler, nil)
:telemetry.attach({handler_id, :exception}, event_prefix ++ [:exception], handler, nil)
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain("""
Three handlers that try to start and stop spans?
""")
loudly do
handler_id = make_ref()
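# Map telemetry's :start/:stop/:exception events onto OpenCensus process-context spans.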
handle_start = fn _n, _m10s, %{ref: _}, _c ->
:ocp.with_child_span("benchmark")
end
handle_stop = fn _n, _m10s, %{ref: _}, _c ->
:ocp.set_status(0, "OK")
:ocp.finish_span()
end
handle_exception = fn _n, _m10s, %{ref: _}, _c ->
:ocp.set_status(2, "FAIL")
:ocp.finish_span()
end
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil)
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil)
:telemetry.attach(
{handler_id, :exception},
event_prefix ++ [:exception],
handle_exception,
nil
)
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain("""
~50K? Ouch! Not bad if we're wrapping something taking 1ms (~2% overhead?) or more,
but tracing is definitely heavier than telemetry.
We got ahead of ourselves, though. How about if we did tracing directly?
""")
loudly do
self_tracing = fn arg ->
:ocp.with_child_span("benchmark")
try do
{:ok, arg}
after
:ocp.set_status(0, "OK")
:ocp.finish_span()
end
end
{d, _} = :timer.tc(fn -> for i <- 1..count, do: self_tracing.(i) end)
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain("""
Faster than doing it via :telemetry.span/3, but not that much faster. Back to the original
version, let's get defensive. TODO explain concerns.
""")
loudly do
pdict_key = :d597a6960262870e
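# Stash {previous_span_ctx, span_ctx} per telemetry ref under pdict_key, so handle_stop can
# restore exactly the context handle_start replaced.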
handler_id = make_ref()
handle_start = fn _n, _m10s, %{ref: ref}, _c ->
previous_span_ctx = :ocp.current_span_ctx()
span_ctx = :oc_trace.start_span("benchmark", previous_span_ctx)
:ocp.with_span_ctx(span_ctx)
state = Process.get(pdict_key, %{})
state = put_in(state, [ref], {previous_span_ctx, span_ctx})
Process.put(pdict_key, state)
end
handle_stop = fn _n, _m10s, %{ref: ref}, _c ->
state = Process.get(pdict_key, %{})
{previously, state} = Map.pop(state, ref)
Process.put(pdict_key, state)
case {previously, :ocp.current_span_ctx()} do
{nil, span_ctx} ->
# can't be defensive; started without ref?
previous_span_ctx = :oc_trace.parent_span_ctx(span_ctx)
:oc_trace.set_status(0, "OK", span_ctx)
:oc_trace.finish_span(span_ctx)
:ocp.with_span_ctx(previous_span_ctx)
{{previous_span_ctx, span_ctx}, span_ctx} ->
# nothing bad happened!
:oc_trace.set_status(0, "OK", span_ctx)
:oc_trace.finish_span(span_ctx)
:ocp.with_span_ctx(previous_span_ctx)
{{previous_span_ctx, span_ctx}, other_span_ctx} ->
IO.warn("span mismatch")
# expecting more push than pop, let's wind up the other one
:oc_trace.set_status(2, "UNKNOWN", other_span_ctx)
:oc_trace.finish_span(other_span_ctx)
# ... and then back to normal:
:oc_trace.set_status(0, "OK", span_ctx)
:oc_trace.finish_span(span_ctx)
:ocp.with_span_ctx(previous_span_ctx)
end
end
handle_exception = fn _n, _m10s, %{ref: _}, _c ->
# TODO also do all that defensive span popping:
:ocp.set_status(2, "FAIL")
:ocp.finish_span()
end
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil)
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil)
:telemetry.attach(
{handler_id, :exception},
event_prefix ++ [:exception],
handle_exception,
nil
)
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
end
end
defmodule ViaTelemetryTest do
@moduledoc "Tracing hooked up with Telemetry, OpenTelemetry version."
use ExUnit.Case, async: true
alias OpenTelemetry.Tracer
require OpenTelemetry.Tracer
@doc """
Dump a variable to standard output, ANSI formatted, and pretty.
"""
def dump(v) do
opts = IEx.Config.inspect_opts()
[:reset, "\n", get_in(opts, [:syntax_colors, :reset]) || :reset, inspect(v, opts), :reset]
|> IO.ANSI.format()
|> IO.puts()
end
@doc """
Dump a string back out, formatted as the function call.
"""
def explain(s) do
opts = IEx.Config.inspect_opts()
[
:reset,
"\nexplain ",
get_in(opts, [:syntax_colors, :string]) || :reset,
"\"\"\"",
for line <- s |> String.trim_trailing() |> String.split("\n") do
["\n", line]
end,
"\n\"\"\"",
:reset
]
|> IO.ANSI.format()
|> IO.puts()
end
@doc """
Dump some code to standard output, then run it, and `dump/1` its result also.
"""
defmacro loudly(do: block) do
code =
Macro.to_string(
quote generated: true do
loudly do
unquote(block)
end
end
)
quote do
IO.puts(["\n", unquote(code)])
unquote(block)
|> dump()
end
end
@tag :experiments
test "the whole thing" do
explain """
TODO explain why I'm checking this
"""
loudly do
:opentelemetry.get_tracer(__MODULE__)
end
explain """
How long is a second, in :timer native units? Let's measure a tenth of a second:
"""
loudly do
{d, :ok} = :timer.tc(fn -> 1 |> :timer.seconds() |> div(10) |> :timer.sleep() end)
d * 10
end
explain """
1_000_000 and change, right? Looks like microseconds. Let's shift it to milliseconds.
If we get 100, we're good:
"""
loudly do
System.convert_time_unit(d, :microsecond, :millisecond)
end
explain """
How many times can we call :telemetry.span/3 in a second if there are no handlers?
First, let's enforce our assumptions:
"""
loudly do
event_prefix = [:d597a6960262870e]
for %{id: id} <- :telemetry.list_handlers(event_prefix), do: :telemetry.detach(id)
end
explain """
If we separate out, as much as possible:
* The code being measured (`{:ok, arg}`)
* The code instrumenting it with `:telemetry.span/3` (the rest of `measure_me`)
* The benchmark (the call to `:timer.tc/1`)
... and measure 10K results, we'll get a rough idea of the overhead:
"""
loudly do
measure_me = fn arg ->
ref = make_ref()
start_and_exception_metadata = %{arg: arg, ref: ref}
:telemetry.span(event_prefix, start_and_exception_metadata, fn ->
# THIS IS OUR ONLY LINE OF BUSINESS LOGIC CODE:
result = {:ok, arg}
# Back to the overhead:
stop_metadata = %{arg: arg, ref: ref, result: result}
{result, stop_metadata}
end)
end
count = 10_000
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
assert length(result) == count
assert hd(result) == {:ok, 1}
assert result |> Enum.reverse() |> hd() == {:ok, count}
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain """
~900K per second on a server CPU? Library vendors should feel safe sprinkling `:telemetry`
about. Anyone attaching handlers should think of the extra impact, though. What if there's a
handler attached to :start?
"""
loudly do
handler_id = make_ref()
handler = fn _n, _m10s, _m6a, _c -> :... end
:telemetry.attach(handler_id, event_prefix ++ [:start], handler, nil)
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
:telemetry.detach(handler_id)
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain """
~800K calls per second isn't bad, either.
How about three handlers, one of which is almost never called?
"""
loudly do
handler_id = make_ref()
handler = fn _n, _m10s, _m6a, _c -> :... end
# attach ids must be unique, so key each handler by event
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handler, nil)
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handler, nil)
:telemetry.attach({handler_id, :exception}, event_prefix ++ [:exception], handler, nil)
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain """
Three handlers that try to start and stop spans?
"""
loudly do
handler_id = make_ref()
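# Map the span events onto OpenTelemetry spans via Tracer.start_span/end_span.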
handle_start = fn _n, _m10s, %{ref: _}, _c ->
Tracer.start_span("benchmark")
end
handle_stop = fn _n, _m10s, %{ref: _}, _c ->
Tracer.end_span()
end
handle_exception = fn _n, _m10s, %{ref: _}, _c ->
Tracer.set_status("Unknown")
Tracer.end_span()
end
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil)
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil)
:telemetry.attach(
{handler_id, :exception},
event_prefix ++ [:exception],
handle_exception,
nil
)
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
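# Sanity check: the benchmark shouldn't have left a span behind as the current context.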
loudly do
{:embarrassing_trailing_span_ctx?, OpenTelemetry.Tracer.current_span_ctx()}
end
explain """
~50K? Ouch! Not bad if we're wrapping something taking 1ms (~2% overhead?) or more,
but tracing is definitely heavier than telemetry.
We got ahead of ourselves, though. How about if we did tracing directly?
"""
loudly do
self_tracing = fn arg ->
Tracer.start_span("benchmark")
try do
{:ok, arg}
after
Tracer.end_span()
end
end
{d, result} = :timer.tc(fn -> for i <- 1..count, do: self_tracing.(i) end)
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
loudly do
{:embarrassing_trailing_span_ctx?, OpenTelemetry.Tracer.current_span_ctx()}
end
explain """
Faster than doing it via :telemetry.span/3, but not that much faster. Back to the original
version, let's get defensive. TODO explain concerns.
"""
loudly do
# :ok = :dbg.stop_clear()
# {:ok, _} =
# :dbg.tracer(
# :process,
# {fn message, :state ->
# dump(message)
# :state
# end, :state}
# )
for m <- [:ot_tracer, Process], do: :dbg.tp(m, [{:_, [], [{:return_trace}]}])
:dbg.p(:all, :c)
pdict_key = :d597a6960262870e
handler_id = make_ref()
tracer = :opentelemetry.get_tracer(__MODULE__)
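# Fetch the tracer once, outside the handlers: handle_start and handle_stop drive :ot_tracer
# with it directly so we can manage the span stack ourselves.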
handle_start = fn _n, _m10s, %{ref: ref}, _c ->
# get the previous span_ctx manually
# TODO find way to reach create_span to avoid repeating :ot_ctx.get_value/2
previous_span_ctx = :ot_tracer.current_span_ctx(tracer)
span_ctx = :ot_tracer.start_span(tracer, "benchmark", %{})
state = Process.get(pdict_key, %{})
state = put_in(state, [ref], {previous_span_ctx, span_ctx})
Process.put(pdict_key, state)
end
handle_stop = fn _n, _m10s, %{ref: ref}, _c ->
state = Process.get(pdict_key, %{})
{previously, state} = Map.pop(state, ref)
Process.put(pdict_key, state)
case {previously, Tracer.current_span_ctx()} do
{nil, span_ctx} ->
IO.puts("NO PREVIOUS #{__ENV__.file}:#{__ENV__.line}")
# can't be defensive; started without ref?
:ot_tracer.end_span(tracer, span_ctx)
# automatically sets parent?
# TODO remove crasher:
0 = 1
{{previous_span_ctx, span_ctx}, span_ctx} ->
# nothing bad happened! end span, implicitly taking previous_span_ctx
:ot_tracer.end_span(tracer, span_ctx)
# assert :ot_tracer.current_span_ctx(tracer) == previous_span_ctx
{{previous_span_ctx, span_ctx}, other_span_ctx} ->
IO.puts("MISMATCH #{__ENV__.file}:#{__ENV__.line}")
IO.warn("span mismatch")
# expecting more push than pop, let's wind up the other one
:ot_span.set_status(tracer, other_span_ctx, "Unknown")
:ot_tracer.end_span(tracer, other_span_ctx)
# ... and then back to normal:
:ot_tracer.end_span(tracer, span_ctx)
# :ot_tracer.set_span(tracer, previous_span_ctx)
assert :ot_tracer.current_span_ctx(tracer) == previous_span_ctx
end
end
handle_exception = fn _n, _m10s, %{ref: _}, _c ->
# TODO also do all that defensive span popping:
Tracer.set_status("Unknown")
Tracer.end_span()
end
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil)
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil)
:telemetry.attach(
{handler_id, :exception},
event_prefix ++ [:exception],
handle_exception,
nil
)
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
:ok = :dbg.stop_clear()
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
end
end
garthk commented Jul 5, 2020

Updated with defensive span popping.

garthk commented Jul 7, 2020
