@garthk · Last active July 7, 2020 23:42
Using telemetry to capture OpenCensus trace span in Elixir
defmodule TelemetryTracingExperimentTest do
use ExUnit.Case, async: true
@doc """
Dump a variable to standard output, ANSI formatted, and pretty.
"""
def dump(v) do
opts = IEx.Config.inspect_opts()
[:reset, "\n", get_in(opts, [:syntax_colors, :reset]) || :reset, inspect(v, opts), :reset]
|> IO.ANSI.format()
|> IO.puts()
end
@doc """
Dump a string back out, formatted as the function call.
"""
def explain(s) do
opts = IEx.Config.inspect_opts()
[
:reset,
"\nexplain ",
get_in(opts, [:syntax_colors, :string]) || :reset,
"\"\"\"",
for line <- s |> String.trim_trailing() |> String.split("\n") do
["\n", line]
end,
"\n\"\"\"",
:reset
]
|> IO.ANSI.format()
|> IO.puts()
end
@doc """
Dump some code to standard output, then run it, and `dump/1` its result also.
"""
defmacro loudly(do: block) do
code =
Macro.to_string(
quote generated: true do
loudly do
unquote(block)
end
end
)
quote do
IO.puts(["\n", unquote(code)])
dump(unquote(block))
end
end
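# For illustration only (not used by the test): a call like `loudly do 1 + 1 end`
# expands to roughly
#
#   IO.puts(["\n", "loudly do\n  1 + 1\nend"])
#   dump(1 + 1)
#
# i.e. each block prints its own source, runs, and pretty-prints its result.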
@tag :experiments
test "the whole thing" do
explain("""
How long is a second, in the units :timer.tc/1 reports? Let's measure a tenth of a second:
""")
loudly do
{d, :ok} = :timer.tc(fn -> 1 |> :timer.seconds() |> div(10) |> :timer.sleep() end)
d * 10
end
explain("""
1_000_000 and change, right? Looks like microseconds. Let's shift it to milliseconds.
If we get 100, we're good:
""")
loudly do
System.convert_time_unit(d, :microsecond, :millisecond)
end
explain("""
How many times can we call :telemetry.span/3 in a second if there are no handlers?
First, let's enforce our assumptions:
""")
loudly do
event_prefix = [:d597a6960262870e]
for %{id: id} <- :telemetry.list_handlers(event_prefix), do: :telemetry.detach(id)
end
explain("""
If we separate out, as much as possible:
* The code being measured (`{:ok, arg}`)
* The code instrumenting it with `:telemetry.span/3` (the rest of `measure_me`)
* The benchmark (the call to `:timer.tc/1`)
... and measure 10K results, we'll get a rough idea of the overhead:
""")
loudly do
measure_me = fn arg ->
ref = make_ref()
start_and_exception_metadata = %{arg: arg, ref: ref}
:telemetry.span(event_prefix, start_and_exception_metadata, fn ->
# THIS IS OUR ONLY LINE OF BUSINESS LOGIC CODE:
result = {:ok, arg}
# Back to the overhead:
stop_metadata = %{arg: arg, ref: ref, result: result}
{result, stop_metadata}
end)
end
count = 10_000
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
assert length(result) == count
assert hd(result) == {:ok, 1}
assert result |> Enum.reverse() |> hd() == {:ok, count}
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain("""
~900K per second on a server CPU? Library vendors should feel safe sprinkling `:telemetry`
about. Anyone attaching handlers should think of the extra impact, though. What if there's a
handler attached to :start?
""")
loudly do
handler_id = make_ref()
handler = fn _n, _m10s, _m6a, _c -> :... end
:telemetry.attach(handler_id, event_prefix ++ [:start], handler, nil)
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
:telemetry.detach(handler_id)
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain("""
~800K calls per second isn't bad, either.
How about three handlers, one of which is almost never called?
""")
loudly do
handler_id = make_ref()
handler = fn _n, _m10s, _m6a, _c -> :... end
# Each attachment needs its own id; :telemetry.attach/4 rejects duplicates.
for event <- [:start, :stop, :exception] do
:telemetry.attach({handler_id, event}, event_prefix ++ [event], handler, nil)
end
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain("""
Three handlers that try to start and stop spans?
""")
loudly do
handler_id = make_ref()
handle_start = fn _n, _m10s, %{ref: _}, _c ->
:ocp.with_child_span("benchmark")
end
handle_stop = fn _n, _m10s, %{ref: _}, _c ->
:ocp.set_status(0, "OK")
:ocp.finish_span()
end
handle_exception = fn _n, _m10s, %{ref: _}, _c ->
:ocp.set_status(2, "FAIL")
:ocp.finish_span()
end
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil)
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil)
:telemetry.attach(
{handler_id, :exception},
event_prefix ++ [:exception],
handle_exception,
nil
)
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain("""
~50K? Ouch! Not bad if we're wrapping something taking 1ms (~2% overhead?) or more,
but tracing is definitely heavier than telemetry.
We got ahead of ourselves, though. How about if we did tracing directly?
""")
loudly do
self_tracing = fn arg ->
:ocp.with_child_span("benchmark")
try do
{:ok, arg}
after
:ocp.set_status(0, "OK")
:ocp.finish_span()
end
end
{d, _} = :timer.tc(fn -> for i <- 1..count, do: self_tracing.(i) end)
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain("""
Faster than doing it via :telemetry.span/3, but not that much faster. Back to the original
version, let's get defensive. TODO explain concerns.
""")
loudly do
pdict_key = :d597a6960262870e
handler_id = make_ref()
handle_start = fn _n, _m10s, %{ref: ref}, _c ->
previous_span_ctx = :ocp.current_span_ctx()
span_ctx = :oc_trace.start_span("benchmark", previous_span_ctx)
:ocp.with_span_ctx(span_ctx)
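# Remember both contexts, keyed by ref, so handle_stop can detect push/pop mismatches.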
pdict_key
|> Process.get(%{})
|> put_in([ref], {previous_span_ctx, span_ctx})
|> Process.put(pdict_key)
end
handle_stop = fn _n, _m10s, %{ref: ref}, _c ->
state = Process.get(pdict_key, %{})
{previously, state} = Map.pop(state, ref)
Process.put(pdict_key, state)
case {previously, :ocp.current_span_ctx()} do
{nil, span_ctx} ->
# can't be defensive; started without ref?
previous_span_ctx = :oc_trace.parent_span_ctx(span_ctx)
:oc_trace.set_status(0, "OK", span_ctx)
:oc_trace.finish_span(span_ctx)
:ocp.with_span_ctx(previous_span_ctx)
{{previous_span_ctx, span_ctx}, span_ctx} ->
# nothing bad happened!
:oc_trace.set_status(0, "OK", span_ctx)
:oc_trace.finish_span(span_ctx)
:ocp.with_span_ctx(previous_span_ctx)
{{previous_span_ctx, span_ctx}, other_span_ctx} ->
IO.warn("span mismatch")
# expecting more push than pop, let's wind up the other one
:oc_trace.set_status(2, "UNKNOWN", other_span_ctx)
:oc_trace.finish_span(other_span_ctx)
# ... and then back to normal:
:oc_trace.set_status(0, "OK", span_ctx)
:oc_trace.finish_span(span_ctx)
:ocp.with_span_ctx(previous_span_ctx)
end
end
handle_exception = fn _n, _m10s, %{ref: _}, _c ->
# TODO also do all that defensive span popping:
:ocp.set_status(2, "FAIL")
:ocp.finish_span()
end
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil)
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil)
:telemetry.attach(
{handler_id, :exception},
event_prefix ++ [:exception],
handle_exception,
nil
)
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
end
end
defmodule ViaTelemetryTest do
@moduledoc "Tracing hooked up with Telemetry, OpenTelemetry version."
use ExUnit.Case, async: true
alias OpenTelemetry.Tracer
require OpenTelemetry.Tracer
@doc """
Dump a variable to standard output, ANSI formatted, and pretty.
"""
def dump(v) do
opts = IEx.Config.inspect_opts()
[:reset, "\n", get_in(opts, [:syntax_colors, :reset]) || :reset, inspect(v, opts), :reset]
|> IO.ANSI.format()
|> IO.puts()
end
@doc """
Dump a string back out, formatted as the function call.
"""
def explain(s) do
opts = IEx.Config.inspect_opts()
[
:reset,
"\nexplain ",
get_in(opts, [:syntax_colors, :string]) || :reset,
"\"\"\"",
for line <- s |> String.trim_trailing() |> String.split("\n") do
["\n", line]
end,
"\n\"\"\"",
:reset
]
|> IO.ANSI.format()
|> IO.puts()
end
@doc """
Dump some code to standard output, then run it, and `dump/1` its result also.
"""
defmacro loudly(do: block) do
code =
Macro.to_string(
quote generated: true do
loudly do
unquote(block)
end
end
)
quote do
IO.puts(["\n", unquote(code)])
unquote(block)
|> dump()
end
end
@tag :experiments
test "the whole thing" do
explain """
TODO explain why I'm checking this
"""
loudly do
:opentelemetry.get_tracer(__MODULE__)
end
explain """
How long is a second, in the units :timer.tc/1 reports? Let's measure a tenth of a second:
"""
loudly do
{d, :ok} = :timer.tc(fn -> 1 |> :timer.seconds() |> div(10) |> :timer.sleep() end)
d * 10
end
explain """
1_000_000 and change, right? Looks like microseconds. Let's shift it to milliseconds.
If we get 100, we're good:
"""
loudly do
System.convert_time_unit(d, :microsecond, :millisecond)
end
explain """
How many times can we call :telemetry.span/3 in a second if there are no handlers?
First, let's enforce our assumptions:
"""
loudly do
event_prefix = [:d597a6960262870e]
for %{id: id} <- :telemetry.list_handlers(event_prefix), do: :telemetry.detach(id)
end
explain """
If we separate out, as much as possible:
* The code being measured (`{:ok, arg}`)
* The code instrumenting it with `:telemetry.span/3` (the rest of `measure_me`)
* The benchmark (the call to `:timer.tc/1`)
... and measure 10K results, we'll get a rough idea of the overhead:
"""
loudly do
measure_me = fn arg ->
ref = make_ref()
start_and_exception_metadata = %{arg: arg, ref: ref}
:telemetry.span(event_prefix, start_and_exception_metadata, fn ->
# THIS IS OUR ONLY LINE OF BUSINESS LOGIC CODE:
result = {:ok, arg}
# Back to the overhead:
stop_metadata = %{arg: arg, ref: ref, result: result}
{result, stop_metadata}
end)
end
count = 10_000
{d, result} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
assert length(result) == count
assert hd(result) == {:ok, 1}
assert result |> Enum.reverse() |> hd() == {:ok, count}
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain """
~900K per second on a server CPU? Library vendors should feel safe sprinkling `:telemetry`
about. Anyone attaching handlers should think of the extra impact, though. What if there's a
handler attached to :start?
"""
loudly do
handler_id = make_ref()
handler = fn _n, _m10s, _m6a, _c -> :... end
:telemetry.attach(handler_id, event_prefix ++ [:start], handler, nil)
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
:telemetry.detach(handler_id)
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain """
~800K calls per second isn't bad, either.
How about three handlers, one of which is almost never called?
"""
loudly do
handler_id = make_ref()
handler = fn _n, _m10s, _m6a, _c -> :... end
# Each attachment needs its own id; :telemetry.attach/4 rejects duplicates.
for event <- [:start, :stop, :exception] do
:telemetry.attach({handler_id, event}, event_prefix ++ [event], handler, nil)
end
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
explain """
Three handlers that try to start and stop spans?
"""
loudly do
handler_id = make_ref()
handle_start = fn _n, _m10s, %{ref: _}, _c ->
Tracer.start_span("benchmark")
end
handle_stop = fn _n, _m10s, %{ref: _}, _c ->
Tracer.end_span()
end
handle_exception = fn _n, _m10s, %{ref: _}, _c ->
Tracer.set_status("Unknown")
Tracer.end_span()
end
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil)
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil)
:telemetry.attach(
{handler_id, :exception},
event_prefix ++ [:exception],
handle_exception,
nil
)
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
loudly do
{:embarrassing_trailing_span_ctx?, OpenTelemetry.Tracer.current_span_ctx()}
end
explain """
~50K? Ouch! Not bad if we're wrapping something taking 1ms (~2% overhead?) or more,
but tracing is definitely heavier than telemetry.
We got ahead of ourselves, though. How about if we did tracing directly?
"""
loudly do
self_tracing = fn arg ->
Tracer.start_span("benchmark")
try do
{:ok, arg}
after
Tracer.end_span()
end
end
{d, _} = :timer.tc(fn -> for i <- 1..count, do: self_tracing.(i) end)
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
loudly do
{:embarrassing_trailing_span_ctx?, OpenTelemetry.Tracer.current_span_ctx()}
end
explain """
Faster than doing it via :telemetry.span/3, but not that much faster. Back to the original
version, let's get defensive. TODO explain concerns.
"""
loudly do
# :ok = :dbg.stop_clear()
# {:ok, _} =
# :dbg.tracer(
# :process,
# {fn message, :state ->
# dump(message)
# :state
# end, :state}
# )
for m <- [:ot_tracer, Process], do: :dbg.tp(m, [{:_, [], [{:return_trace}]}])
:dbg.p(:all, :c)
pdict_key = :d597a6960262870e
handler_id = make_ref()
tracer = :opentelemetry.get_tracer(__MODULE__)
handle_start = fn _n, _m10s, %{ref: ref}, _c ->
# get the previous span_ctx manually
# TODO find way to reach create_span to avoid repeating :ot_ctx.get_value/2
previous_span_ctx = :ot_tracer.current_span_ctx(tracer)
span_ctx = :ot_tracer.start_span(tracer, "benchmark", %{})
state = Process.get(pdict_key, %{})
state = put_in(state, [ref], {previous_span_ctx, span_ctx})
Process.put(pdict_key, state)
end
handle_stop = fn _n, _m10s, %{ref: ref}, _c ->
state = Process.get(pdict_key, %{})
{previously, state} = Map.pop(state, ref)
Process.put(pdict_key, state)
case {previously, Tracer.current_span_ctx()} do
{nil, span_ctx} ->
IO.puts("NO PREVIOUS #{__ENV__.file}:#{__ENV__.line}")
# can't be defensive; started without ref?
:ot_tracer.end_span(tracer, span_ctx)
# automatically sets parent?
# TODO remove crasher:
0 = 1
{{previous_span_ctx, span_ctx}, span_ctx} ->
# nothing bad happened! end span, implicitly taking previous_span_ctx
:ot_tracer.end_span(tracer, span_ctx)
# assert :ot_tracer.current_span_ctx(tracer) == previous_span_ctx
{{previous_span_ctx, span_ctx}, other_span_ctx} ->
IO.puts("MISMATCH #{__ENV__.file}:#{__ENV__.line}")
IO.warn("span mismatch")
# expecting more push than pop, let's wind up the other one
:ot_span.set_status(tracer, other_span_ctx, "Unknown")
:ot_tracer.end_span(tracer, other_span_ctx)
# ... and then back to normal:
:ot_tracer.end_span(tracer, span_ctx)
# :ot_tracer.set_span(tracer, previous_span_ctx)
assert :ot_tracer.current_span_ctx(tracer) == previous_span_ctx
end
end
handle_exception = fn _n, _m10s, %{ref: _}, _c ->
# TODO also do all that defensive span popping:
Tracer.set_status("Unknown")
Tracer.end_span()
end
:telemetry.attach({handler_id, :start}, event_prefix ++ [:start], handle_start, nil)
:telemetry.attach({handler_id, :stop}, event_prefix ++ [:stop], handle_stop, nil)
:telemetry.attach(
{handler_id, :exception},
event_prefix ++ [:exception],
handle_exception,
nil
)
{d, _} = :timer.tc(fn -> for i <- 1..count, do: measure_me.(i) end)
for event <- [:start, :stop, :exception], do: :telemetry.detach({handler_id, event})
:ok = :dbg.stop_clear()
%{calls_per_sec: div(1_000_000 * count, d), each_µs: Float.floor(d / count, 1)}
end
end
end
@garthk commented Jul 4, 2020

As I tweeted:

Benchmarking :opencensus tracing via :telemetry. 20µs per span isn’t bad vs events measured in ms, but it’s a trivial example so far. I want to make it robust vs code crashing, poor library use etc and tracking state in the process dictionary will slow me down.
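Stripped of the benchmark scaffolding and the defensive bookkeeping, the wiring being measured is roughly this sketch (the `[:myapp, :work]` event prefix, the `"work"` span name, and the module name are placeholders, not anything a real library defines):

defmodule MyApp.TelemetryToOpenCensus do
  @moduledoc "Sketch: turn a library's :telemetry span events into OpenCensus spans."

  @prefix [:myapp, :work]

  def attach do
    handlers = [
      {:start, &__MODULE__.handle_start/4},
      {:stop, &__MODULE__.handle_stop/4},
      {:exception, &__MODULE__.handle_exception/4}
    ]

    # One handler per event, each under its own id.
    for {suffix, fun} <- handlers do
      :telemetry.attach({__MODULE__, suffix}, @prefix ++ [suffix], fun, nil)
    end
  end

  def handle_start(_event, _measurements, _metadata, _config) do
    :ocp.with_child_span("work")
  end

  def handle_stop(_event, _measurements, _metadata, _config) do
    :ocp.set_status(0, "OK")
    :ocp.finish_span()
  end

  def handle_exception(_event, _measurements, _metadata, _config) do
    :ocp.set_status(2, "FAIL")
    :ocp.finish_span()
  end
end

The defensive version in the test swaps these `:ocp` process-dictionary helpers for explicit `:oc_trace` span contexts stashed under the `ref` carried in the telemetry metadata.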

Also, experimenting with “literate programming”, telling stories mixing text and code and output like a notebook but with just Elixir, two functions, and a macro. Here’s my code in a unit test, and its console output:

[console output]
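For reference, the shape of each step in the test is just two calls:

explain """
Some prose setting up the next measurement.
"""

loudly do
  # `loudly` prints the block's source, runs it, and pretty-prints the result
  1 |> :timer.seconds() |> div(10)
end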

@garthk commented Jul 5, 2020

Updated with defensive span popping.
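The heart of it, as a sketch (it assumes every `:telemetry.span/3` call puts a unique `ref` in its metadata, as `measure_me` does above; `@pdict_key` and the `"work"` span name are placeholders):

@pdict_key :my_span_stack

def handle_start(_event, _measurements, %{ref: ref}, _config) do
  previous_span_ctx = :ocp.current_span_ctx()
  span_ctx = :oc_trace.start_span("work", previous_span_ctx)
  :ocp.with_span_ctx(span_ctx)
  # Remember both contexts, keyed by ref, so handle_stop can verify the stack discipline.
  Process.put(@pdict_key, Map.put(Process.get(@pdict_key, %{}), ref, {previous_span_ctx, span_ctx}))
end

def handle_stop(_event, _measurements, %{ref: ref}, _config) do
  state = Process.get(@pdict_key, %{})
  {previously, state} = Map.pop(state, ref)
  Process.put(@pdict_key, state)

  case {previously, :ocp.current_span_ctx()} do
    {{previous_span_ctx, span_ctx}, span_ctx} ->
      # Common case: the span we started is still the current one.
      :oc_trace.set_status(0, "OK", span_ctx)
      :oc_trace.finish_span(span_ctx)
      :ocp.with_span_ctx(previous_span_ctx)

    {{previous_span_ctx, span_ctx}, other_span_ctx} ->
      # Someone pushed a span inside ours and never popped it: finish it first.
      :oc_trace.set_status(2, "UNKNOWN", other_span_ctx)
      :oc_trace.finish_span(other_span_ctx)
      :oc_trace.set_status(0, "OK", span_ctx)
      :oc_trace.finish_span(span_ctx)
      :ocp.with_span_ctx(previous_span_ctx)

    {nil, span_ctx} ->
      # :stop without a matching :start for this ref: best effort, pop whatever is current.
      :oc_trace.set_status(0, "OK", span_ctx)
      :oc_trace.finish_span(span_ctx)
      :ocp.with_span_ctx(:oc_trace.parent_span_ctx(span_ctx))
  end
end

The OpenTelemetry variant in the second module does the same bookkeeping against `:ot_tracer`, but still carries TODOs and a deliberate crasher in the unmatched-ref branch.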

@garthk commented Jul 7, 2020