Created
April 5, 2019 15:47
-
-
Save felixbarny/0d2f9b35259d9710c54b6c46ab02e665 to your computer and use it in GitHub Desktop.
Metrics-based breakdown graphs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Metric-based transaction breakdown example
# Paste in Kibana Dev Tools to try it out
# This example contains two hosts (host-a and host-b).
# One transaction (APIRestController#orders) which only has spans of type `db`
# On host a, there were 10 transactions each taking 30ms and consisting of 10 `db` calls, each taking 1ms on average (20ms transaction self_time)
# On host b, there were 5 transactions each taking 60ms and consisting of 15 `db` calls, each taking 1.333ms on average (40ms transaction self_time)
# Start from a clean slate: drop any earlier copy of the example index.
# ignore_unavailable=true keeps the DELETE from failing when the index does not exist yet.
DELETE /apm-metrics?ignore_unavailable=true

# Create the example index with a single shard.
# Only @timestamp is mapped explicitly (as a date); all other fields of the
# documents indexed below are left to dynamic mapping.
# NOTE(review): the `_doc` level inside `mappings` is the typed (pre-7.0) mapping
# form; on Elasticsearch 7.x it needs `include_type_name=true` - confirm the target version.
PUT /apm-metrics
{
  "settings": { "number_of_shards": 1 },
  "mappings": {
    "_doc": {
      "properties": {
        "@timestamp": { "type": "date" }
      }
    }
  }
}
# Index metrics for host a
# transaction.name and transaction.type have to be on the top-level as opposed to being nested under tags/labels.
# That is because they have to match the field names for regular transaction documents
# so that there is a unified behavior when using the query bar to filter for a particular transaction type or name.
# I'm not yet sure how to bring those to the top level. Some ideas:
# - APM Server knows about "well-known" tags like `transaction_name` and `transaction_type` and brings them to top-level
# - Extend the metricset spec (https://github.com/elastic/apm-server/blob/master/docs/spec/metricsets/metricset.json)
#   to allow for top-level key/value pairs (dots in the keys have to be allowed)
# After a transaction ends, agents track the self times of all spans, grouped by the `span.type`.
# for each type do:
#   var labels = {"span.type": type, "transaction.name": transaction.name, "transaction.type": transaction.type}
#   metrics.timer(labels=labels, timerName="self_time").update(duration=totalSelfTimeOfType, count=countOfSpanOfType)
# That increments the `sum` and `count` counters for the respective timer
# host-a, span.type "db": 100 `db` spans with 100 ms of summed self time
# (10 transactions x 10 calls, 1 ms each on average).
# For span-type documents self_time.count is the number of spans, not of transactions.
# The timestamp carries an explicit UTC designator so it cannot be read as local time.
POST /apm-metrics/_doc
{
  "@timestamp": "1970-01-01T00:00:00Z",
  "host.hostname": "host-a",
  "service.name": "opbeans-java",
  "transaction.name": "APIRestController#orders",
  "transaction.type": "request",
  "span.type": "db",
  "self_time.sum": 100,
  "self_time.count": 100
}
# reporting the avg value is not necessary
# reporting the count is important in every case in order to be able to calculate weighted averages
# The self time of the transaction is also reported.
# This is the time which can't be explained by direct child spans.
# The transaction also has a `span.type` which is the special value `transaction`.
# This is necessary in order to be able to do a terms aggregation on `span.type` (group by type)
# The UI might relabel this special value to "other", "app", "JVM" or similar.
# After a transaction ends, agents track the self time for that transaction.
#   var labels = {"span.type": "transaction", "transaction.name": transaction.name, "transaction.type": transaction.type}
#   metrics.timer(labels=labels, timerName="self_time").update(duration=transaction.self_time, count=1)
#
# Optionally, they can also track the total duration of the transaction.
# This enables graphs showing the average transaction duration, without relying on individual transaction documents
#   var labels = {"span.type": "transaction", "transaction.name": transaction.name, "transaction.type": transaction.type}
#   metrics.timer(labels=labels, timerName="duration").update(duration=transaction.duration, count=1)
# host-a, span.type "transaction": timers of the transactions themselves.
# self_time.count and duration.count are the number of transactions (10):
#   self_time: 200 ms in total -> 20 ms per transaction
#   duration:  300 ms in total -> 30 ms per transaction
# The timestamp carries an explicit UTC designator so it cannot be read as local time.
POST /apm-metrics/_doc
{
  "@timestamp": "1970-01-01T00:00:00Z",
  "host.hostname": "host-a",
  "service.name": "opbeans-java",
  "transaction.name": "APIRestController#orders",
  "transaction.type": "request",
  "span.type": "transaction",
  "self_time.sum": 200,
  "self_time.count": 10,
  "duration.sum": 300,
  "duration.count": 10
}
# Index metrics for host b | |
# host-b, span.type "db": 75 `db` spans with 100 ms of summed self time
# (5 transactions x 15 calls, 1.333 ms each on average).
# For span-type documents self_time.count is the number of spans, not of transactions.
# The timestamp carries an explicit UTC designator so it cannot be read as local time.
POST /apm-metrics/_doc
{
  "@timestamp": "1970-01-01T00:00:00Z",
  "host.hostname": "host-b",
  "service.name": "opbeans-java",
  "transaction.name": "APIRestController#orders",
  "transaction.type": "request",
  "span.type": "db",
  "self_time.sum": 100,
  "self_time.count": 75
}
# host-b, span.type "transaction": timers of the transactions themselves.
# self_time.count and duration.count are the number of transactions (5):
#   self_time: 200 ms in total -> 40 ms per transaction
#   duration:  300 ms in total -> 60 ms per transaction
# `?refresh` on this last indexing request forces a refresh so the search that
# follows can see the documents immediately (the index has a single shard).
# The timestamp carries an explicit UTC designator so it cannot be read as local time.
POST /apm-metrics/_doc?refresh
{
  "@timestamp": "1970-01-01T00:00:00Z",
  "host.hostname": "host-b",
  "service.name": "opbeans-java",
  "transaction.name": "APIRestController#orders",
  "transaction.type": "request",
  "span.type": "transaction",
  "self_time.sum": 200,
  "self_time.count": 5,
  "duration.sum": 300,
  "duration.count": 5
}
# UI | |
# The metrics will power a stacked area chart which stacks the percentage of the self_time for a particular span type in relation to the sum of all self_times. | |
# This chart is present on the service details page (by not filtering by `transaction.name`) and the transaction details page (by filtering by the corresponding `transaction.name`). | |
# I'm aware that there's basically no difference between the trace group view and the transaction group view. | |
# The MVP does not break down the entire trace but only the root transaction. | |
# In upcoming iterations we can improve that but for now it should be good enough. | |
# Breakdown query behind the stacked area chart.
# Query: restrict to one transaction group (name + type). Both clauses are exact
#   matches and size is 0, so they live in `filter` context: no relevance scoring
#   is computed. The aggregation results are the same as with `must`.
# Aggregations, per 1-minute @timestamp bucket:
#   types                   - one bucket per span.type with
#                               total_self_time_per_type  = sum(self_time.sum)
#                               total_span_count_per_type = sum(self_time.count)
#   sum_all_self_times      - sum(self_time.sum) over all types; the denominator
#                             for each type's percentage
#   total_transaction_count - sum(self_time.count) restricted to
#                             span.type == "transaction", i.e. the number of transactions
# String fields are addressed through `.keyword` because the index maps only
# @timestamp explicitly; the other fields come from dynamic mapping.
# NOTE(review): `interval` is the pre-7.2 date_histogram parameter; newer
# Elasticsearch versions expect `calendar_interval` - confirm the target version.
POST /apm-metrics/_search
{
  "size": 0,
  "query": {
    "bool": {
      "filter": [
        { "term": { "transaction.name.keyword": "APIRestController#orders" } },
        { "term": { "transaction.type.keyword": "request" } }
      ]
    }
  },
  "aggs": {
    "breakdown_per_interval": {
      "date_histogram": {
        "field": "@timestamp",
        "interval": "minute"
      },
      "aggs": {
        "types": {
          "terms": {
            "field": "span.type.keyword",
            "size": 10
          },
          "aggs": {
            "total_self_time_per_type": {
              "sum": { "field": "self_time.sum" }
            },
            "total_span_count_per_type": {
              "sum": { "field": "self_time.count" }
            }
          }
        },
        "sum_all_self_times": {
          "sum": { "field": "self_time.sum" }
        },
        "total_transaction_count": {
          "filter": {
            "term": { "span.type.keyword": "transaction" }
          },
          "aggs": {
            "sum_transaction_count": {
              "sum": { "field": "self_time.count" }
            }
          }
        }
      }
    }
  }
}
# result: | |
# { | |
# "took" : 5, | |
# "timed_out" : false, | |
# "_shards" : { | |
# "total" : 1, | |
# "successful" : 1, | |
# "skipped" : 0, | |
# "failed" : 0 | |
# }, | |
# "hits" : { | |
# "total" : 4, | |
# "max_score" : 0.0, | |
# "hits" : [ ] | |
# }, | |
# "aggregations" : { | |
# "breakdown_per_interval" : { | |
# "buckets" : [ | |
# { | |
# "key_as_string" : "1970-01-01T00:00:00.000Z", | |
# "key" : 0, | |
# "doc_count" : 4, | |
# "total_transaction_count" : { | |
# "doc_count" : 2, | |
# "sum_transaction_count" : { | |
# "value" : 15.0 | |
# } | |
# }, | |
# "sum_all_self_times" : { | |
# "value" : 600.0 | |
# }, | |
# "types" : { | |
# "doc_count_error_upper_bound" : 0, | |
# "sum_other_doc_count" : 0, | |
# "buckets" : [ | |
# { | |
# "key" : "db", | |
# "doc_count" : 2, | |
# to calculate the percentage, do total_self_time_per_type/sum_all_self_times | |
# "total_self_time_per_type" : { | |
# "value" : 200.0 (33.3%) | |
# }, | |
# to calculate the average count of db spans per transaction, do total_span_count_per_type/sum_transaction_count | |
# "total_span_count_per_type" : { | |
# "value" : 175.0 (11.66) | |
# } | |
# }, | |
# { | |
# "key" : "transaction", | |
# "doc_count" : 2, | |
# "total_self_time_per_type" : { | |
# "value" : 400.0 (66.6%) | |
# }, | |
# "total_span_count_per_type" : { | |
# "value" : 15.0 (1) | |
# } | |
# } | |
# ] | |
# } | |
# } | |
# ] | |
# } | |
# } | |
# } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment