Skip to content

Instantly share code, notes, and snippets.

@felixbarny
Created April 5, 2019 15:47
Show Gist options
  • Save felixbarny/0d2f9b35259d9710c54b6c46ab02e665 to your computer and use it in GitHub Desktop.
Save felixbarny/0d2f9b35259d9710c54b6c46ab02e665 to your computer and use it in GitHub Desktop.
Metrics-based breakdown graphs
# Metric-based transaction breakdown example
# Paste in Kibana Dev Tools to try it out
# This example contains two hosts (host-a and host-b).
# One transaction (APIRestController#orders) which only has spans of type `db`
# On host a, there were 10 transactions each taking 30ms and consisting of 10 `db` calls, each taking 1ms on average (20ms transaction self_time)
# On host b, there were 5 transactions each taking 60ms and consisting of 15 `db` calls, each taking 1.333ms on average (40ms transaction self_time)
# Start from a clean slate: ignore_unavailable=true lets the DELETE succeed even if the index does not exist yet.
DELETE /apm-metrics?ignore_unavailable=true
# Create the index with a single shard.
# Only @timestamp is mapped explicitly; every other field is left to dynamic mapping
# (the search further down queries the `.keyword` sub-fields of the string fields).
PUT /apm-metrics
{
  "settings": {
    "number_of_shards": 1
  },
  "mappings": {
    "_doc": {
      "properties": {
        "@timestamp": {
          "type": "date"
        }
      }
    }
  }
}
# Index metrics for host a
# transaction.name and transaction.type have to be on the top-level as opposed to being nested under tags/labels.
# That is because it has to match the field names for regular transaction documents
# so that there is a unified behavior when using the query bar to filter for a particular transaction type or name.
# I'm not yet sure how to bring those to the top level. Some ideas:
# - APM Server knows about "well-known" tags like `transaction_name` and `transaction_type` and brings them to top-level
# - Extend the metricset spec (https://github.com/elastic/apm-server/blob/master/docs/spec/metricsets/metricset.json)
# to allow for top-level key/value pairs (dots in the keys have to be allowed)
# After a transaction ends, agents track the self times of all spans, grouped by the `span.type`.
# For each type, do:
# var labels = {"span.type": type, "transaction.name": transaction.name, "transaction.type": transaction.type}
# metrics.timer(labels=labels, timerName="self_time").update(duration=totalSelfTimeOfType, count=countOfSpanOfType)
# That increments the `sum` and `count` counters for the respective timer
# Host a, span.type `db`: self_time timer with sum 100 over count 100.
POST /apm-metrics/_doc
{
  "@timestamp": "1970-01-01T00:00:00",
  "host.hostname": "host-a",
  "service.name": "opbeans-java",
  "transaction.name": "APIRestController#orders",
  "transaction.type": "request",
  "span.type": "db",
  "self_time.sum": 100,
  "self_time.count": 100
}
# reporting the avg value is not necessary
# reporting the count is important in every case in order to be able to calculate weighted averages
# The self time of the transaction is also reported.
# This is the time which can't be explained by direct child spans.
# The transaction also has a `span.type` which is the special value `transaction`.
# This is necessary in order to be able to do a terms aggregation on `span.type` (group by type)
# The UI might relabel this special value to "other", "app", "JVM" or similar.
# After a transaction ends, agents track the self time for that transaction.
# var labels = {"span.type": "transaction", "transaction.name": transaction.name, "transaction.type": transaction.type}
# metrics.timer(labels=labels, timerName="self_time").update(duration=transaction.self_time, count=1)
#
# Optionally, they can also track the total duration of the transaction.
# This enables graphs showing the average transaction duration, without relying on individual transaction documents
# var labels = {"span.type": "transaction", "transaction.name": transaction.name, "transaction.type": transaction.type}
# metrics.timer(labels=labels, timerName="duration").update(duration=transaction.duration, count=1)
# Host a, span.type `transaction`: self_time timer (sum 200, count 10)
# plus the optional duration timer (sum 300, count 10).
POST /apm-metrics/_doc
{
  "@timestamp": "1970-01-01T00:00:00",
  "host.hostname": "host-a",
  "service.name": "opbeans-java",
  "transaction.name": "APIRestController#orders",
  "transaction.type": "request",
  "span.type": "transaction",
  "self_time.sum": 200,
  "self_time.count": 10,
  "duration.sum": 300,
  "duration.count": 10
}
# Index metrics for host b
# Host b, span.type `db`: self_time timer with sum 100 over count 75.
POST /apm-metrics/_doc
{
  "@timestamp": "1970-01-01T00:00:00",
  "host.hostname": "host-b",
  "service.name": "opbeans-java",
  "transaction.name": "APIRestController#orders",
  "transaction.type": "request",
  "span.type": "db",
  "self_time.sum": 100,
  "self_time.count": 75
}
# Host b, span.type `transaction`: self_time timer (sum 200, count 5)
# plus the optional duration timer (sum 300, count 5).
# `?refresh` on this last document makes the indexed documents visible to the search that follows.
POST /apm-metrics/_doc?refresh
{
  "@timestamp": "1970-01-01T00:00:00",
  "host.hostname": "host-b",
  "service.name": "opbeans-java",
  "transaction.name": "APIRestController#orders",
  "transaction.type": "request",
  "span.type": "transaction",
  "self_time.sum": 200,
  "self_time.count": 5,
  "duration.sum": 300,
  "duration.count": 5
}
# UI
# The metrics will power a stacked area chart which stacks the percentage of the self_time for a particular span type in relation to the sum of all self_times.
# This chart is present on the service details page (by not filtering by `transaction.name`) and the transaction details page (by filtering by the corresponding `transaction.name`).
# I'm aware that there's basically no difference between the trace group view and the transaction group view.
# The MVP does not break down the entire trace but only the root transaction.
# In upcoming iterations we can improve that but for now it should be good enough.
# Breakdown query for one transaction group (transaction.name + transaction.type).
# `size: 0` returns no hits, only aggregations. The two exact-match `term` clauses are
# therefore placed in `filter` context: they only select documents and no relevance
# score needs to be computed for them.
#
# Per date_histogram bucket the query returns:
#   - types.buckets[].total_self_time_per_type   : sum of self_time.sum for each span.type
#   - types.buckets[].total_span_count_per_type  : sum of self_time.count for each span.type
#   - sum_all_self_times                         : sum of self_time.sum over all span types
#   - total_transaction_count.sum_transaction_count : sum of self_time.count restricted to
#     span.type `transaction`
POST /apm-metrics/_search
{
  "size": 0,
  "query": {
    "bool": {
      "filter": [
        {
          "term": {
            "transaction.name.keyword": {
              "value": "APIRestController#orders"
            }
          }
        },
        {
          "term": {
            "transaction.type.keyword": {
              "value": "request"
            }
          }
        }
      ]
    }
  },
  "aggs": {
    "breakdown_per_interval": {
      "date_histogram": {
        "field": "@timestamp",
        "interval": "minute"
      },
      "aggs": {
        "types": {
          "terms": {
            "field": "span.type.keyword",
            "size": 10
          },
          "aggs": {
            "total_self_time_per_type": {
              "sum": {
                "field": "self_time.sum"
              }
            },
            "total_span_count_per_type": {
              "sum": {
                "field": "self_time.count"
              }
            }
          }
        },
        "sum_all_self_times": {
          "sum": {
            "field": "self_time.sum"
          }
        },
        "total_transaction_count": {
          "filter": {
            "term": {
              "span.type.keyword": "transaction"
            }
          },
          "aggs": {
            "sum_transaction_count": {
              "sum": {
                "field": "self_time.count"
              }
            }
          }
        }
      }
    }
  }
}
# result:
# {
# "took" : 5,
# "timed_out" : false,
# "_shards" : {
# "total" : 1,
# "successful" : 1,
# "skipped" : 0,
# "failed" : 0
# },
# "hits" : {
# "total" : 4,
# "max_score" : 0.0,
# "hits" : [ ]
# },
# "aggregations" : {
# "breakdown_per_interval" : {
# "buckets" : [
# {
# "key_as_string" : "1970-01-01T00:00:00.000Z",
# "key" : 0,
# "doc_count" : 4,
# "total_transaction_count" : {
# "doc_count" : 2,
# "sum_transaction_count" : {
# "value" : 15.0
# }
# },
# "sum_all_self_times" : {
# "value" : 600.0
# },
# "types" : {
# "doc_count_error_upper_bound" : 0,
# "sum_other_doc_count" : 0,
# "buckets" : [
# {
# "key" : "db",
# "doc_count" : 2,
# to calculate the percentage, do total_self_time_per_type/sum_all_self_times
# "total_self_time_per_type" : {
# "value" : 200.0 (33.3%)
# },
# to calculate the average count of db spans per transaction, do total_span_count_per_type/sum_transaction_count
# "total_span_count_per_type" : {
# "value" : 175.0 (11.66)
# }
# },
# {
# "key" : "transaction",
# "doc_count" : 2,
# "total_self_time_per_type" : {
# "value" : 400.0 (66.6%)
# },
# "total_span_count_per_type" : {
# "value" : 15.0 (1)
# }
# }
# ]
# }
# }
# ]
# }
# }
# }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment