Instantly share code, notes, and snippets.

Embed
What would you like to do?
bosun macros, template and grafana dashboard for fault detection on seasonal data (see blog post)
# Alert on web requests, both the global total and the per-country breakdown.
# The settings below are consumed by the dm-load and dm-logic macros, which
# compare the most recent window against the same window in earlier periods.
# The tpl_* variables are only read by the dm template (labels and links).
alert requests_by_country {
# length of the time window that is compared (used for both "now" and history)
$duration = 60m
# period argument handed to graphiteBand; also printed as "<period> ago" in the template
$period = 7d
# graphite query for the global total
$q_global = transformNull(sum(stats.pweb*.request.web),0)
# graphite query for the per-country series; aliasByNode(..., 3) names each series after node 3
$q_segg = aliasByNode(transformNull(stats._sum_pweb.request_by_country.*,0),3)
# format argument handed to graphite/graphiteBand for the per-country query
# (presumably turns the aliased series name into this tag -- confirm against the bosun docs)
$s_tag = country
# num argument handed to graphiteBand for the per-country query (the global query uses 1)
$s_num = 2
# a country counts as bad when its median diff, measured in historical deviations, is below this
$s_min_med_diff = -5
# crit when more than this many countries are bad (a single bad country already warns)
$s_max_num_issues_crit = 10
# crit when the global now/history median ratio drops below this
$g_min_strength = 0.7
# global erraticness thresholds for warn and crit; -1 disables the respective check
$g_max_erratic_warn = 6
$g_max_erratic_crit = 9
# window over which the current deviation is measured for the erraticness check
$g_erratic_period = 10m
# step 1: run the queries and compute medians and deviations
macro = dm-load
# per-country importance, used by the template to sort rows and size the bar;
# set between the two macros so it can build on values computed by dm-load
$importance = $default_importance
# step 2: derive warn and crit
macro = dm-logic
# tag key the template uses to look up the country in each result group
# NOTE(review): presumably has to match s_tag -- confirm
$tpl_s_tag = country
# plural label used in the notification subject
$tpl_s_tag_plural = countries
# human readable name of the measured quantity, used in headings and the subject
$tpl_unit = Web requests
# text appended to the graph-explorer link for the global graph
$tpl_g_ge = statsd request.web sum by n1 from -1week||
# text appended to the graph-explorer link for the per-country overview graph
$tpl_s_ge = request by country _sum_pweb from -1hour||
# printf format for the per-country graph-explorer link; %s receives the country
$tpl_s_ge_fmt = request by country _sum_pweb n3=%s from -1week||
# graphite pattern passed as var-patt to the grafana timeshift dashboard (global)
$tpl_g_gr = sum(stats.pweb*.request.web)
# printf format for the per-country grafana pattern; %s receives the country
$tpl_s_gr_fmt = stats._sum_pweb.request_by_country.%s
# both severities go to the notification named web (not defined in this file)
warnNotification = web
critNotification = web
}
# dm-load: loads in all the data.
# The work is split over two macros (dm-load and dm-logic) so that an alert can
# use some of the vars defined here (for lookupSeries etc) before dm-logic runs.
# Expects the alert to define: q_global, q_segg, duration, period, s_tag, s_num
# and g_erratic_period.
macro dm-load {
# every alert using this macro is rendered with the dm template
template = dm
# global levels then and now, to check strength
$g_hist = graphiteBand("$q_global", "$duration", "$period", "", 1)
$g_now = graphite("$q_global", "$duration", "", "")
$g_hist_med = median($g_hist)
$g_now_med = median($g_now)
$g_hist_dev = dev($g_hist)
$g_now_dev = dev($g_now)
# global deviation (erraticness) checking
# erraticness is current deviation against old deviation, adjusted for traffic ratio
# note that our period is usually shorter than what we typically use for median checks
# because we need to quickly find possibly short spikes
# we exclude the last 1m because graphite might not be up to date. strictly speaking this
# should not be needed, because graphite reports None for missing points and bosun should
# filter those out, but q_global often has transformNull to 0 enabled,
# because with statsd a missing value typically means 0
# g_reqs is for visualization only (it is the series graphed by the template)
$g_reqs = graphite("$q_global", "1d", "1m", "")
# recent window for the erraticness check; distinct from g_now, which spans the full duration
$g_NOW = graphite("$q_global", "$g_erratic_period", "1m", "")
# we add 0.01 so that we can be sure never to divide by zero.
$g_erratic = (dev($g_NOW) * $g_hist_med) / ( ($g_hist_dev * median($g_NOW)) + 0.01)
# levels segregated by tag, for median checks
$s_hist = graphiteBand("$q_segg", "$duration", "$period" ,"$s_tag", $s_num)
$s_now = graphite("$q_segg", "$duration", "", "$s_tag")
$s_hist_med = median($s_hist)
$s_hist_dev= dev($s_hist)
$s_now_med = median($s_now)
$s_now_dev = dev($s_now)
# each tag's historical median relative to the largest one, scaled by 90 and offset by 10
# (so 10..100 for non-negative medians); the template uses it as a bar width in px
# NOTE(review): the divisor is not guarded against 0 here, unlike g_erratic -- confirm this is acceptable
$default_importance = ($s_hist_med * 90)/ max(t($s_hist_med, "")) + 10
}
# dm-logic: do the actual processing.
# Turns the values computed by dm-load into the warn and crit expressions.
# Expects the alert to define: s_min_med_diff, s_max_num_issues_crit,
# g_min_strength, g_max_erratic_warn and g_max_erratic_crit.
macro dm-logic {
# global strength: ratio of the current median to the historical median
# NOTE(review): this division has no +0.01 guard, unlike g_erratic and s_med_diff -- confirm
# how a historical median of 0 should be treated
$g_strength = $g_now_med/$g_hist_med
# median check on the series segregated by tag
# which deviation to use?
# * now: if traffic drops to 0 then dev = 0 and this blows up, which is good but i would rather use a dedicated "has become 0" check which is more clear
# * hist: if there was an outage in the past, the same happens, except hist_med was 0 so diff would still be large, so we'd correctly not trigger here
# (though we should always use past data that was not an outage for this detection to work reliably. that's a bosun todo)
# the diff is expressed in historical deviations; 0.01 keeps the divisor non-zero
$s_med_diff = ($s_now_med - $s_hist_med)/($s_hist_dev+0.01)
# 1 for every tag whose median dropped further than allowed, 0 otherwise
$s_med_bad = $s_med_diff < $s_min_med_diff
# number of tags flagged as bad
$s_med_issues = sum(t($s_med_bad,""))
# warn: at least one bad tag, or global erraticness above the warn threshold (-1 disables that check)
warn = $s_med_issues > 0 || ($g_max_erratic_warn != -1 && $g_erratic > $g_max_erratic_warn)
# crit: too many bad tags, or global strength too low, or global erraticness above the crit threshold (-1 disables that check)
crit = $s_med_issues > $s_max_num_issues_crit || $g_strength < $g_min_strength || ($g_max_erratic_crit != -1 && $g_erratic > $g_max_erratic_crit)
}
# dm: notification template shared by all alerts that use the dm-load / dm-logic macros.
# body (HTML) has three parts:
#   1. global median, historical vs now; the "Now" row is red below g_min_strength,
#      plain below 1.0 and green otherwise
#   2. global erraticness plus a graph of g_reqs, only rendered when g_max_erratic_warn > -1
#   3. one table row per tag value, sorted by descending importance. LeftJoin yields, per row:
#      0 importance, 1 s_hist_med, 2 s_hist_dev, 3 s_now_med, 4 s_now_dev, 5 s_med_diff, 6 s_med_bad
# subject: status, alert name, global strength, optional erraticness, and the tag values
# whose median dropped.
# Reads from the alert: tpl_unit, tpl_s_tag, tpl_s_tag_plural, tpl_g_gr, tpl_s_gr_fmt,
# tpl_g_ge, tpl_s_ge, tpl_s_ge_fmt, period, importance and the thresholds.
# The graph-explorer and grafana links all put the query directly after the path,
# without any whitespace in between.
template dm {
body = `
<a href="{{.Ack}}">Acknowledge alert</a>
<br/>
<h2>{{.Alert.Vars.tpl_unit}}, Global</h2>
Total amount now should not be much less than in the past
<table>
<tr>
<td>{{.Alert.Vars.period}} ago</td>
<td>{{.Eval .Alert.Vars.g_hist_med | printf "%.0f"}}</td>
</tr>
{{if lt (.Eval .Alert.Vars.g_strength) (.Eval .Alert.Vars.g_min_strength)}}
<tr style="color:red;">
{{else if lt (.Eval .Alert.Vars.g_strength) 1.0}}
<tr>
{{else}}
<tr style="color:green;">
{{end}}
<td>Now</td>
<td>{{.Eval .Alert.Vars.g_now_med | printf "%.0f"}}</td>
</tr>
</table>
{{if gt (.Eval .Alert.Vars.g_max_erratic_warn) -1.0}}
<h2>Erraticness of {{.Alert.Vars.tpl_unit}}, Global</h2>
Erraticness - increased deviation - could be indicative of a spike or drop.
Low values up to {{.Eval .Alert.Vars.g_max_erratic_warn}} are ok. {{.Eval .Alert.Vars.g_max_erratic_crit}} are critical
<br>
{{if gt (.Eval .Alert.Vars.g_erratic) (.Eval .Alert.Vars.g_max_erratic_crit)}}
<span style="color:red;">
{{else if lt (.Eval .Alert.Vars.g_erratic) (.Eval .Alert.Vars.g_max_erratic_warn)}}
<span style="color:green;">
{{else}}
<span>
{{end}}
{{.Eval .Alert.Vars.g_erratic}}</span>
<div style="height: 450px">
{{.Graph .Alert.Vars.g_reqs }}
</div>
{{end}}
<br>
<a href="http://grafana/#/dashboard/db/templatetimeshift?var-patt={{.Alert.Vars.tpl_g_gr}}">view timeshift</a>
<br>
<a href="http://graphexplorer/index/{{.Alert.Vars.tpl_g_ge}}">view in GE</a>
<br>
<br>
<h2>{{.Alert.Vars.tpl_unit}}, per {{.Alert.Vars.tpl_s_tag}}</h2>
<br><a href="http://graphexplorer/index/{{.Alert.Vars.tpl_s_ge}}">GE graph</a>
<br>median diff lower than {{.Alert.Vars.s_min_med_diff}} is bad (in red).
<table style="border-spacing: 13px 3px;">
<tr>
<th>{{.Alert.Vars.tpl_s_tag}}</th>
<th>importance</th>
<th>{{.Alert.Vars.period}} ago (med +- dev)</th>
<th>now (med +- dev)</th>
<th>med diff (in devs)</th>
</tr>
{{ $sorted_importance := (.EvalAll .Alert.Vars.importance).DescByValue}}
{{range $r := .LeftJoin $sorted_importance .Alert.Vars.s_hist_med .Alert.Vars.s_hist_dev .Alert.Vars.s_now_med .Alert.Vars.s_now_dev .Alert.Vars.s_med_diff .Alert.Vars.s_med_bad}}
{{$importance := (index $r 0).Value}}
{{$hist_med := (index $r 1).Value}}
{{$hist_dev := (index $r 2).Value}}
{{$now_med := (index $r 3).Value}}
{{$now_dev := (index $r 4).Value}}
{{$med_diff := (index $r 5).Value}}
{{$med_bad := (index $r 6).Value}}
{{$s_val := index (index $r 0).Group $.Alert.Vars.tpl_s_tag}}
<tr>
<td>{{$s_val}}</td>
<td style="width:100px; background-color:GhostWhite; border-left:{{$importance}}px solid lightsteelblue;"></td>
<td>{{printf "%.0f" $hist_med}} +- {{ printf "%.0f" $hist_dev}}</td>
<td>{{printf "%.0f" $now_med}} +- {{ printf "%.0f" $now_dev}}</td>
{{if gt $med_bad 0.0}}
<td style="color:red;" >{{printf "%.0f" $med_diff}}</td>
{{else}}
<td style="color:green;"> {{printf "%.0f" $med_diff}}</td>
{{end}}
<td><a href="http://grafana/#/dashboard/db/templatetimeshift?var-patt={{printf $.Alert.Vars.tpl_s_gr_fmt $s_val}}">view timeshift</a></td>
<td><a href="http://graphexplorer/index/{{printf $.Alert.Vars.tpl_s_ge_fmt $s_val}}">view in GE</a></td>
</tr>
{{end}}
</table>`
subject =`
{{.Last.Status}}: {{.Alert.Name}} :
Global {{.Alert.Vars.tpl_unit}} strength: {{.Eval .Alert.Vars.g_strength | printf "%.3f"}}
{{if gt (.Eval .Alert.Vars.g_max_erratic_warn) -1.0}}
- erraticness: {{.Eval .Alert.Vars.g_erratic | printf "%.3f"}}
{{end}}
| {{.Eval .Alert.Vars.s_med_issues | printf "%.0f"}} {{.Alert.Vars.tpl_s_tag_plural}} with dropped median:
{{range $r :=.EvalAll .Alert.Vars.s_med_bad}}
{{ if gt $r.Value 0.0}}
{{ index $r.Group $.Alert.Vars.tpl_s_tag }}
{{end}}
{{end}}`
}
{
"annotations": {
"enable": true,
"list": [
{
"datasource": "elasticsearch",
"enable": false,
"iconColor": "#C0C6BE",
"iconSize": 13,
"index": "anthracite",
"lineColor": "rgba(33, 16, 218, 0.59)",
"name": "anthracite",
"query": "",
"showLine": true,
"timeField": "date"
}
]
},
"editable": true,
"hideAllLegends": false,
"hideControls": false,
"id": null,
"nav": [
{
"collapse": false,
"enable": true,
"notice": false,
"now": true,
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"status": "Stable",
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
],
"type": "timepicker"
}
],
"originalTitle": "template-timeshift",
"refresh": false,
"rows": [
{
"collapse": false,
"editable": true,
"height": "600px",
"panels": [
{
"aliasColors": {
"now": "#890F02",
"now-14d": "#1F78C1",
"now-21d": "#052B51",
"now-7d": "#CFFAFF"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 0,
"grid": {
"leftMax": null,
"leftMin": null,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "now",
"lines": false,
"pointradius": 1,
"points": true
}
],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"target": "alias($patt, 'now')"
},
{
"target": "alias(timeShift($patt, '7d'), 'now-7d')"
},
{
"target": "alias(timeShift($patt, '14d'), 'now-14d')"
},
{
"target": "alias(timeShift($patt, '21d'), 'now-21d')"
}
],
"title": "$patt",
"tooltip": {
"shared": false,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"short",
"short"
]
}
],
"title": "Row1"
}
],
"sharedCrosshair": false,
"style": "dark",
"tags": [],
"templating": {
"enable": true,
"list": [
{
"allFormat": "glob",
"current": {
"text": "",
"value": ""
},
"datasource": null,
"includeAll": false,
"name": "patt",
"options": [
{
"text": "",
"value": ""
}
],
"query": "",
"refresh_on_load": false,
"type": "custom"
}
]
},
"time": {
"from": "now-7d",
"to": "now"
},
"timezone": "browser",
"title": "template-timeshift",
"version": 6
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment