Create a gist now

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Bosun macros, template, and Grafana dashboard for fault detection on seasonal data (see blog post)
# Alert on web requests, globally and per country.
# It only supplies queries, thresholds and template strings; the data loading
# lives in the dm-load macro and the warn/crit logic in the dm-logic macro.
alert requests_by_country {
# --- time windows ---
# length of the window that dm-load queries for both the current and the historical data
$duration = 60m
# how far back the historical window lies (the template labels it "$period ago")
$period = 7d
# --- graphite queries ---
# global request rate: sum over all pweb* series, with nulls turned into 0
$q_global = transformNull(sum(stats.pweb*.request.web),0)
# per-country request rate; aliasByNode(..., 3) names each series after its country node
$q_segg = aliasByNode(transformNull(stats._sum_pweb.request_by_country.*,0),3)
# tag name that dm-load passes as the format argument for the per-country queries
$s_tag = country
# passed by dm-load as the last argument of graphiteBand for the per-country history
$s_num = 2
# --- thresholds ---
# a country counts as bad when (now_med - hist_med) / (hist_dev + 0.01) is below this value
$s_min_med_diff = -5
# crit when more than this many countries are bad (warn already fires for any bad country)
$s_max_num_issues_crit = 10
# crit when the global now/hist median ratio falls below this value
$g_min_strength = 0.7
# erraticness thresholds for warn and crit; setting one to -1 disables that check in dm-logic
$g_max_erratic_warn = 6
$g_max_erratic_crit = 9
# window over which the current deviation is measured for the erraticness check
$g_erratic_period = 10m
# step 1: load the data (this also defines $default_importance)
macro = dm-load
# set after dm-load, which defines $default_importance; the template sorts its rows by $importance
$importance = $default_importance
# step 2: compute the derived metrics and the warn/crit expressions
macro = dm-logic
# --- strings used by the dm template ---
# tag name used as table header and to look up the tag value of each row
$tpl_s_tag = country
# plural form used in the subject line
$tpl_s_tag_plural = countries
# human readable name of what is being measured
$tpl_unit = Web requests
# graph-explorer query for the global graph (appended to http://graphexplorer/index/)
$tpl_g_ge = statsd request.web sum by n1 from -1week||
# graph-explorer query for the per-country overview graph
# NOTE(review): this one uses -1hour while the two other GE queries use -1week - confirm this is intended
$tpl_s_ge = request by country _sum_pweb from -1hour||
# graph-explorer query for a single country; %s is replaced by the country via printf
$tpl_s_ge_fmt = request by country _sum_pweb n3=%s from -1week||
# graphite pattern handed to the grafana timeshift dashboard (var-patt) for the global graph
$tpl_g_gr = sum(stats.pweb*.request.web)
# graphite pattern for a single country; %s is replaced by the country via printf
$tpl_s_gr_fmt = stats._sum_pweb.request_by_country.%s
warnNotification = web
critNotification = web
}
# Loads in all the data.
# The work is split over two macros (dm-load and dm-logic) so that an alert can use
# some of the vars defined here (for lookupSeries etc) before dm-logic runs.
# Expects the alert to define: $q_global, $q_segg, $duration, $period, $s_tag, $s_num, $g_erratic_period.
macro dm-load {
template = dm
# global levels then and now, to check strength
$g_hist = graphiteBand("$q_global", "$duration", "$period", "", 1)
$g_now = graphite("$q_global", "$duration", "", "")
$g_hist_med = median($g_hist)
$g_now_med = median($g_now)
$g_hist_dev = dev($g_hist)
$g_now_dev = dev($g_now)
# global deviation (erraticness) checking
# erraticness is the current deviation against the old deviation, adjusted for the traffic ratio.
# note that this period is usually shorter than what we typically use for median checks
# because we need to quickly find possibly short spikes.
# we exclude the last 1m because graphite might not be up to date. actually this shouldn't
# be needed because graphite will report None which bosun should filter out, but q_global often
# has transformNull to 0 enabled because, due to statsd, that's what a null typically means.
# $g_reqs is for visualization only (the template graphs it)
$g_reqs = graphite("$q_global", "1d", "1m", "")
# NOTE(review): $g_NOW (erraticness window) differs from $g_now (median window) only by capitalisation
$g_NOW = graphite("$q_global", "$g_erratic_period", "1m", "")
# we add 0.01 so that we can be sure never to divide by zero.
$g_erratic = (dev($g_NOW) * $g_hist_med) / ( ($g_hist_dev * median($g_NOW)) + 0.01)
# levels segregated by tag, for median checks
$s_hist = graphiteBand("$q_segg", "$duration", "$period" ,"$s_tag", $s_num)
$s_now = graphite("$q_segg", "$duration", "", "$s_tag")
$s_hist_med = median($s_hist)
$s_hist_dev= dev($s_hist)
$s_now_med = median($s_now)
$s_now_dev = dev($s_now)
# importance of each tag value: its historical median relative to the largest historical median,
# scaled so that the largest one gets 90 + 10 = 100. the template uses it as a bar width in px.
# NOTE(review): the denominator has no +0.01 guard - confirm the largest historical median cannot be 0
$default_importance = ($s_hist_med * 90)/ max(t($s_hist_med, "")) + 10
}
# Does the actual processing: derives the metrics from the vars loaded by dm-load
# and sets the warn and crit expressions.
# Expects the alert to define: $s_min_med_diff, $s_max_num_issues_crit, $g_min_strength,
# $g_max_erratic_warn, $g_max_erratic_crit.
macro dm-logic {
# global strength: the current global median as a fraction of the historical one
# NOTE(review): unlike $g_erratic and $s_med_diff this division has no +0.01 guard - confirm $g_hist_med cannot be 0
$g_strength = $g_now_med/$g_hist_med
# median check on the series segregated by tag
# which deviation to use?
# * now: if traffic drops to 0 then dev = 0 and this blows up, which is good but i'd rather use a dedicated "has become 0" check which is clearer
# * hist: if there was an outage in the past, the same happens, except hist_med was 0 so diff would still be large, so we'd correctly not trigger here
# (though we should always use past data that was not an outage for this detection to work reliably. that's a bosun todo)
# difference between the current and historical median, expressed in historical deviations
$s_med_diff = ($s_now_med - $s_hist_med)/($s_hist_dev+0.01)
# per tag value: true when the median dropped further than the allowed $s_min_med_diff
$s_med_bad = $s_med_diff < $s_min_med_diff
# number of tag values with a bad median
$s_med_issues = sum(t($s_med_bad,""))
# warn: any bad tag value, or erraticness above the warn threshold (skipped when the threshold is -1)
warn = $s_med_issues > 0 || ($g_max_erratic_warn != -1 && $g_erratic > $g_max_erratic_warn)
# crit: too many bad tag values, global strength too low, or erraticness above the crit threshold (skipped when -1)
crit = $s_med_issues > $s_max_num_issues_crit || $g_strength < $g_min_strength || ($g_max_erratic_crit != -1 && $g_erratic > $g_max_erratic_crit)
}
# Notification template for alerts built on the dm-load / dm-logic macros.
# body: an ack link, the global median now vs $period ago (red when strength is below
#       $g_min_strength, green when it is at least 1), an optional erraticness section with a
#       graph of $g_reqs (only when $g_max_erratic_warn > -1), and a table with one row per tag
#       value, sorted by descending $importance, with links to grafana and graph-explorer.
# subject: status, alert name, global strength, optional erraticness, and the list of tag
#       values whose median dropped.
# Reads the alert vars tpl_unit, tpl_s_tag, tpl_s_tag_plural, tpl_g_ge, tpl_s_ge, tpl_s_ge_fmt,
# tpl_g_gr and tpl_s_gr_fmt in addition to the vars defined by the two macros.
template dm {
body = `
<a href="{{.Ack}}">Acknowledge alert</a>
<br/>
<h2>{{.Alert.Vars.tpl_unit}}, Global</h2>
Total amount now should not be much less than in the past
<table>
<tr>
<td>{{.Alert.Vars.period}} ago</td>
<td>{{.Eval .Alert.Vars.g_hist_med | printf "%.0f"}}</td>
</tr>
{{if lt (.Eval .Alert.Vars.g_strength) (.Eval .Alert.Vars.g_min_strength)}}
<tr style="color:red;">
{{else if lt (.Eval .Alert.Vars.g_strength) 1.0}}
<tr>
{{else}}
<tr style="color:green;">
{{end}}
<td>Now</td>
<td>{{.Eval .Alert.Vars.g_now_med | printf "%.0f"}}</td>
</tr>
</table>
{{if gt (.Eval .Alert.Vars.g_max_erratic_warn) -1.0}}
<h2>Erraticness of {{.Alert.Vars.tpl_unit}}, Global</h2>
Erraticness - increased deviation - could be indicative of a spike or drop.
Low values up to {{.Eval .Alert.Vars.g_max_erratic_warn}} are ok. {{.Eval .Alert.Vars.g_max_erratic_crit}} are critical
<br>
{{if gt (.Eval .Alert.Vars.g_erratic) (.Eval .Alert.Vars.g_max_erratic_crit)}}
<span style="color:red;">
{{else if lt (.Eval .Alert.Vars.g_erratic) (.Eval .Alert.Vars.g_max_erratic_warn)}}
<span style="color:green;">
{{else}}
<span>
{{end}}
{{.Eval .Alert.Vars.g_erratic}}</span>
<div style="height: 450px">
{{.Graph .Alert.Vars.g_reqs }}
</div>
{{end}}
<br>
<a href="http://grafana/#/dashboard/db/templatetimeshift?var-patt={{.Alert.Vars.tpl_g_gr}}">view timeshift</a>
<br>
<a href="http://graphexplorer/index/{{.Alert.Vars.tpl_g_ge}}">view in GE</a>
<br>
<br>
<h2>{{.Alert.Vars.tpl_unit}}, per {{.Alert.Vars.tpl_s_tag}}</h2>
<br><a href="http://graphexplorer/index/{{.Alert.Vars.tpl_s_ge}}">GE graph</a>
<br>median diff lower than {{.Alert.Vars.s_min_med_diff}} is bad (in red).
<table style="border-spacing: 13px 3px;">
<tr>
<th>{{.Alert.Vars.tpl_s_tag}}</th>
<th>importance</th>
<th>{{.Alert.Vars.period}} ago (med +- dev)</th>
<th>now (med +- dev)</th>
<th>med diff (in devs)</th>
</tr>
{{ $sorted_importance := (.EvalAll .Alert.Vars.importance).DescByValue}}
{{range $r := .LeftJoin $sorted_importance .Alert.Vars.s_hist_med .Alert.Vars.s_hist_dev .Alert.Vars.s_now_med .Alert.Vars.s_now_dev .Alert.Vars.s_med_diff .Alert.Vars.s_med_bad}}
{{$importance := (index $r 0).Value}}
{{$hist_med := (index $r 1).Value}}
{{$hist_dev := (index $r 2).Value}}
{{$now_med := (index $r 3).Value}}
{{$now_dev := (index $r 4).Value}}
{{$med_diff := (index $r 5).Value}}
{{$med_bad := (index $r 6).Value}}
{{$s_val := index (index $r 0).Group $.Alert.Vars.tpl_s_tag}}
<tr>
<td>{{$s_val}}</td>
<td style="width:100px; background-color:GhostWhite; border-left:{{$importance}}px solid lightsteelblue;"></td>
<td>{{printf "%.0f" $hist_med}} +- {{ printf "%.0f" $hist_dev}}</td>
<td>{{printf "%.0f" $now_med}} +- {{ printf "%.0f" $now_dev}}</td>
{{if gt $med_bad 0.0}}
<td style="color:red;" >{{printf "%.0f" $med_diff}}</td>
{{else}}
<td style="color:green;"> {{printf "%.0f" $med_diff}}</td>
{{end}}
<td><a href="http://grafana/#/dashboard/db/templatetimeshift?var-patt={{printf $.Alert.Vars.tpl_s_gr_fmt $s_val}}">view timeshift</a></td>
<td><a href="http://graphexplorer/index/{{printf $.Alert.Vars.tpl_s_ge_fmt $s_val}}">view in GE</a></td>
</tr>
{{end}}
</table>`
subject =`
{{.Last.Status}}: {{.Alert.Name}} :
Global {{.Alert.Vars.tpl_unit}} strength: {{.Eval .Alert.Vars.g_strength | printf "%.3f"}}
{{if gt (.Eval .Alert.Vars.g_max_erratic_warn) -1.0}}
- erraticness: {{.Eval .Alert.Vars.g_erratic | printf "%.3f"}}
{{end}}
| {{.Eval .Alert.Vars.s_med_issues | printf "%.0f"}} {{.Alert.Vars.tpl_s_tag_plural}} with dropped median:
{{range $r :=.EvalAll .Alert.Vars.s_med_bad}}
{{ if gt $r.Value 0.0}}
{{ index $r.Group $.Alert.Vars.tpl_s_tag }}
{{end}}
{{end}}`
}
{
"annotations": {
"enable": true,
"list": [
{
"datasource": "elasticsearch",
"enable": false,
"iconColor": "#C0C6BE",
"iconSize": 13,
"index": "anthracite",
"lineColor": "rgba(33, 16, 218, 0.59)",
"name": "anthracite",
"query": "",
"showLine": true,
"timeField": "date"
}
]
},
"editable": true,
"hideAllLegends": false,
"hideControls": false,
"id": null,
"nav": [
{
"collapse": false,
"enable": true,
"notice": false,
"now": true,
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"status": "Stable",
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
],
"type": "timepicker"
}
],
"originalTitle": "template-timeshift",
"refresh": false,
"rows": [
{
"collapse": false,
"editable": true,
"height": "600px",
"panels": [
{
"aliasColors": {
"now": "#890F02",
"now-14d": "#1F78C1",
"now-21d": "#052B51",
"now-7d": "#CFFAFF"
},
"bars": false,
"datasource": null,
"editable": true,
"error": false,
"fill": 0,
"grid": {
"leftMax": null,
"leftMin": null,
"rightMax": null,
"rightMin": null,
"threshold1": null,
"threshold1Color": "rgba(216, 200, 27, 0.27)",
"threshold2": null,
"threshold2Color": "rgba(234, 112, 112, 0.22)"
},
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "now",
"lines": false,
"pointradius": 1,
"points": true
}
],
"span": 12,
"stack": false,
"steppedLine": false,
"targets": [
{
"target": "alias($patt, 'now')"
},
{
"target": "alias(timeShift($patt, '7d'), 'now-7d')"
},
{
"target": "alias(timeShift($patt, '14d'), 'now-14d')"
},
{
"target": "alias(timeShift($patt, '21d'), 'now-21d')"
}
],
"title": "$patt",
"tooltip": {
"shared": false,
"value_type": "cumulative"
},
"type": "graph",
"x-axis": true,
"y-axis": true,
"y_formats": [
"short",
"short"
]
}
],
"title": "Row1"
}
],
"sharedCrosshair": false,
"style": "dark",
"tags": [],
"templating": {
"enable": true,
"list": [
{
"allFormat": "glob",
"current": {
"text": "",
"value": ""
},
"datasource": null,
"includeAll": false,
"name": "patt",
"options": [
{
"text": "",
"value": ""
}
],
"query": "",
"refresh_on_load": false,
"type": "custom"
}
]
},
"time": {
"from": "now-7d",
"to": "now"
},
"timezone": "browser",
"title": "template-timeshift",
"version": 6
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment