Last active
June 16, 2016 01:41
-
-
Save Dieterbe/d1892fa0b4454b892216 to your computer and use it in GitHub Desktop.
Bosun macros, template and Grafana dashboard for fault detection on seasonal data (see blog post)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Alert on web requests, globally and per country.
# The plain $... variables parameterize the dm-load / dm-logic macros;
# the $tpl_... variables are only read by the "dm" notification template.
alert requests_by_country {
    # window length ($duration) and band period ($period) handed to graphite()/graphiteBand()
    $duration = 60m
    $period = 7d
    # graphite queries: the global series and the per-country series
    $q_global = transformNull(sum(stats.pweb*.request.web),0)
    $q_segg = aliasByNode(transformNull(stats._sum_pweb.request_by_country.*,0),3)
    # format and num arguments for the per-country graphite()/graphiteBand() calls
    $s_tag = country
    $s_num = 2
    # a per-country median diff (expressed in historical deviations) below this counts as bad
    $s_min_med_diff = -5
    # more bad countries than this makes the alert critical
    $s_max_num_issues_crit = 10
    # global strength (now median / historical median) below this is critical
    $g_min_strength = 0.7
    # erraticness thresholds; the macros skip the corresponding check when set to -1
    $g_max_erratic_warn = 6
    $g_max_erratic_crit = 9
    # window over which the current deviation is measured for erraticness
    $g_erratic_period = 10m
    macro = dm-load
    # $default_importance is defined by dm-load, so this must come after it
    $importance = $default_importance
    macro = dm-logic
    # labels, graph-explorer queries and graphite patterns used to build the notification
    $tpl_s_tag = country
    $tpl_s_tag_plural = countries
    $tpl_unit = Web requests
    $tpl_g_ge = statsd request.web sum by n1 from -1week||
    $tpl_s_ge = request by country _sum_pweb from -1hour||
    $tpl_s_ge_fmt = request by country _sum_pweb n3=%s from -1week||
    $tpl_g_gr = sum(stats.pweb*.request.web)
    $tpl_s_gr_fmt = stats._sum_pweb.request_by_country.%s
    warnNotification = web
    critNotification = web
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# loads in all the data
# the macro is split in 2 steps (dm-load, dm-logic) so that you can use some of
# these vars for lookupSeries etc. in between
# expects the including alert to define: $q_global, $q_segg, $duration, $period,
# $s_tag, $s_num and $g_erratic_period
macro dm-load {
    template = dm
    # global levels then and now, to check strength
    $g_hist = graphiteBand("$q_global", "$duration", "$period", "", 1)
    $g_now = graphite("$q_global", "$duration", "", "")
    $g_hist_med = median($g_hist)
    $g_now_med = median($g_now)
    $g_hist_dev = dev($g_hist)
    $g_now_dev = dev($g_now)
    # global deviation (erraticness) checking
    # erraticness is the current deviation against the old deviation, adjusted for traffic ratio
    # note that our period is usually shorter than what we typically use for median checks
    # because we need to quickly find possibly short spikes
    # we exclude the last 1m because graphite might not be up to date. actually this shouldn't
    # be needed because graphite will report None which bosun should filter out, but q_global
    # often has transformNull to 0 enabled because due to statsd that's what it typically means
    # $g_reqs is for visualization only (graphed by the template)
    $g_reqs = graphite("$q_global", "1d", "1m", "")
    # note: $g_NOW (short erratic window, ending 1m ago) differs from $g_now (full $duration) only by case
    $g_NOW = graphite("$q_global", "$g_erratic_period", "1m", "")
    # we add 0.01 so that we can be sure never to divide by zero.
    $g_erratic = (dev($g_NOW) * $g_hist_med) / ( ($g_hist_dev * median($g_NOW)) + 0.01)
    # levels segregated by tag, for median checks
    $s_hist = graphiteBand("$q_segg", "$duration", "$period", "$s_tag", $s_num)
    $s_now = graphite("$q_segg", "$duration", "", "$s_tag")
    $s_hist_med = median($s_hist)
    $s_hist_dev = dev($s_hist)
    $s_now_med = median($s_now)
    $s_now_dev = dev($s_now)
    # historical median scaled against the largest historical median across all tags:
    # the largest gets 100, smaller ones approach 10
    $default_importance = ($s_hist_med * 90)/ max(t($s_hist_med, "")) + 10
}
# do the actual processing
# expects the variables set by dm-load plus the alert's thresholds:
# $s_min_med_diff, $s_max_num_issues_crit, $g_min_strength,
# $g_max_erratic_warn and $g_max_erratic_crit
macro dm-logic {
    # global strength: current median relative to the historical median
    $g_strength = $g_now_med/$g_hist_med
    # median check on the series segregated by tag
    # which deviation to use?
    # * now: if traffic drops to 0 then dev = 0 and this blows up, which is good but i rather use a dedicated "has become 0" check which is more clear
    # * hist: if there was an outage in the past, the same happens, except hist_med was 0 so diff would still be large, so we'd correctly not trigger here
    # (though we should always use past data that was not an outage for this detection to work reliably. that's a bosun todo)
    # the 0.01 keeps the denominator non-zero
    $s_med_diff = ($s_now_med - $s_hist_med)/($s_hist_dev+0.01)
    $s_med_bad = $s_med_diff < $s_min_med_diff
    # number of tags whose median dropped too far
    $s_med_issues = sum(t($s_med_bad,""))
    # warn: any tag is bad, or erraticness is above the warn threshold (check skipped when threshold is -1)
    warn = $s_med_issues > 0 || ($g_max_erratic_warn != -1 && $g_erratic > $g_max_erratic_warn)
    # crit: too many bad tags, global strength too low, or erraticness above the crit threshold (skipped when -1)
    crit = $s_med_issues > $s_max_num_issues_crit || $g_strength < $g_min_strength || ($g_max_erratic_crit != -1 && $g_erratic > $g_max_erratic_crit)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Notification template for alerts built on the dm-load / dm-logic macros.
# body: HTML report with the global level, optional erraticness section and a per-tag table.
# subject: one-line summary with global strength, optional erraticness and the tags whose median dropped.
# Reads the alert's $tpl_* variables for labels and links.
template dm {
    body = `
    <a href="{{.Ack}}">Acknowledge alert</a>
    <br/>
    {{/* global level: historical median vs current median */}}
    <h2>{{.Alert.Vars.tpl_unit}}, Global</h2>
    Total amount now should not be much less than in the past
    <table>
        <tr>
            <td>{{.Alert.Vars.period}} ago</td>
            <td>{{.Eval .Alert.Vars.g_hist_med | printf "%.0f"}}</td>
        </tr>
        {{/* red when strength is below the minimum, plain when below 1, green otherwise */}}
        {{if lt (.Eval .Alert.Vars.g_strength) (.Eval .Alert.Vars.g_min_strength)}}
        <tr style="color:red;">
        {{else if lt (.Eval .Alert.Vars.g_strength) 1.0}}
        <tr>
        {{else}}
        <tr style="color:green;">
        {{end}}
            <td>Now</td>
            <td>{{.Eval .Alert.Vars.g_now_med | printf "%.0f"}}</td>
        </tr>
    </table>
    {{/* erraticness section is only rendered when the warn threshold is above -1 */}}
    {{if gt (.Eval .Alert.Vars.g_max_erratic_warn) -1.0}}
    <h2>Erraticness of {{.Alert.Vars.tpl_unit}}, Global</h2>
    Erraticness - increased deviation - could be indicative of a spike or drop.
    Low values up to {{.Eval .Alert.Vars.g_max_erratic_warn}} are ok. {{.Eval .Alert.Vars.g_max_erratic_crit}} are critical
    <br>
    {{if gt (.Eval .Alert.Vars.g_erratic) (.Eval .Alert.Vars.g_max_erratic_crit)}}
    <span style="color:red;">
    {{else if lt (.Eval .Alert.Vars.g_erratic) (.Eval .Alert.Vars.g_max_erratic_warn)}}
    <span style="color:green;">
    {{else}}
    <span>
    {{end}}
    {{.Eval .Alert.Vars.g_erratic}}</span>
    <div style="height: 450px">
    {{.Graph .Alert.Vars.g_reqs }}
    </div>
    {{end}}
    <br>
    <a href="http://grafana/#/dashboard/db/templatetimeshift?var-patt={{.Alert.Vars.tpl_g_gr}}">view timeshift</a>
    <br>
    <a href="http://graphexplorer/index/{{.Alert.Vars.tpl_g_ge}}">view in GE</a>
    <br>
    <br>
    {{/* per-tag table, ordered by descending importance */}}
    <h2>{{.Alert.Vars.tpl_unit}}, per {{.Alert.Vars.tpl_s_tag}}</h2>
    <br><a href="http://graphexplorer/index/{{.Alert.Vars.tpl_s_ge}}">GE graph</a>
    <br>median diff lower than {{.Alert.Vars.s_min_med_diff}} is bad (in red).
    <table style="border-spacing: 13px 3px;">
        <tr>
            <th>{{.Alert.Vars.tpl_s_tag}}</th>
            <th>importance</th>
            <th>{{.Alert.Vars.period}} ago (med +- dev)</th>
            <th>now (med +- dev)</th>
            <th>med diff (in devs)</th>
        </tr>
        {{ $sorted_importance := (.EvalAll .Alert.Vars.importance).DescByValue}}
        {{range $r := .LeftJoin $sorted_importance .Alert.Vars.s_hist_med .Alert.Vars.s_hist_dev .Alert.Vars.s_now_med .Alert.Vars.s_now_dev .Alert.Vars.s_med_diff .Alert.Vars.s_med_bad}}
        {{/* columns of $r follow the argument order of the LeftJoin above */}}
        {{$importance := (index $r 0).Value}}
        {{$hist_med := (index $r 1).Value}}
        {{$hist_dev := (index $r 2).Value}}
        {{$now_med := (index $r 3).Value}}
        {{$now_dev := (index $r 4).Value}}
        {{$med_diff := (index $r 5).Value}}
        {{$med_bad := (index $r 6).Value}}
        {{$s_val := index (index $r 0).Group $.Alert.Vars.tpl_s_tag}}
        <tr>
            <td>{{$s_val}}</td>
            <td style="width:100px; background-color:GhostWhite; border-left:{{$importance}}px solid lightsteelblue;"></td>
            <td>{{printf "%.0f" $hist_med}} +- {{ printf "%.0f" $hist_dev}}</td>
            <td>{{printf "%.0f" $now_med}} +- {{ printf "%.0f" $now_dev}}</td>
            {{if gt $med_bad 0.0}}
            <td style="color:red;" >{{printf "%.0f" $med_diff}}</td>
            {{else}}
            <td style="color:green;"> {{printf "%.0f" $med_diff}}</td>
            {{end}}
            <td><a href="http://grafana/#/dashboard/db/templatetimeshift?var-patt={{printf $.Alert.Vars.tpl_s_gr_fmt $s_val}}">view timeshift</a></td>
            <td><a href="http://graphexplorer/index/{{printf $.Alert.Vars.tpl_s_ge_fmt $s_val}}">view in GE</a></td>
        </tr>
        {{end}}
    </table>`
    # subject lines are deliberately left unindented so the rendered subject text is unchanged
    subject = `
{{.Last.Status}}: {{.Alert.Name}} :
Global {{.Alert.Vars.tpl_unit}} strength: {{.Eval .Alert.Vars.g_strength | printf "%.3f"}}
{{if gt (.Eval .Alert.Vars.g_max_erratic_warn) -1.0}}
- erraticness: {{.Eval .Alert.Vars.g_erratic | printf "%.3f"}}
{{end}}
| {{.Eval .Alert.Vars.s_med_issues | printf "%.0f"}} {{.Alert.Vars.tpl_s_tag_plural}} with dropped median:
{{range $r :=.EvalAll .Alert.Vars.s_med_bad}}
{{ if gt $r.Value 0.0}}
{{ index $r.Group $.Alert.Vars.tpl_s_tag }}
{{end}}
{{end}}`
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "annotations": {
    "enable": true,
    "list": [
      {
        "datasource": "elasticsearch",
        "enable": false,
        "iconColor": "#C0C6BE",
        "iconSize": 13,
        "index": "anthracite",
        "lineColor": "rgba(33, 16, 218, 0.59)",
        "name": "anthracite",
        "query": "",
        "showLine": true,
        "timeField": "date"
      }
    ]
  },
  "editable": true,
  "hideAllLegends": false,
  "hideControls": false,
  "id": null,
  "nav": [
    {
      "collapse": false,
      "enable": true,
      "notice": false,
      "now": true,
      "refresh_intervals": [
        "5s",
        "10s",
        "30s",
        "1m",
        "5m",
        "15m",
        "30m",
        "1h",
        "2h",
        "1d"
      ],
      "status": "Stable",
      "time_options": [
        "5m",
        "15m",
        "1h",
        "6h",
        "12h",
        "24h",
        "2d",
        "7d",
        "30d"
      ],
      "type": "timepicker"
    }
  ],
  "originalTitle": "template-timeshift",
  "refresh": false,
  "rows": [
    {
      "collapse": false,
      "editable": true,
      "height": "600px",
      "panels": [
        {
          "aliasColors": {
            "now": "#890F02",
            "now-14d": "#1F78C1",
            "now-21d": "#052B51",
            "now-7d": "#CFFAFF"
          },
          "bars": false,
          "datasource": null,
          "editable": true,
          "error": false,
          "fill": 0,
          "grid": {
            "leftMax": null,
            "leftMin": null,
            "rightMax": null,
            "rightMin": null,
            "threshold1": null,
            "threshold1Color": "rgba(216, 200, 27, 0.27)",
            "threshold2": null,
            "threshold2Color": "rgba(234, 112, 112, 0.22)"
          },
          "id": 1,
          "legend": {
            "avg": false,
            "current": false,
            "max": false,
            "min": false,
            "show": true,
            "total": false,
            "values": false
          },
          "lines": true,
          "linewidth": 1,
          "links": [],
          "nullPointMode": "null",
          "percentage": false,
          "pointradius": 5,
          "points": false,
          "renderer": "flot",
          "seriesOverrides": [
            {
              "alias": "now",
              "lines": false,
              "pointradius": 1,
              "points": true
            }
          ],
          "span": 12,
          "stack": false,
          "steppedLine": false,
          "targets": [
            {
              "target": "alias($patt, 'now')"
            },
            {
              "target": "alias(timeShift($patt, '7d'), 'now-7d')"
            },
            {
              "target": "alias(timeShift($patt, '14d'), 'now-14d')"
            },
            {
              "target": "alias(timeShift($patt, '21d'), 'now-21d')"
            }
          ],
          "title": "$patt",
          "tooltip": {
            "shared": false,
            "value_type": "cumulative"
          },
          "type": "graph",
          "x-axis": true,
          "y-axis": true,
          "y_formats": [
            "short",
            "short"
          ]
        }
      ],
      "title": "Row1"
    }
  ],
  "sharedCrosshair": false,
  "style": "dark",
  "tags": [],
  "templating": {
    "enable": true,
    "list": [
      {
        "allFormat": "glob",
        "current": {
          "text": "",
          "value": ""
        },
        "datasource": null,
        "includeAll": false,
        "name": "patt",
        "options": [
          {
            "text": "",
            "value": ""
          }
        ],
        "query": "",
        "refresh_on_load": false,
        "type": "custom"
      }
    ]
  },
  "time": {
    "from": "now-7d",
    "to": "now"
  },
  "timezone": "browser",
  "title": "template-timeshift",
  "version": 6
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment