Skip to content

Instantly share code, notes, and snippets.

@richcollier
Last active April 1, 2022 15:59
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save richcollier/7e5603c366b9fcece6f1a8b1b3cf4d3f to your computer and use it in GitHub Desktop.
Save richcollier/7e5603c366b9fcece6f1a8b1b3cf4d3f to your computer and use it in GitHub Desktop.
# Chained watch for anomalies across jobs.
# Executes an inline watch that fires only when an anomalous bucket in job1 is
# accompanied by anomalous records in BOTH job2 and job3 within the lookback window
# that ends at the job1 bucket's timestamp. All tunables live under "metadata".
POST _xpack/watcher/watch/_execute
{
  "watch": {
    "trigger": {
      "schedule": {
        "interval": "1m"
      }
    },
    "metadata": {
      "watch_timespan": "20m",        // how far back the watch looks each invocation (should be > 2x bucket_span)
      "lookback_window": "10m",       // how far back to look in the other jobs for related anomalies
      "job1_name": "it_ops_kpi",
      "job1_min_anomaly_score": 75,   // minimum anomaly score (bucket score) for job1
      "job2_name": "it_ops_network",
      "job2_min_record_score": 10,    // minimum record score for anomalies in job2
      "job3_name": "it_ops_sql",
      "job3_min_record_score": 5      // minimum record score for anomalies in job3
    },
    "input": {
      "chain": {
        // Chain inputs run in order; job2 and job3 read the job1 result from ctx.payload.job1.
        "inputs": [
          {
            // job1: bucket results at or above the minimum anomaly score within watch_timespan.
            // NOTE(review): no explicit sort is given, so which bucket ends up as hits.hits.0
            // (the anchor used by job2/job3 and by the alert text) is not fixed by this request — confirm.
            "job1": {
              "search": {
                "request": {
                  "indices": [
                    ".ml-anomalies-*"
                  ],
                  "body": {
                    "query": {
                      "bool": {
                        "filter": [
                          { "range": { "timestamp": { "gte": "now-{{ctx.metadata.watch_timespan}}" } } },
                          { "term": { "result_type": "bucket" } },
                          { "term": { "job_id": "{{ctx.metadata.job1_name}}" } },
                          { "range": { "anomaly_score": { "gte": "{{ctx.metadata.job1_min_anomaly_score}}" } } }
                        ]
                      }
                    }
                  }
                }
              }
            }
          },
          {
            // job2: record results in the window [job1 timestamp - lookback_window, job1 timestamp].
            // NOTE(review): when job1 has no hits the timestamp template renders empty — confirm
            // how the chain behaves in that case.
            "job2": {
              "search": {
                "request": {
                  "indices": [
                    ".ml-anomalies-*"
                  ],
                  "body": {
                    "query": {
                      "bool": {
                        "filter": [
                          { "range": { "timestamp": { "gte": "{{ctx.payload.job1.hits.hits.0._source.timestamp}}||-{{ctx.metadata.lookback_window}}", "lte": "{{ctx.payload.job1.hits.hits.0._source.timestamp}}" } } },
                          { "term": { "result_type": "record" } },
                          { "term": { "job_id": "{{ctx.metadata.job2_name}}" } },
                          // example of optional filters
                          // { "regexp":{"field_name":{"value":"Out.*"}}},
                          // { "range": {"actual": {"gte":"1000.0"}}},
                          { "range": { "record_score": { "gte": "{{ctx.metadata.job2_min_record_score}}" } } }
                        ]
                      }
                    }
                  }
                }
              }
            }
          },
          {
            // job3: same window as job2, filtered on job3's name and minimum record score.
            "job3": {
              "search": {
                "request": {
                  "indices": [
                    ".ml-anomalies-*"
                  ],
                  "body": {
                    "query": {
                      "bool": {
                        "filter": [
                          { "range": { "timestamp": { "gte": "{{ctx.payload.job1.hits.hits.0._source.timestamp}}||-{{ctx.metadata.lookback_window}}", "lte": "{{ctx.payload.job1.hits.hits.0._source.timestamp}}" } } },
                          { "term": { "result_type": "record" } },
                          { "term": { "job_id": "{{ctx.metadata.job3_name}}" } },
                          // example of optional filters
                          // { "regexp":{"hostname":{"value":"dbserver.*"}}},
                          { "range": { "record_score": { "gte": "{{ctx.metadata.job3_min_record_score}}" } } }
                        ]
                      }
                    }
                  }
                }
              }
            }
          }
        ]
      }
    },
    "condition": {
      "script": {
        // return true only if all 3 jobs returned "hits", thus all had anomalies matching the input conditions
        "source": "return ctx.payload.job1.hits.total > 0 && ctx.payload.job2.hits.total > 0 && ctx.payload.job3.hits.total > 0"
      }
    },
    "actions": {
      "log": {
        // Reshape the payload for the log message: the job1 score and bucket time, plus one
        // entry per job2/job3 hit collected into lists with Java stream().
        // Each entry's bucket_time comes from that hit's own timestamp (p._source.timestamp),
        // not from the first hit of the job.
        "transform": {
          "script": """
            return [
              'anomaly_score': ctx.payload.job1.hits.hits.0._source.anomaly_score,
              'bucket_time': Instant.ofEpochMilli(ctx.payload.job1.hits.hits.0._source.timestamp).atZone(ZoneOffset.UTC).format(DateTimeFormatter.ofPattern('yyyy-MM-dd HH:mm:ss')),
              'job2_anomaly_details': ctx.payload.job2.hits.hits.stream().map(p -> [
                'bucket_time': Instant.ofEpochMilli(p._source.timestamp).atZone(ZoneOffset.UTC).format(DateTimeFormatter.ofPattern('yyyy-MM-dd HH:mm:ss')),
                'field_name': p._source.field_name,
                'score': p._source.record_score,
                'actual': p._source.actual.0,
                'typical': p._source.typical.0
              ]).collect(Collectors.toList()),
              'job3_anomaly_details': ctx.payload.job3.hits.hits.stream().map(p -> [
                'bucket_time': Instant.ofEpochMilli(p._source.timestamp).atZone(ZoneOffset.UTC).format(DateTimeFormatter.ofPattern('yyyy-MM-dd HH:mm:ss')),
                'hostname': p._source.hostname.0,
                'field_name': p._source.field_name,
                'score': p._source.record_score,
                'actual': p._source.actual.0,
                'typical': p._source.typical.0
              ]).collect(Collectors.toList())
            ];
          """
        },
        "logging": {
          // The lookback period is rendered from metadata so the message stays accurate
          // when lookback_window is changed.
          "text": "[CRITICAL] Anomaly Alert for job {{ctx.metadata.job1_name}}: score={{ctx.payload.anomaly_score}} at {{ctx.payload.bucket_time}} UTC \nPossibly influenced by these other anomalous metrics (within the prior {{ctx.metadata.lookback_window}}):\njob:{{ctx.metadata.job2_name}}: (anomalies with at least a record score of {{ctx.metadata.job2_min_record_score}}):\n{{#ctx.payload.job2_anomaly_details}}field={{field_name}}: score={{score}}, value={{actual}} (typical={{typical}}) at {{bucket_time}} UTC\n{{/ctx.payload.job2_anomaly_details}}\njob:{{ctx.metadata.job3_name}}: (anomalies with at least a record score of {{ctx.metadata.job3_min_record_score}}):\n{{#ctx.payload.job3_anomaly_details}}hostname={{hostname}} field={{field_name}}: score={{score}}, value={{actual}} (typical={{typical}}) at {{bucket_time}} UTC\n{{/ctx.payload.job3_anomaly_details}}"
        }
      }
    }
  }
}
@richcollier
Copy link
Author

Example output:

[2018-03-17T09:02:28,053][INFO ][o.e.x.w.a.l.ExecutableLoggingAction] [Iv3Ksae] [CRITICAL] Anomaly Alert for job it_ops_kpi: score=85.4309 at 2017-02-08 15:15:00 UTC
Possibly influenced by these other anomalous metrics (within the prior 10 minutes):
job:it_ops_network: (anomalies with at least a record score of 10):
field=In_Octets: score=11.217614808972602, value=13610.62255859375 (typical=855553.8944717721) at 2017-02-08 15:15:00 UTC
field=Out_Octets: score=17.00518, value=1.9079535783333334E8 (typical=1116062.402864764) at 2017-02-08 15:15:00 UTC
field=Out_Discards: score=72.99199, value=137.04444376627606 (typical=0.012289061361553099) at 2017-02-08 15:15:00 UTC
job:it_ops_sql: (anomalies with at least a record score of 5):
hostname=dbserver.acme.com field=SQLServer_Buffer_Manager_Page_life_expectancy: score=6.023424, value=846.0000000000005 (typical=12.609336298838242) at 2017-02-08 15:10:00 UTC
hostname=dbserver.acme.com field=SQLServer_Buffer_Manager_Buffer_cache_hit_ratio: score=8.337633, value=96.93249340057375 (typical=98.93088463835487) at 2017-02-08 15:10:00 UTC
hostname=dbserver.acme.com field=SQLServer_General_Statistics_User_Connections: score=27.97728, value=168.15000000000006 (typical=196.1486370757187) at 2017-02-08 15:10:00 UTC

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment