Created
November 5, 2020 14:11
-
-
Save sourabh-agrawal/0efc4f6b7968afce66741a221faa4e7d to your computer and use it in GitHub Desktop.
Monitor Kafka brokers with ElastAlert and get a Slack alert when the average normalized CPU over the past 15 minutes crosses 90%.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ElastAlert rule: send a Slack alert when the average normalized CPU of a
# Kafka broker over the past 15 minutes crosses the 90% mark.
index: "metricbeat-live*"
description: "This alert is fired when avg normalized cpu over past 15 minutes has crossed 90% mark"
name: central-kafka-cpu-critical
type: metric_aggregation
alert: slack

# Query Elasticsearch every 1 hour.
run_every:
  hours: 1

# Poll the last 15 minutes of data.
buffer_time:
  minutes: 15

# Re-alert after 1 hour.
realert:
  hours: 1

doc_type: _doc

# Here tags_hash.id is the BrokerId, e.g. (1, 2, 3).
query_key: tags_hash.id
metric_agg_key: system.cpu.total.norm.pct
metric_agg_type: avg
max_threshold: 0.90

# Events are usually generated every 20s, so there will be 45 events in
# 15 minutes. Set the bar at 30.
min_doc_count: 30

# metric_aggregation doesn't provide access to keys other than query_key and
# the aggregation result for use in alert_text. So if we really want to see
# other fields in Slack, the following is a workaround.
# With top_count_keys ElastAlert will do a terms query to Elasticsearch to get
# the top top_count_number values of each field defined in top_count_keys.
#top_count_keys:
#  - tags_hash.id
#  - tags_hash.application
#  - tags_hash.cluster
#  - host.name
# By default ElastAlert appends ".raw/.keyword" to the fields defined in
# top_count_keys. Disable this behaviour with raw_count_keys.
#raw_count_keys: false
#top_count_number: 1

filter:
  - query:
      query_string:
        query: "tags_hash.cluster: central AND tags_hash.application: kafka AND metricset.name: cpu"

# Slack alert configuration
slack_webhook_url: "<paste_your_webhook_url_here>"
slack_text_string: ":WARNING::mega: CPU Alert :boom::fire:"
slack_emoji_override: ":robot_face:"
# Available colors: good, warning, danger
slack_msg_color: 'danger'

alert_text_type: alert_text_only
# Double-quoted on purpose: the \n and \t escapes must be interpreted, and the
# physical line breaks below are folded by YAML into single spaces.
# {0} and {1} are filled from alert_text_args, in order.
alert_text: "\n
  *CPU*\t\t\t\t :\t{0}\n
  *Cluster*\t\t\t:\tCentral\n
  *Application*\t:\tKafka\n
  *BrokerId*\t\t :\t{1}\n\n
  _This alert is fired when avg normalized cpu over past 15 minutes has crossed 90% mark_"

# The calculated aggregated CPU average is stored in
# metric_system.cpu.total.norm.pct_avg. We don't need to include these.
alert_text_args:
  - "metric_system.cpu.total.norm.pct_avg"
  - "tags_hash.id"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
good job