from spec_types.alert_utils import (
    create_alert_group,
    create_group_by_trigger,
    get_cluster,
    AlertMetric,
    MAINNET_SERVICE,
    PREVIEWNET_SERVICE,
)
from spec_types.lib_fix_vour import (
    AlertRuleVOur,
)

CLUSTER = get_cluster()

if CLUSTER == "managed":
    services_to_monitor = [
        (PREVIEWNET_SERVICE, {"route_by": "operator", "urgency": "low"})
    ]
else:
    services_to_monitor = [(MAINNET_SERVICE, {"route_by": "operator"})]
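
# Argument string passed to the PromQL label_replace() calls below: it sets
# the "operator" label to the second capture group ($2) of the regex applied
# to kubernetes_pod_name.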
OPERATOR_LABEL_REPLACE_ARGS = (
    '"operator", "$2", "kubernetes_pod_name", "(.*):(.*?)[-/](.*)"'
)
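
# One inner list of alert rules is built per monitored (service, labels)
# pair; the nested lists are flattened into a single rule list when the alert
# group is created at the bottom of this file.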
rules = [
    [
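        # Fires when the least-connected validator run by an operator is
        # connected to fewer than 80% of the current validator set.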
        AlertRuleVOur(
            # Each rule must have a unique title
            title=f"[{service.name}][node][operator] "
            "Node connected to less than 80% of other nodes",
            condition="C",
            # Several triggers can be used per alert
            triggers=create_group_by_trigger(
                metric=AlertMetric(
                    expr_template="""
min by (operator) (
  max_over_time(
    label_replace(
      (
        1 + sum by (kubernetes_pod_name)(
          aptos_connections{{\
{SERVICE_FILTER}, network_id="Validator", role_type="validator"}}
        )
      )
      /
      quantile(0.5, aptos_consensus_current_epoch_validators{{\
{SERVICE_FILTER}, role="validator"}}),
      {OPERATOR_LABEL_REPLACE_ARGS}
    )[{AGGREGATION_INTERVAL}]
  )
)
""",
                    name="Percent of connections for the node with the "
                    "fewest connections, by a node operator",
                    additional_template_args={
                        "OPERATOR_LABEL_REPLACE_ARGS": OPERATOR_LABEL_REPLACE_ARGS,  # noqa: E501
                    },
                ),
                service=service,
                condition_expression="{SOURCE} < 0.8",
                aggregation_interval="20m",
            ),
            annotations={
                "summary": "Node operator has a node with fewer than 80% "
                "of connections (over the last 20 minutes)",
            },
            labels=labels,
            timeRangeFrom=4 * 3600,
        ),
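        # Fires when a validator run by an operator committed no votes and no
        # proposals during the aggregation window.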
        AlertRuleVOur(
            # Each rule must have a unique title
            title=f"[{service.name}][node][operator] Node not participating "
            "in consensus",
            condition="C",
            # Several triggers can be used per alert
            triggers=create_group_by_trigger(
                metric=AlertMetric(
                    expr_template="""
sum by (operator) (
  label_replace(
    max by (kubernetes_pod_name) (
      1 - clamp_max(
        sum_over_time(aptos_committed_votes_in_window{{\
{SERVICE_FILTER}, role="validator"}}[{AGGREGATION_INTERVAL}])
        + sum_over_time(aptos_committed_proposals_in_window{{\
{SERVICE_FILTER}, role="validator"}}[{AGGREGATION_INTERVAL}]),
        1
      )
    ),
    {OPERATOR_LABEL_REPLACE_ARGS}
  )
)
""",
                    name="Number of nodes not participating in consensus, "
                    "by node operator",
                    additional_template_args={
                        "OPERATOR_LABEL_REPLACE_ARGS": OPERATOR_LABEL_REPLACE_ARGS,  # noqa: E501
                    },
                ),
                service=service,
                condition_expression="{SOURCE} > 0",
                aggregation_interval="20m",
            ),
            annotations={
                "summary": "Number of nodes not participating in consensus, "
                "for a node operator, is at least 1 (over the last 20 minutes)",
            },
            labels=labels,
            timeRangeFrom=4 * 3600,
        ),
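        # Fires when a validator fullnode (VFN) had no connection to its
        # validator (its prioritized peer) at any point during the
        # aggregation window.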
        AlertRuleVOur(
            # Each rule must have a unique title
            title=f"[{service.name}][node][operator] Broken Validator-VFN "
            "connection",
            condition="C",
            # Several triggers can be used per alert
            triggers=create_group_by_trigger(
                metric=AlertMetric(
                    expr_template="""
sum by (operator) (
  label_replace(
    1 - clamp_max(
      max_over_time(
        aptos_data_client_connected_peers{{\
{SERVICE_FILTER}, role=~"validator_fullnode",peer_type="prioritized_peer"}}\
[{AGGREGATION_INTERVAL}]
      ),
      1
    ),
    {OPERATOR_LABEL_REPLACE_ARGS}
  )
)
""",
                    name="Number of VFNs without a connection to the "
                    "validator, by a node operator",
                    additional_template_args={
                        "OPERATOR_LABEL_REPLACE_ARGS": OPERATOR_LABEL_REPLACE_ARGS,  # noqa: E501
                    },
                ),
                service=service,
                condition_expression="{SOURCE} > 0",
                aggregation_interval="20m",
            ),
            annotations={
                "summary": "Number of VFNs without a connection to the "
                "validator, for a node operator, is at least 1 (over the last 20m)",
            },
            labels=labels,
            timeRangeFrom=4 * 3600,
        ),
    ]
    for service, labels in services_to_monitor
]

alertgroup = create_alert_group(
    name="operator_node_health",
    # These queries are expensive and the aggregation interval is large, so
    # the evaluation interval can be large too.
    evaluate_interval_s=300,
    evaluate_for="10m",
    # flatten rules
    rules=[rule for rules_for_service in rules for rule in rules_for_service],
)
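
# ---------------------------------------------------------------------------
# Illustrative sketch only, not part of the alert spec: how one of the
# expression templates above could be rendered into concrete PromQL. The real
# substitution happens inside the spec_types helpers (AlertMetric /
# create_group_by_trigger); the doubled "{{"/"}}" braces are consistent with
# str.format-style templating, but that is an assumption, as are the
# SERVICE_FILTER and AGGREGATION_INTERVAL values used below.
_EXAMPLE_EXPR_TEMPLATE = """
sum by (operator) (
  label_replace(
    1 - clamp_max(
      max_over_time(
        aptos_data_client_connected_peers{{\
{SERVICE_FILTER}, role=~"validator_fullnode",peer_type="prioritized_peer"}}\
[{AGGREGATION_INTERVAL}]
      ),
      1
    ),
    {OPERATOR_LABEL_REPLACE_ARGS}
  )
)
"""

if __name__ == "__main__":
    # Print the rendered PromQL for manual inspection.
    print(
        _EXAMPLE_EXPR_TEMPLATE.format(
            SERVICE_FILTER='chain_name="mainnet"',  # hypothetical label filter
            AGGREGATION_INTERVAL="20m",
            OPERATOR_LABEL_REPLACE_ARGS=OPERATOR_LABEL_REPLACE_ARGS,
        )
    )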