Add promtool unit tests for the Prometheus alerting rules and run them in CI.

  * .gitlab-ci.yml: after `promtool check rules`, pipe alerting-rules.yaml
    into `promtool test rules`; the test file lists /dev/stdin as its only
    rule file, so the rules under test arrive on standard input.
  * alerting-rule-tests.yaml (new): synthetic input series plus the alerts
    expected at given evaluation times for HighCPUUsage,
    LowNumberOfNewBlocks, BlockFinalizationSlow, TransactionQueueSize and
    LowNumberOfPeers.
  * alerting-rules.yaml: wrap long `message`/`expr` scalars as multi-line
    quoted strings. The re-wrapped BlockFinalizationLaggingBehind expression
    queries polkadot_block_height, the metric name used by every other
    block-height rule and by the test input series.

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index e146d40ee69a1..76ae9349002be 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -367,6 +367,7 @@ test-prometheus-alerting-rules:
     - curl -L https://github.com/prometheus/prometheus/releases/download/v2.19.0/prometheus-2.19.0.linux-amd64.tar.gz --output prometheus.tar.gz
     - tar -xzf prometheus.tar.gz
     - ./prometheus-*/promtool check rules .maintain/monitoring/alerting-rules/alerting-rules.yaml
+    - cat .maintain/monitoring/alerting-rules/alerting-rules.yaml | ./prometheus-*/promtool test rules .maintain/monitoring/alerting-rules/alerting-rule-tests.yaml
 
 #### stage: build
 
diff --git a/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml b/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml
new file mode 100644
index 0000000000000..069cfaf977b50
--- /dev/null
+++ b/.maintain/monitoring/alerting-rules/alerting-rule-tests.yaml
@@ -0,0 +1,239 @@
+rule_files:
+  - /dev/stdin
+
+evaluation_interval: 1m
+
+tests:
+  - interval: 1m
+    input_series:
+      - series: 'polkadot_sub_libp2p_peers_count{
+          job="polkadot",
+          pod="polkadot-abcdef01234-abcdef",
+          instance="polkadot-abcdef01234-abcdef",
+          }'
+        values: '3 2+0x4 1+0x9' # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
+
+      - series: 'polkadot_sub_txpool_validations_scheduled{
+          job="polkadot",
+          pod="polkadot-abcdef01234-abcdef",
+          instance="polkadot-abcdef01234-abcdef",
+          }'
+        values: '10+1x30' # 10 11 12 13 .. 40
+
+      - series: 'polkadot_sub_txpool_validations_finished{
+          job="polkadot",
+          pod="polkadot-abcdef01234-abcdef",
+          instance="polkadot-abcdef01234-abcdef",
+          }'
+        values: '0x30' # 0 0 0 0 .. 0
+
+      - series: 'polkadot_block_height{
+          status="best", job="polkadot",
+          pod="polkadot-abcdef01234-abcdef",
+          instance="polkadot-abcdef01234-abcdef",
+          }'
+        values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...
+
+      - series: 'polkadot_block_height{
+          status="finalized",
+          job="polkadot",
+          pod="polkadot-abcdef01234-abcdef",
+          instance="polkadot-abcdef01234-abcdef",
+          }'
+        values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...
+
+      - series: 'polkadot_cpu_usage_percentage{
+          job="polkadot",
+          pod="polkadot-abcdef01234-abcdef",
+          instance="polkadot-abcdef01234-abcdef",
+          }'
+        values: '0+20x5 100+0x5' # 0 20 40 60 80 100 100 100 100 100 100
+
+    alert_rule_test:
+
+      ######################################################################
+      # Resource usage
+      ######################################################################
+
+      - eval_time: 9m
+        alertname: HighCPUUsage
+        exp_alerts:
+      - eval_time: 10m
+        alertname: HighCPUUsage
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+            exp_annotations:
+              message: "The node polkadot-abcdef01234-abcdef has a CPU
+                usage higher than 100% for more than 5 minutes"
+
+      ######################################################################
+      # Block production
+      ######################################################################
+
+      - eval_time: 6m
+        alertname: LowNumberOfNewBlocks
+        exp_alerts:
+      - eval_time: 7m
+        alertname: LowNumberOfNewBlocks
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+              status: best
+            exp_annotations:
+              message: "Less than one new block per minute on instance
+                polkadot-abcdef01234-abcdef."
+
+      - eval_time: 14m
+        alertname: LowNumberOfNewBlocks
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+              status: best
+            exp_annotations:
+              message: "Less than one new block per minute on instance
+                polkadot-abcdef01234-abcdef."
+          - exp_labels:
+              severity: critical
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+              status: best
+            exp_annotations:
+              message: "Less than one new block per minute on instance
+                polkadot-abcdef01234-abcdef."
+
+      ######################################################################
+      # Block finalization
+      ######################################################################
+
+      - eval_time: 6m
+        alertname: BlockFinalizationSlow
+        exp_alerts:
+      - eval_time: 7m
+        alertname: BlockFinalizationSlow
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+              status: finalized
+            exp_annotations:
+              message: "Finalized block on instance
+                polkadot-abcdef01234-abcdef increases by less than 1 per
+                minute."
+
+      - eval_time: 14m
+        alertname: BlockFinalizationSlow
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+              status: finalized
+            exp_annotations:
+              message: "Finalized block on instance
+                polkadot-abcdef01234-abcdef increases by less than 1 per
+                minute."
+          - exp_labels:
+              severity: critical
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+              status: finalized
+            exp_annotations:
+              message: "Finalized block on instance
+                polkadot-abcdef01234-abcdef increases by less than 1 per
+                minute."
+
+      ######################################################################
+      # Transaction queue
+      ######################################################################
+
+      - eval_time: 10m
+        alertname: TransactionQueueSize
+        exp_alerts:
+      - eval_time: 11m
+        alertname: TransactionQueueSize
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+            exp_annotations:
+              message: "The node polkadot-abcdef01234-abcdef has more
+                than 10 transactions in the queue for more than 10
+                minutes"
+
+      - eval_time: 31m
+        alertname: TransactionQueueSize
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+            exp_annotations:
+              message: "The node polkadot-abcdef01234-abcdef has more
+                than 10 transactions in the queue for more than 10
+                minutes"
+          - exp_labels:
+              severity: critical
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+            exp_annotations:
+              message: "The node polkadot-abcdef01234-abcdef has more
+                than 10 transactions in the queue for more than 30
+                minutes"
+
+      ######################################################################
+      # Networking
+      ######################################################################
+
+      - eval_time: 3m # Values: 3 2 2
+        alertname: LowNumberOfPeers
+        exp_alerts:
+      - eval_time: 4m # Values: 2 2 2
+        alertname: LowNumberOfPeers
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+            exp_annotations:
+              message: "The node polkadot-abcdef01234-abcdef has less
+                than 3 peers for more than 3 minutes"
+
+      - eval_time: 16m # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
+        alertname: LowNumberOfPeers
+        exp_alerts:
+          - exp_labels:
+              severity: warning
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+            exp_annotations:
+              message: "The node polkadot-abcdef01234-abcdef has less
+                than 3 peers for more than 3 minutes"
+          - exp_labels:
+              severity: critical
+              pod: polkadot-abcdef01234-abcdef
+              instance: polkadot-abcdef01234-abcdef
+              job: polkadot
+            exp_annotations:
+              message: "The node polkadot-abcdef01234-abcdef has less
+                than 3 peers for more than 15 minutes"
diff --git a/.maintain/monitoring/alerting-rules/alerting-rules.yaml b/.maintain/monitoring/alerting-rules/alerting-rules.yaml
index cb5b3c271dd14..06d204f7afa41 100644
--- a/.maintain/monitoring/alerting-rules/alerting-rules.yaml
+++ b/.maintain/monitoring/alerting-rules/alerting-rules.yaml
@@ -12,7 +12,8 @@ groups:
     labels:
       severity: warning
     annotations:
-      message: 'The node {{ $labels.instance }} has a CPU usage higher than 100% for more than 5 minutes'
+      message: 'The node {{ $labels.instance }} has a CPU usage higher than 100%
+        for more than 5 minutes'
 
   ##############################################################################
   # Block production
@@ -20,14 +21,16 @@ groups:
 
   - alert: LowNumberOfNewBlocks
     annotations:
-      message: 'Less than one new block per minute on instance {{ $labels.instance }}.'
+      message: 'Less than one new block per minute on instance {{
+        $labels.instance }}.'
     expr: increase(polkadot_block_height{status="best"}[1m]) < 1
     for: 3m
     labels:
       severity: warning
   - alert: LowNumberOfNewBlocks
     annotations:
-      message: 'Less than one new block per minute on instance {{ $labels.instance }}.'
+      message: 'Less than one new block per minute on instance {{
+        $labels.instance }}.'
     expr: increase(polkadot_block_height{status="best"}[1m]) < 1
     for: 10m
     labels:
@@ -43,43 +46,51 @@ groups:
     labels:
       severity: warning
     annotations:
-      message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute.'
+      message: 'Finalized block on instance {{ $labels.instance }} increases by
+        less than 1 per minute.'
   - alert: BlockFinalizationSlow
     expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
     for: 10m
     labels:
       severity: critical
     annotations:
-      message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute.'
+      message: 'Finalized block on instance {{ $labels.instance }} increases by
+        less than 1 per minute.'
   - alert: BlockFinalizationLaggingBehind
     # Under the assumption of an average block production of 6 seconds,
     # "best" and "finalized" being more than 10 blocks apart would imply
     # more than a 1 minute delay between block production and finalization.
-    expr: (polkadot_block_height_number{status="best"} - ignoring(status) polkadot_block_height_number{status="finalized"}) > 10
+    expr: '(polkadot_block_height{status="best"} - ignoring(status)
+      polkadot_block_height{status="finalized"}) > 10'
     for: 8m
     labels:
       severity: critical
     annotations:
-      message: "Block finalization on instance {{ $labels.instance }} is behind block production by {{ $value }} for more than 8m"
+      message: "Block finalization on instance {{ $labels.instance }} is behind
+        block production by {{ $value }} for more than 8m"
 
   ##############################################################################
   # Transaction queue
   ##############################################################################
 
   - alert: TransactionQueueSize
-    expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10
+    expr: 'polkadot_sub_txpool_validations_scheduled -
+      polkadot_sub_txpool_validations_finished > 10'
     for: 10m
     labels:
       severity: warning
     annotations:
-      message: 'The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 10 minutes'
+      message: 'The node {{ $labels.instance }} has more than 10 transactions in
+        the queue for more than 10 minutes'
   - alert: TransactionQueueSize
-    expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10
+    expr: 'polkadot_sub_txpool_validations_scheduled -
+      polkadot_sub_txpool_validations_finished > 10'
     for: 30m
     labels:
       severity: critical
     annotations:
-      message: 'The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 30 minutes'
+      message: 'The node {{ $labels.instance }} has more than 10 transactions in
+        the queue for more than 30 minutes'
 
   ##############################################################################
   # Networking
@@ -91,23 +102,28 @@ groups:
     labels:
       severity: warning
     annotations:
-      message: 'The node {{ $labels.instance }} has less than 3 peers for more than 3 minutes'
+      message: 'The node {{ $labels.instance }} has less than 3 peers for more
+        than 3 minutes'
   - alert: LowNumberOfPeers
     expr: polkadot_sub_libp2p_peers_count < 3
     for: 15m
     labels:
       severity: critical
     annotations:
-      message: 'The node {{ $labels.instance }} has less than 3 peers for more than 15 minutes'
+      message: 'The node {{ $labels.instance }} has less than 3 peers for more
+        than 15 minutes'
 
   ##############################################################################
   # Others
   ##############################################################################
 
   - alert: AuthorityDiscoveryHighDiscoveryFailure
-    expr: polkadot_authority_discovery_handle_value_found_event_failure / ignoring(name) polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5
+    expr: 'polkadot_authority_discovery_handle_value_found_event_failure /
+      ignoring(name)
+      polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5'
     for: 2h
     labels:
       severity: warning
     annotations:
-      message: "Authority discovery on node {{ $labels.instance }} fails to process more than 50 % of the values found on the DHT."
+      message: "Authority discovery on node {{ $labels.instance }} fails to
+        process more than 50 % of the values found on the DHT."