Skip to content
This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

Commit

Permalink
.maintain/monitoring: Add alerting rule tests (#6343)
Browse files Browse the repository at this point in the history
* .maintain/monitoring: Add alerting rule tests

* .maintain/monitoring/alerting-rules/alerting-rules.yaml: Break lines

* .gitlab-ci.yml: Add promtool rule testing step
  • Loading branch information
mxinden authored Jun 19, 2020
1 parent 369f9fc commit 31c3e06
Show file tree
Hide file tree
Showing 3 changed files with 271 additions and 15 deletions.
1 change: 1 addition & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ test-prometheus-alerting-rules:
- curl -L https://github.com/prometheus/prometheus/releases/download/v2.19.0/prometheus-2.19.0.linux-amd64.tar.gz --output prometheus.tar.gz
- tar -xzf prometheus.tar.gz
- ./prometheus-*/promtool check rules .maintain/monitoring/alerting-rules/alerting-rules.yaml
- cat .maintain/monitoring/alerting-rules/alerting-rules.yaml | ./prometheus-*/promtool test rules .maintain/monitoring/alerting-rules/alerting-rule-tests.yaml

#### stage: build

Expand Down
239 changes: 239 additions & 0 deletions .maintain/monitoring/alerting-rules/alerting-rule-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
# Unit tests for the Prometheus alerting rules, run with
# `promtool test rules`. The rules under test are read from stdin.
rule_files:
  - /dev/stdin

evaluation_interval: 1m

tests:
  - interval: 1m
    # Synthetic samples, one per minute starting at t=0. In the expanding
    # notation 'a+bxn' yields n+1 samples: a, a+b, ..., a+n*b.
    input_series:
      - series: 'polkadot_sub_libp2p_peers_count{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '3 2+0x4 1+0x9' # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1

      - series: 'polkadot_sub_txpool_validations_scheduled{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '10+1x30' # 10 11 12 13 .. 40

      - series: 'polkadot_sub_txpool_validations_finished{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '0x30' # 0 0 0 0 .. 0

      - series: 'polkadot_block_height{
          status="best",
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...

      - series: 'polkadot_block_height{
          status="finalized",
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '1+1x3 4+0x13' # 1 2 3 4 4 4 4 4 4 4 4 4 ...

      - series: 'polkadot_cpu_usage_percentage{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '0+20x5 100+0x5' # 0 20 40 60 80 100 100 100 100 100 100 100

    # Each alert is checked twice around its `for` threshold: once just
    # before it may fire (no alerts expected) and once when it must fire.
    alert_rule_test:

      ######################################################################
      # Resource usage
      ######################################################################

      - eval_time: 9m
        alertname: HighCPUUsage
        exp_alerts: []
      - eval_time: 10m
        alertname: HighCPUUsage
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has a CPU
                usage higher than 100% for more than 5 minutes"

      ######################################################################
      # Block production
      ######################################################################

      - eval_time: 6m
        alertname: LowNumberOfNewBlocks
        exp_alerts: []
      - eval_time: 7m
        alertname: LowNumberOfNewBlocks
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."

      # By 14m both the warning and the critical variant are expected.
      - eval_time: 14m
        alertname: LowNumberOfNewBlocks
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."

      ######################################################################
      # Block finalization
      ######################################################################

      - eval_time: 6m
        alertname: BlockFinalizationSlow
        exp_alerts: []
      - eval_time: 7m
        alertname: BlockFinalizationSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."

      # By 14m both the warning and the critical variant are expected.
      - eval_time: 14m
        alertname: BlockFinalizationSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."

      ######################################################################
      # Transaction queue
      ######################################################################

      - eval_time: 10m
        alertname: TransactionQueueSize
        exp_alerts: []
      - eval_time: 11m
        alertname: TransactionQueueSize
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has more
                than 10 transactions in the queue for more than 10
                minutes"

      # By 31m both the warning and the critical variant are expected.
      - eval_time: 31m
        alertname: TransactionQueueSize
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has more
                than 10 transactions in the queue for more than 10
                minutes"
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has more
                than 10 transactions in the queue for more than 30
                minutes"

      ######################################################################
      # Networking
      ######################################################################

      - eval_time: 3m # Values: 3 2 2 2
        alertname: LowNumberOfPeers
        exp_alerts: []
      - eval_time: 4m # Values: 3 2 2 2 2
        alertname: LowNumberOfPeers
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 3 minutes"

      - eval_time: 16m # Values: 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1
        alertname: LowNumberOfPeers
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 3 minutes"
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 15 minutes"
46 changes: 31 additions & 15 deletions .maintain/monitoring/alerting-rules/alerting-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,25 @@ groups:
labels:
severity: warning
annotations:
message: 'The node {{ $labels.instance }} has a CPU usage higher than 100% for more than 5 minutes'
message: 'The node {{ $labels.instance }} has a CPU usage higher than 100%
for more than 5 minutes'

##############################################################################
# Block production
##############################################################################

# Warning: the "best" block height increased by less than 1 over the
# trailing minute, and this has held for 3 minutes.
- alert: LowNumberOfNewBlocks
  annotations:
    message: 'Less than one new block per minute on instance {{
      $labels.instance }}.'
  expr: increase(polkadot_block_height{status="best"}[1m]) < 1
  for: 3m
  labels:
    severity: warning
- alert: LowNumberOfNewBlocks
annotations:
message: 'Less than one new block per minute on instance {{ $labels.instance }}.'
message: 'Less than one new block per minute on instance {{
$labels.instance }}.'
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
for: 10m
labels:
Expand All @@ -43,43 +46,51 @@ groups:
labels:
severity: warning
annotations:
message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute.'
message: 'Finalized block on instance {{ $labels.instance }} increases by
less than 1 per minute.'
# Critical: the "finalized" block height increased by less than 1 over the
# trailing minute, and this has held for 10 minutes.
- alert: BlockFinalizationSlow
  expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
  for: 10m
  labels:
    severity: critical
  annotations:
    message: 'Finalized block on instance {{ $labels.instance }} increases by
      less than 1 per minute.'
- alert: BlockFinalizationLaggingBehind
  # Under the assumption of an average block production of 6 seconds,
  # "best" and "finalized" being more than 10 blocks apart would imply
  # more than a 1 minute delay between block production and finalization.
  #
  # Uses the `polkadot_block_height` metric, matching the
  # LowNumberOfNewBlocks and BlockFinalizationSlow rules; the previous
  # `polkadot_block_height_number` name is not used by any other rule.
  # `ignoring(status)` lets the "best" and "finalized" series of the same
  # node be matched against each other despite their differing status label.
  expr: '(polkadot_block_height{status="best"} - ignoring(status)
    polkadot_block_height{status="finalized"}) > 10'
  for: 8m
  labels:
    severity: critical
  annotations:
    message: "Block finalization on instance {{ $labels.instance }} is behind
      block production by {{ $value }} for more than 8m"

##############################################################################
# Transaction queue
##############################################################################

# Warning: scheduled minus finished transaction validations has been above
# 10 for 10 minutes.
- alert: TransactionQueueSize
  expr: 'polkadot_sub_txpool_validations_scheduled -
    polkadot_sub_txpool_validations_finished > 10'
  for: 10m
  labels:
    severity: warning
  annotations:
    message: 'The node {{ $labels.instance }} has more than 10 transactions in
      the queue for more than 10 minutes'
# Critical: scheduled minus finished transaction validations has been above
# 10 for 30 minutes.
- alert: TransactionQueueSize
  expr: 'polkadot_sub_txpool_validations_scheduled -
    polkadot_sub_txpool_validations_finished > 10'
  for: 30m
  labels:
    severity: critical
  annotations:
    message: 'The node {{ $labels.instance }} has more than 10 transactions in
      the queue for more than 30 minutes'

##############################################################################
# Networking
Expand All @@ -91,23 +102,28 @@ groups:
labels:
severity: warning
annotations:
message: 'The node {{ $labels.instance }} has less than 3 peers for more than 3 minutes'
message: 'The node {{ $labels.instance }} has less than 3 peers for more
than 3 minutes'
# Critical: the peer count has been below 3 for 15 minutes.
- alert: LowNumberOfPeers
  expr: polkadot_sub_libp2p_peers_count < 3
  for: 15m
  labels:
    severity: critical
  annotations:
    message: 'The node {{ $labels.instance }} has less than 3 peers for more
      than 15 minutes'

##############################################################################
# Others
##############################################################################

# Warning: the ratio of failed "value found" handlings to received
# "value_found" DHT events has been above 0.5 for 2 hours.
# `ignoring(name)` allows matching against the right-hand series, which
# carries a `name` label selector.
- alert: AuthorityDiscoveryHighDiscoveryFailure
  expr: 'polkadot_authority_discovery_handle_value_found_event_failure /
    ignoring(name)
    polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5'
  for: 2h
  labels:
    severity: warning
  annotations:
    message: "Authority discovery on node {{ $labels.instance }} fails to
      process more than 50 % of the values found on the DHT."

0 comments on commit 31c3e06

Please sign in to comment.