Skip to content
This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

.maintain/monitoring: Add alerting rule tests #6343

Merged
merged 4 commits into from
Jun 19, 2020
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
239 changes: 239 additions & 0 deletions .maintain/monitoring/alerting-rules/alerting-rule-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,239 @@
---
# Unit tests for the alerting rules, written in the `promtool test rules`
# file format.
#
# `rule_files` points at /dev/stdin, so the rules under test are expected to
# be piped in on standard input when this file is evaluated.
#
# `values` uses the expanding notation `a+bxn`: start at `a`, then append `n`
# further samples, each increased by `b` (`axn` is shorthand for `a+0xn`).
# One sample is produced per `interval`, starting at 0m.
rule_files:
  - /dev/stdin

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # Peer count: 3 at 0m, 2 from 1m to 5m, 1 from 6m to 15m.
      - series: 'polkadot_sub_libp2p_peers_count{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '3 2+0x4 1+0x9'  # 3 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1

      # Scheduled validations grow by one per minute ...
      - series: 'polkadot_sub_txpool_validations_scheduled{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '10+1x30'  # 10 11 12 13 .. 40

      # ... while finished validations stay at zero.
      - series: 'polkadot_sub_txpool_validations_finished{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '0x30'  # 0 0 0 0 .. 0

      # Best block height: grows until 3m, then stalls at 4.
      - series: 'polkadot_block_height{
          status="best",
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '1+1x3 4+0x13'  # 1 2 3 4 4 4 4 4 4 4 4 4 ...

      # Finalized block height: same shape as the best block height.
      - series: 'polkadot_block_height{
          status="finalized",
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        values: '1+1x3 4+0x13'  # 1 2 3 4 4 4 4 4 4 4 4 4 ...

      # CPU usage: ramps up to 100 at 5m and stays there until 11m.
      - series: 'polkadot_cpu_usage_percentage{
          job="polkadot",
          pod="polkadot-abcdef01234-abcdef",
          instance="polkadot-abcdef01234-abcdef",
          }'
        # 0 20 40 60 80 100 100 100 100 100 100 100
        values: '0+20x5 100+0x5'

    # Each alert is checked twice around its `for` threshold: one evaluation
    # where nothing may fire yet (`exp_alerts: []`) and one where it must.
    alert_rule_test:

      ######################################################################
      # Resource usage
      ######################################################################

      - eval_time: 9m
        alertname: HighCPUUsage
        exp_alerts: []
      - eval_time: 10m
        alertname: HighCPUUsage
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has a CPU
                usage higher than 100% for more than 5 minutes"

      ######################################################################
      # Block production
      ######################################################################

      - eval_time: 6m
        alertname: LowNumberOfNewBlocks
        exp_alerts: []
      - eval_time: 7m
        alertname: LowNumberOfNewBlocks
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."

      - eval_time: 14m
        alertname: LowNumberOfNewBlocks
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: best
            exp_annotations:
              message: "Less than one new block per minute on instance
                polkadot-abcdef01234-abcdef."

      ######################################################################
      # Block finalization
      ######################################################################

      - eval_time: 6m
        alertname: BlockFinalizationSlow
        exp_alerts: []
      - eval_time: 7m
        alertname: BlockFinalizationSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."

      - eval_time: 14m
        alertname: BlockFinalizationSlow
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
              status: finalized
            exp_annotations:
              message: "Finalized block on instance
                polkadot-abcdef01234-abcdef increases by less than 1 per
                minute."

      ######################################################################
      # Transaction queue
      ######################################################################

      - eval_time: 10m
        alertname: TransactionQueueSize
        exp_alerts: []
      - eval_time: 11m
        alertname: TransactionQueueSize
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has more
                than 10 transactions in the queue for more than 10
                minutes"

      - eval_time: 31m
        alertname: TransactionQueueSize
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has more
                than 10 transactions in the queue for more than 10
                minutes"
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has more
                than 10 transactions in the queue for more than 30
                minutes"

      ######################################################################
      # Networking
      ######################################################################

      - eval_time: 3m  # Peers at 0m..3m: 3 2 2 2 (below 3 since 1m)
        alertname: LowNumberOfPeers
        exp_alerts: []
      - eval_time: 4m  # Peers at 0m..4m: 3 2 2 2 2
        alertname: LowNumberOfPeers
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 3 minutes"

      # Peers have been below 3 since 1m; the last input sample is at 15m.
      - eval_time: 16m
        alertname: LowNumberOfPeers
        exp_alerts:
          - exp_labels:
              severity: warning
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 3 minutes"
          - exp_labels:
              severity: critical
              pod: polkadot-abcdef01234-abcdef
              instance: polkadot-abcdef01234-abcdef
              job: polkadot
            exp_annotations:
              message: "The node polkadot-abcdef01234-abcdef has less
                than 3 peers for more than 15 minutes"
46 changes: 31 additions & 15 deletions .maintain/monitoring/alerting-rules/alerting-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,25 @@ groups:
labels:
severity: warning
annotations:
message: 'The node {{ $labels.instance }} has a CPU usage higher than 100% for more than 5 minutes'
message: 'The node {{ $labels.instance }} has a CPU usage higher than 100%
for more than 5 minutes'

##############################################################################
# Block production
##############################################################################

- alert: LowNumberOfNewBlocks
annotations:
message: 'Less than one new block per minute on instance {{ $labels.instance }}.'
message: 'Less than one new block per minute on instance {{
$labels.instance }}.'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In my opinion, this kind of line break makes the file less readable.

expr: increase(polkadot_block_height{status="best"}[1m]) < 1
for: 3m
labels:
severity: warning
- alert: LowNumberOfNewBlocks
annotations:
message: 'Less than one new block per minute on instance {{ $labels.instance }}.'
message: 'Less than one new block per minute on instance {{
$labels.instance }}.'
expr: increase(polkadot_block_height{status="best"}[1m]) < 1
for: 10m
labels:
Expand All @@ -43,43 +46,51 @@ groups:
labels:
severity: warning
annotations:
message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute.'
message: 'Finalized block on instance {{ $labels.instance }} increases by
less than 1 per minute.'
- alert: BlockFinalizationSlow
expr: increase(polkadot_block_height{status="finalized"}[1m]) < 1
for: 10m
labels:
severity: critical
annotations:
message: 'Finalized block on instance {{ $labels.instance }} increases by less than 1 per minute.'
message: 'Finalized block on instance {{ $labels.instance }} increases by
less than 1 per minute.'
- alert: BlockFinalizationLaggingBehind
# Under the assumption of an average block production of 6 seconds,
# "best" and "finalized" being more than 10 blocks apart would imply
# more than a 1 minute delay between block production and finalization.
expr: (polkadot_block_height_number{status="best"} - ignoring(status) polkadot_block_height_number{status="finalized"}) > 10
expr: '(polkadot_block_height_number{status="best"} - ignoring(status)
polkadot_block_height_number{status="finalized"}) > 10'
for: 8m
labels:
severity: critical
annotations:
message: "Block finalization on instance {{ $labels.instance }} is behind block production by {{ $value }} for more than 8m"
message: "Block finalization on instance {{ $labels.instance }} is behind
block production by {{ $value }} for more than 8m"

##############################################################################
# Transaction queue
##############################################################################

- alert: TransactionQueueSize
expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10
expr: 'polkadot_sub_txpool_validations_scheduled -
polkadot_sub_txpool_validations_finished > 10'
for: 10m
labels:
severity: warning
annotations:
message: 'The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 10 minutes'
message: 'The node {{ $labels.instance }} has more than 10 transactions in
the queue for more than 10 minutes'
- alert: TransactionQueueSize
expr: polkadot_sub_txpool_validations_scheduled - polkadot_sub_txpool_validations_finished > 10
expr: 'polkadot_sub_txpool_validations_scheduled -
polkadot_sub_txpool_validations_finished > 10'
for: 30m
labels:
severity: critical
annotations:
message: 'The node {{ $labels.instance }} has more than 10 transactions in the queue for more than 30 minutes'
message: 'The node {{ $labels.instance }} has more than 10 transactions in
the queue for more than 30 minutes'

##############################################################################
# Networking
Expand All @@ -91,23 +102,28 @@ groups:
labels:
severity: warning
annotations:
message: 'The node {{ $labels.instance }} has less than 3 peers for more than 3 minutes'
message: 'The node {{ $labels.instance }} has less than 3 peers for more
than 3 minutes'
- alert: LowNumberOfPeers
expr: polkadot_sub_libp2p_peers_count < 3
for: 15m
labels:
severity: critical
annotations:
message: 'The node {{ $labels.instance }} has less than 3 peers for more than 15 minutes'
message: 'The node {{ $labels.instance }} has less than 3 peers for more
than 15 minutes'

##############################################################################
# Others
##############################################################################

- alert: AuthorityDiscoveryHighDiscoveryFailure
expr: polkadot_authority_discovery_handle_value_found_event_failure / ignoring(name) polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5
expr: 'polkadot_authority_discovery_handle_value_found_event_failure /
ignoring(name)
polkadot_authority_discovery_dht_event_received{name="value_found"} > 0.5'
for: 2h
labels:
severity: warning
annotations:
message: "Authority discovery on node {{ $labels.instance }} fails to process more than 50 % of the values found on the DHT."
message: "Authority discovery on node {{ $labels.instance }} fails to
process more than 50 % of the values found on the DHT."