Skip to content

Commit

Permalink
[obs] alert when cluster is under high load (gitpod-io#17755)
Browse files Browse the repository at this point in the history
* [obs] formatting

* [obs] alert and inspect cluster due to high sustained load
  • Loading branch information
kylos101 authored May 26, 2023
1 parent 0e11c83 commit 2208a87
Showing 1 changed file with 58 additions and 47 deletions.
105 changes: 58 additions & 47 deletions operations/observability/mixins/workspace/rules/central/nodes.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,53 +5,64 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: k8s
role: alert-rules
name: workspace-nodes-monitoring-rules
labels:
prometheus: k8s
role: alert-rules
name: workspace-nodes-monitoring-rules
spec:
groups:
- name: workspace-nodes-rules
rules:
# Recording rule: 1-minute load of each node divided by that node's CPU count.
# The inner `count without (mode)` collapses node_cpu_seconds_total to one
# series per CPU; the outer `count without (cpu)` then counts those CPUs.
# Both sides are joined with kube_node_labels to carry the `nodepool` label.
- record: nodepool:node_load1:normalized
  expr: |
    node_load1 * on(node) group_left(nodepool) kube_node_labels
    /
    count without (cpu) (
      count without (mode) (
        node_cpu_seconds_total * on(node) group_left(nodepool) kube_node_labels
      )
    )
- name: workspace-nodes-alerts
rules:
# Warns when a workspace node's normalized load (load1 / CPU cores) has been
# above 10 for 60 minutes on a non-ephemeral cluster.
- alert: GitpodWorkspaceNodeHighNormalizedLoadAverage
  labels:
    severity: warning
    team: workspace
  for: 60m
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceNodeHighNormalizedLoadAverage.md
    summary: Workspace node's normalized load average is higher than 10 for more than 60 minutes. Check for abuse.
    # The value is a ratio (load per core), not a percentage, so no "%" suffix.
    description: Node {{ $labels.node }} in {{ $labels.cluster }} is reporting a normalized load average of {{ printf "%.2f" $value }}. Normalized load average is current load average divided by number of CPU cores of the node.
  expr: nodepool:node_load1:normalized{nodepool=~".*workspace.*", cluster!~"ephemeral.*"} > 10
groups:
- name: workspace-nodes-rules
rules:
# Recording rule: 1-minute load of each node divided by that node's CPU count.
# The inner `count without (mode)` collapses node_cpu_seconds_total to one
# series per CPU; the outer `count without (cpu)` then counts those CPUs.
# Both sides are joined with kube_node_labels to carry the `nodepool` label.
- record: nodepool:node_load1:normalized
  expr: |
    node_load1 * on(node) group_left(nodepool) kube_node_labels
    /
    count without (cpu) (
      count without (mode) (
        node_cpu_seconds_total * on(node) group_left(nodepool) kube_node_labels
      )
    )
- name: workspace-nodes-alerts
rules:
# Warns when a workspace node's normalized load (load1 / CPU cores) has been
# above 10 for 60 minutes on a non-ephemeral cluster.
- alert: GitpodWorkspaceNodeHighNormalizedLoadAverage
  labels:
    severity: warning
    team: workspace
  for: 60m
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/GitpodWorkspaceNodeHighNormalizedLoadAverage.md
    summary: Workspace node's normalized load average is higher than 10 for more than 60 minutes. Check for abuse.
    # The value is a ratio (load per core), not a percentage, so no "%" suffix.
    description: Node {{ $labels.node }} in {{ $labels.cluster }} is reporting a normalized load average of {{ printf "%.2f" $value }}. Normalized load average is current load average divided by number of CPU cores of the node.
  expr: nodepool:node_load1:normalized{nodepool=~".*workspace.*", cluster!~"ephemeral.*"} > 10

# Fires when, per cluster, the sum of kube_node_labels over `workspace-*`
# node pools exceeds its value from 10 minutes earlier by more than 15.
# NOTE(review): this is the only alert in this group without a `team` label;
# confirm the routing difference is intentional.
- alert: AutoscalerAddsNodesTooFast
  expr: ((sum(kube_node_labels{nodepool=~"workspace-.*", cluster!~"ephemeral.*"}) by (cluster)) - (sum(kube_node_labels{nodepool=~"workspace-.*", cluster!~"ephemeral.*"} offset 10m) by (cluster))) > 15
  labels:
    severity: critical
  annotations:
    summary: Autoscaler is adding new nodes rapidly
    description: Autoscaler in cluster {{ $labels.cluster }} is rapidly adding new nodes.
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AutoscalerAddsNodesTooFast.md
# Fires when, per cluster, the sum of kube_node_labels over `workspace-*`
# node pools exceeds its value from 10 minutes earlier by more than 15.
# NOTE(review): this is the only alert in this group without a `team` label;
# confirm the routing difference is intentional.
- alert: AutoscalerAddsNodesTooFast
  expr: ((sum(kube_node_labels{nodepool=~"workspace-.*", cluster!~"ephemeral.*"}) by (cluster)) - (sum(kube_node_labels{nodepool=~"workspace-.*", cluster!~"ephemeral.*"} offset 10m) by (cluster))) > 15
  labels:
    severity: critical
  annotations:
    summary: Autoscaler is adding new nodes rapidly
    description: Autoscaler in cluster {{ $labels.cluster }} is rapidly adding new nodes.
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AutoscalerAddsNodesTooFast.md

# Fires as soon as the failed scale-up counter shows a non-zero increase over
# the last minute on a non-ephemeral cluster (no `for:` hold-off).
- alert: AutoscaleFailure
  expr: |
    increase(cluster_autoscaler_failed_scale_ups_total{cluster!~"ephemeral.*"}[1m]) != 0
  labels:
    team: workspace
    severity: warning
  annotations:
    summary: Automatic scale-up failed for some reason.
    description: Automatic scale-up in cluster {{ $labels.cluster }} failed due to {{ $labels.reason }}.
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AutoscaleFailure.md
# Fires as soon as the failed scale-up counter shows a non-zero increase over
# the last minute on a non-ephemeral cluster (no `for:` hold-off).
- alert: AutoscaleFailure
  expr: |
    increase(cluster_autoscaler_failed_scale_ups_total{cluster!~"ephemeral.*"}[1m]) != 0
  labels:
    team: workspace
    severity: warning
  annotations:
    summary: Automatic scale-up failed for some reason.
    description: Automatic scale-up in cluster {{ $labels.cluster }} failed due to {{ $labels.reason }}.
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/AutoscaleFailure.md
# Pages when the summed normalized load of a workspace node pool stays above
# 40 for 60 minutes on a non-ephemeral cluster.
- alert: NodePoolLoad
  labels:
    severity: critical
    team: workspace
  for: 60m
  annotations:
    runbook_url: https://github.com/gitpod-io/runbooks/blob/main/runbooks/NodePoolLoad.md
    summary: Node pool load is high
    description: Node pool {{ $labels.nodepool }} in cluster {{ $labels.cluster }} has high, sustained load
  # Aggregate by cluster as well as nodepool: the description template reads
  # $labels.cluster, and a plain `by (nodepool)` would drop that label and
  # merge same-named pools from different clusters into one sum.
  expr: |
    sum by (cluster, nodepool) (nodepool:node_load1:normalized{nodepool=~".*workspace.*", cluster!~"ephemeral.*"}) > 40

0 comments on commit 2208a87

Please sign in to comment.