From d6305f5920c47944202756279e56ddb0fadbf7fc Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Wed, 12 Aug 2020 18:20:22 +0300 Subject: [PATCH] mixin: Fix alert about unhealthy sidecar (#2929) The alert was giving the wrong information, as $value contained the number of pods that were failing to send a heartbeat instead of the actual number of seconds that each sidecar had been unhealthy. Also, the 5-minute interval is probably too low, as on large deployments Prometheus could take much longer to come online and for the sidecar to become actually useful. As such, we can simply subtract the timestamp of the last heartbeat from the current time and fire if we are lagging by 10 minutes or more. Signed-off-by: Markos Chandras --- CHANGELOG.md | 1 + examples/alerts/alerts.md | 2 +- examples/alerts/alerts.yaml | 2 +- examples/alerts/tests.yaml | 86 ++++++++++++++++++++++------------ mixin/alerts/sidecar.libsonnet | 2 +- 5 files changed, 59 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b68640aac1..6b1b261ef1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#2970](https://github.com/thanos-io/thanos/pull/2970) Store: Upgrade minio-go/v7 to fix slowness when running on EKS. - [#2957](https://github.com/thanos-io/thanos/pull/2957) Rule: now sets all of the relevant fields properly; avoids a panic when `/api/v1/rules` is called and the time zone is _not_ UTC; `rules` field is an empty array now if no rules have been defined in a rule group. - [#2976](https://github.com/thanos-io/thanos/pull/2976) Query: Better rounding for incoming query timestamps. +- [#2929](https://github.com/thanos-io/thanos/pull/2929) Mixin: Fix expression for 'unhealthy sidecar' alert and also increase the timeout for 10 minutes.
### Added diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index cb40e2a9f7..89e567a07d 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -275,7 +275,7 @@ rules: message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds. expr: | - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600 labels: severity: critical ``` diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 72c3279e49..ad5f75301b 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -258,7 +258,7 @@ groups: message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds. expr: | - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600 labels: severity: critical - name: thanos-store.rules diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml index 25df0414e4..adac87b9a4 100644 --- a/examples/alerts/tests.yaml +++ b/examples/alerts/tests.yaml @@ -22,47 +22,47 @@ tests: exp_samples: - labels: '{}' value: 120 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) eval_time: 2m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' value: 43 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' value: 42 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 5m + - expr: 
max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) + eval_time: 10m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' value: 0 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' value: 0 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 6m + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) + eval_time: 11m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' value: 0 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' value: 0 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 5m + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) + eval_time: 10m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 300 - - labels: '{pod="thanos-sidecar-pod-1"}' - value: 300 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 6m + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' + value: 600 + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' + value: 600 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) + eval_time: 11m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 360 - - labels: '{pod="thanos-sidecar-pod-1"}' - value: 360 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) >= 300 + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' + value: 660 + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' + 
value: 660 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) >= 600 eval_time: 12m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' value: 720 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' value: 720 alert_rule_test: - eval_time: 1m @@ -71,24 +71,48 @@ tests: alertname: ThanosSidecarUnhealthy - eval_time: 3m alertname: ThanosSidecarUnhealthy - - eval_time: 5m + - eval_time: 10m alertname: ThanosSidecarUnhealthy exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-0 exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' - - eval_time: 6m + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 600 seconds.' + - exp_labels: + severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-1 + exp_annotations: + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 600 seconds.' + - eval_time: 11m alertname: ThanosSidecarUnhealthy exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-0 exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 660 seconds.' + - exp_labels: + severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-1 + exp_annotations: + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 660 seconds.' - eval_time: 12m alertname: ThanosSidecarUnhealthy exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-0 + exp_annotations: + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 720 seconds.' 
+ - exp_labels: + severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-1 exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 720 seconds.' diff --git a/mixin/alerts/sidecar.libsonnet b/mixin/alerts/sidecar.libsonnet index c81e2ba0a9..e1790dbac6 100644 --- a/mixin/alerts/sidecar.libsonnet +++ b/mixin/alerts/sidecar.libsonnet @@ -27,7 +27,7 @@ message: 'Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.', }, expr: ||| - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) by (job, pod) >= 600 ||| % thanos.sidecar, labels: { severity: 'critical',