diff --git a/CHANGELOG.md b/CHANGELOG.md index 217cb35db62..e6e2ea9f01f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -113,6 +113,7 @@ * [ENHANCEMENT] Dashboards: remove "All" option for namespace dropdown in dashboards. #8829 * [ENHANCEMENT] Dashboards: add Kafka end-to-end latency outliers panel in the "Mimir / Writes" dashboard. #8948 * [ENHANCEMENT] Dashboards: add "Out-of-order samples appended" panel to "Mimir / Tenants" dashboard. #8939 +* [ENHANCEMENT] Alerts: add native histogram versions of the `MimirRequestErrors` and `MimirRulerRemoteEvaluationFailing` alerts. The classic and native rules are told apart by the new `histogram` label (`classic` or `native`). #9004 * [BUGFIX] Dashboards: fix "current replicas" in autoscaling panels when HPA is not active. #8566 * [BUGFIX] Alerts: do not fire `MimirRingMembersMismatch` during the migration to experimental ingest storage. #8727 diff --git a/operations/helm/charts/mimir-distributed/CHANGELOG.md b/operations/helm/charts/mimir-distributed/CHANGELOG.md index 4946e7dbca0..ecb0067aac6 100644 --- a/operations/helm/charts/mimir-distributed/CHANGELOG.md +++ b/operations/helm/charts/mimir-distributed/CHANGELOG.md @@ -45,6 +45,7 @@ Entries should include a reference to the Pull Request that introduced the chang * [ENHANCEMENT] Add support for setting namespace for dashboard config maps. #8813 * [ENHANCEMENT] Add support for string `extraObjects` for better support with templating. #8825 * [ENHANCEMENT] Helm : allow setting a read and write urls to continous-test. #7674 +* [ENHANCEMENT] Alerts: add native histogram versions of the `MimirRequestErrors` and `MimirRulerRemoteEvaluationFailing` alerts. The classic and native rules are told apart by the new `histogram` label (`classic` or `native`). #9004 * [BUGFIX] Add missing container security context to run `continuous-test` under the restricted security policy. #8653 * [BUGFIX] Add `global.extraVolumeMounts` to the exporter container on memcached statefulsets #8787 * [BUGFIX] Fix helm releases failing when `querier.kedaAutoscaling.predictiveScalingEnabled=true`. 
#8731 diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 71b1abd150b..e611a8a40e4 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -32,12 +32,31 @@ spec: # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body ( - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", status_code!~"529|598", route!~"ready|debug_pprof"}[1m])) / sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) ) * 100 > 1 for: 15m labels: + histogram: classic + severity: critical + - alert: MimirRequestErrors + annotations: + message: | + The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors + expr: | + # The following 5xx errors considered as non-error: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (cluster, namespace, job, route) (histogram_count(rate(cortex_request_duration_seconds{status_code=~"5..", status_code!~"529|598", route!~"ready|debug_pprof"}[1m]))) + / + sum by (cluster, namespace, job, route) (histogram_count(rate(cortex_request_duration_seconds{route!~"ready|debug_pprof"}[1m]))) + ) * 100 > 1 + for: 15m + labels: + histogram: native severity: critical - alert: MimirRequestLatency annotations: @@ -485,13 +504,29 @@ spec: Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing expr: | - 100 * ( - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) + ( + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) / - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) - ) > 1 + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) + ) * 100 > 1 + for: 5m + labels: + histogram: classic + severity: warning + - alert: MimirRulerRemoteEvaluationFailing + annotations: + message: | + Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing + expr: | + ( + sum by (cluster, namespace) (histogram_count(rate(cortex_request_duration_seconds{status_code=~"5..", route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m]))) + / + sum by (cluster, namespace) (histogram_count(rate(cortex_request_duration_seconds{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m]))) + ) * 100 > 1 for: 5m labels: + histogram: native severity: warning - name: gossip_alerts rules: diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index 14aa12775f3..09f3237f9ae 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -20,12 +20,31 @@ groups: # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body ( - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", status_code!~"529|598", route!~"ready|debug_pprof"}[1m])) / sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) ) * 100 > 1 for: 15m labels: + histogram: classic + severity: critical + - alert: MimirRequestErrors + annotations: + message: | + The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors + expr: | + # The following 5xx errors considered as non-error: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (cluster, namespace, job, route) (histogram_count(rate(cortex_request_duration_seconds{status_code=~"5..", status_code!~"529|598", route!~"ready|debug_pprof"}[1m]))) + / + sum by (cluster, namespace, job, route) (histogram_count(rate(cortex_request_duration_seconds{route!~"ready|debug_pprof"}[1m]))) + ) * 100 > 1 + for: 15m + labels: + histogram: native severity: critical - alert: MimirRequestLatency annotations: @@ -463,13 +482,29 @@ groups: Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing expr: | - 100 * ( - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) + ( + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) / - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) - ) > 1 + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) + ) * 100 > 1 + for: 5m + labels: + histogram: classic + severity: warning + - alert: MimirRulerRemoteEvaluationFailing + annotations: + message: | + Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing + expr: | + ( + sum by (cluster, namespace) (histogram_count(rate(cortex_request_duration_seconds{status_code=~"5..", route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m]))) + / + sum by (cluster, namespace) (histogram_count(rate(cortex_request_duration_seconds{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m]))) + ) * 100 > 1 for: 5m labels: + histogram: native severity: warning - name: gossip_alerts rules: diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index e0cda36e95e..adf97fac0e3 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -20,12 +20,31 @@ groups: # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body ( - sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"ready|debug_pprof"}[1m])) + sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..", status_code!~"529|598", route!~"ready|debug_pprof"}[1m])) / sum by (cluster, namespace, job, route) (rate(cortex_request_duration_seconds_count{route!~"ready|debug_pprof"}[1m])) ) * 100 > 1 for: 15m labels: + histogram: classic + severity: critical + - alert: MimirRequestErrors + annotations: + message: | + The route {{ $labels.route }} in {{ $labels.cluster }}/{{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors. 
+ runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrequesterrors + expr: | + # The following 5xx errors considered as non-error: + # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry) + # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body + ( + sum by (cluster, namespace, job, route) (histogram_count(rate(cortex_request_duration_seconds{status_code=~"5..", status_code!~"529|598", route!~"ready|debug_pprof"}[1m]))) + / + sum by (cluster, namespace, job, route) (histogram_count(rate(cortex_request_duration_seconds{route!~"ready|debug_pprof"}[1m]))) + ) * 100 > 1 + for: 15m + labels: + histogram: native severity: critical - alert: MimirRequestLatency annotations: @@ -473,13 +492,29 @@ groups: Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. 
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing expr: | - 100 * ( - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", job=~".*/(ruler-query-frontend.*)"}[5m])) + ( + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{status_code=~"5..", route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) / - sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) - ) > 1 + sum by (cluster, namespace) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])) + ) * 100 > 1 + for: 5m + labels: + histogram: classic + severity: warning + - alert: MimirRulerRemoteEvaluationFailing + annotations: + message: | + Mimir rulers in {{ $labels.cluster }}/{{ $labels.namespace }} are failing to perform {{ printf "%.2f" $value }}% of remote evaluations through the ruler-query-frontend. 
+            runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirrulerremoteevaluationfailing
+          expr: |
+            (
+              sum by (cluster, namespace) (histogram_count(rate(cortex_request_duration_seconds{status_code=~"5..", route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])))
+              /
+              sum by (cluster, namespace) (histogram_count(rate(cortex_request_duration_seconds{route="/httpgrpc.HTTP/Handle", job=~".*/(ruler-query-frontend.*)"}[5m])))
+            ) * 100 > 1
           for: 5m
           labels:
+            histogram: native
             severity: warning
     - name: gossip_alerts
       rules:
diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet
index c3778eaa4dd..1aca13ec3ff 100644
--- a/operations/mimir-mixin/alerts/alerts.libsonnet
+++ b/operations/mimir-mixin/alerts/alerts.libsonnet
@@ -12,6 +12,97 @@ local utils = import 'mixin-utils/utils.libsonnet';
   local groupStatefulSetByRolloutGroup(metricName) =
     'sum without(statefulset) (label_replace(%s, "rollout_group", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?"))' % metricName,
 
+  // Base name of the request duration histogram that both alert variants are built on.
+  local request_metric = 'cortex_request_duration_seconds',
+
+  // Renders the alert expression `(errors / total) * 100 > 1`, i.e. the alert condition is
+  // an error percentage above 1%. `comment` is emitted verbatim right before the opening
+  // parenthesis, so it must be empty or a newline-terminated block of PromQL `#` comments.
+  local errorPercentageExpr(error_query, total_query, comment='') = |||
+    %(comment)s(
+      %(errorQuery)s
+      /
+      %(totalQuery)s
+    ) * 100 > 1
+  ||| % { comment: comment, errorQuery: error_query, totalQuery: total_query },
+
+  // Builds the classic and native histogram variants of an error percentage expression.
+  //   selector:       label matchers applied to both the error and the total query.
+  //   error_selector: extra label matchers that select the failed requests.
+  //   rate_interval:  range interval handed to utils.ncHistogramCountRate.
+  //   sum_by:         array of labels to aggregate by.
+  //   comment:        optional PromQL comment block placed in front of the expression.
+  // Returns an object with a `classic` and a `native` field, one expression each.
+  local requestErrorsQuery(selector, error_selector, rate_interval, sum_by, comment='') =
+    local errorSelector = '%s, %s' % [error_selector, selector];
+    local errorQuery = utils.ncHistogramSumBy(utils.ncHistogramCountRate(request_metric, errorSelector, rate_interval), sum_by);
+    local totalQuery = utils.ncHistogramSumBy(utils.ncHistogramCountRate(request_metric, selector, rate_interval), sum_by);
+    {
+      classic: errorPercentageExpr(errorQuery.classic, totalQuery.classic, comment),
+      native: errorPercentageExpr(errorQuery.native, totalQuery.native, comment),
+    },
+
+  // Selects the expression matching `histogram` ('classic' or 'native'). Any other value
+  // aborts the build instead of silently adding an empty, invalid rule to the group.
+  local exprForHistogram(query, histogram) =
+    if histogram == 'classic' then query.classic
+    else if histogram == 'native' then query.native
+    else error 'unsupported histogram type "%s", expected "classic" or "native"' % [histogram],
+
+  // RequestErrors alert for the given histogram type; the `histogram` label tells the
+  // classic and the native rule apart.
+  local requestErrorsAlert(histogram) =
+    local query = requestErrorsQuery(
+      selector='route!~"%s"' % std.join('|', ['ready'] + $._config.alert_excluded_routes),
+      error_selector='status_code=~"5..", status_code!~"529|598"',
+      rate_interval=$.alertRangeInterval(1),
+      // Note if alert_aggregation_labels is "job", this will repeat the label. But
+      // prometheus seems to tolerate that.
+      sum_by=[$._config.alert_aggregation_labels, $._config.per_job_label, 'route'],
+      comment=|||
+        # The following 5xx errors considered as non-error:
+        # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry)
+        # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body
+      |||,
+    );
+    {
+      alert: $.alertName('RequestErrors'),
+      expr: exprForHistogram(query, histogram),
+      'for': '15m',
+      labels: {
+        severity: 'critical',
+        histogram: histogram,
+      },
+      annotations: {
+        message: |||
+          The route {{ $labels.route }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors.
+        ||| % $._config,
+      },
+    },
+
+  // RulerRemoteEvaluationFailing alert for the given histogram type.
+  local rulerRemoteEvaluationFailingAlert(histogram) =
+    local query = requestErrorsQuery(
+      selector='route="/httpgrpc.HTTP/Handle", %s' % $.jobMatcher($._config.job_names.ruler_query_frontend),
+      error_selector='status_code=~"5.."',
+      rate_interval=$.alertRangeInterval(5),
+      sum_by=[$._config.alert_aggregation_labels],
+    );
+    {
+      alert: $.alertName('RulerRemoteEvaluationFailing'),
+      expr: exprForHistogram(query, histogram),
+      'for': '5m',
+      labels: {
+        severity: 'warning',
+        histogram: histogram,
+      },
+      annotations: {
+        message: |||
+          %(product)s rulers in %(alert_aggregation_variables)s are failing to perform {{ printf "%%.2f" $value }}%% of remote evaluations through the ruler-query-frontend.
+        ||| % $._config,
+      },
+    },
+
   local alertGroups = [
     {
       name: 'mimir_alerts',
@@ -29,35 +120,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
             message: '%(product)s cluster %(alert_aggregation_variables)s has {{ printf "%%f" $value }} unhealthy ingester(s).' % $._config,
           },
         },
-        {
-          alert: $.alertName('RequestErrors'),
-          // Note if alert_aggregation_labels is "job", this will repeat the label. But
-          // prometheus seems to tolerate that.
-          expr: |||
-            # The following 5xx errors considered as non-error:
-            # - 529: used by distributor rate limiting (using 529 instead of 429 to let the client retry)
-            # - 598: used by GEM gateway when the client is very slow to send the request and the gateway times out reading the request body
-            (
-              sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{status_code=~"5..",status_code!~"529|598",route!~"%(excluded_routes)s"}[%(range_interval)s]))
-              /
-              sum by (%(group_by)s, %(job_label)s, route) (rate(cortex_request_duration_seconds_count{route!~"%(excluded_routes)s"}[%(range_interval)s]))
-            ) * 100 > 1
-          ||| % {
-            group_by: $._config.alert_aggregation_labels,
-            job_label: $._config.per_job_label,
-            excluded_routes: std.join('|', ['ready'] + $._config.alert_excluded_routes),
-            range_interval: $.alertRangeInterval(1),
-          },
-          'for': '15m',
-          labels: {
-            severity: 'critical',
-          },
-          annotations: {
-            message: |||
-              The route {{ $labels.route }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors.
-            ||| % $._config,
-          },
-        },
+        requestErrorsAlert('classic'),
+        requestErrorsAlert('native'),
         {
           alert: $.alertName('RequestLatency'),
           expr: |||
@@ -708,29 +772,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
             ||| % $._config,
           },
         },
-        {
-          alert: $.alertName('RulerRemoteEvaluationFailing'),
-          expr: |||
-            100 * (
-              sum by (%(alert_aggregation_labels)s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", status_code=~"5..", %(job_regex)s}[%(range_interval)s]))
-              /
-              sum by (%(alert_aggregation_labels)s) (rate(cortex_request_duration_seconds_count{route="/httpgrpc.HTTP/Handle", %(job_regex)s}[%(range_interval)s]))
-            ) > 1
-          ||| % {
-            alert_aggregation_labels: $._config.alert_aggregation_labels,
-            job_regex: $.jobMatcher($._config.job_names.ruler_query_frontend),
-            range_interval: $.alertRangeInterval(5),
-          },
-          'for': '5m',
-          labels: {
-            severity: 'warning',
-          },
-          annotations: {
-            message: |||
-              %(product)s rulers in %(alert_aggregation_variables)s are failing to perform {{ printf "%%.2f" $value }}%% of remote evaluations through the ruler-query-frontend.
-            ||| % $._config,
-          },
-        },
+        rulerRemoteEvaluationFailingAlert('classic'),
+        rulerRemoteEvaluationFailingAlert('native'),
       ],
     },
     {