From 58b6ee346fccbe7d263ab549535bfddb8b222701 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gy=C3=B6rgy=20Krajcsovits?= Date: Thu, 14 Mar 2024 10:04:12 +0100 Subject: [PATCH] dashboards: overview: use native histograms in status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow switching between basing the status on the classic or native version of cortex_request_duration_seconds. Related to #7154 Signed-off-by: György Krajcsovits --- CHANGELOG.md | 2 + .../dashboards/mimir-overview.json | 57 ++++++++++- .../dashboards/mimir-reads.json | 2 +- .../dashboards/mimir-writes.json | 2 +- .../dashboards/mimir-overview.json | 57 ++++++++++- .../dashboards/mimir-reads.json | 2 +- .../dashboards/mimir-writes.json | 2 +- .../dashboards/dashboard-queries.libsonnet | 99 +++++++++++++++---- .../mimir-mixin/dashboards/overview.libsonnet | 19 +++- operations/mimir-mixin/jsonnetfile.lock.json | 8 +- 10 files changed, 211 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c8c49022844..d40f6945686 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,8 @@ ### Mixin * [ENHANCEMENT] Alerts: allow configuring alerts range interval via `_config.base_alerts_range_interval_minutes`. #7591 +* [ENHANCEMENT] Dashboards: allow switching between using classic or native histograms in dashboards. #7627 + Overview dashboard, Status panel, `cortex_request_duration_seconds` metric. ### Jsonnet diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json index df6aa21b232..29ad0fd641f 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-overview.json @@ -81,7 +81,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", "instant": false, "legendFormat": "Writes", "range": true @@ -91,7 +91,27 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", 
route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", + "instant": false, + "legendFormat": "Writes", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", + "instant": false, + "legendFormat": "Reads", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", "instant": false, "legendFormat": "Reads", "range": true @@ -348,7 +368,7 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -670,7 +690,7 @@ "span": 3, "targets": [ { - 
"expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -1450,6 +1470,35 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json index 0f5b707d489..583aec28a6a 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json @@ -624,7 +624,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index 7995372bfc4..309c7009072 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -623,7 +623,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n 
label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json index df6aa21b232..29ad0fd641f 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-overview.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-overview.json @@ -81,7 +81,7 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", "instant": false, "legendFormat": "Writes", "range": true @@ -91,7 +91,27 @@ "uid": "$datasource" }, "exemplar": false, - "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n", + "expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]))\n < 
($latency_metrics * +Inf)", + "instant": false, + "legendFormat": "Writes", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)", + "instant": false, + "legendFormat": "Reads", + "range": true + }, + { + "datasource": { + "uid": "$datasource" + }, + "exemplar": false, + "expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)", "instant": false, "legendFormat": "Reads", "range": true @@ -348,7 +368,7 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -670,7 +690,7 @@ "span": 3, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" @@ -1450,6 +1470,35 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "current": { + "selected": true, + "text": "classic", + "value": "1" + }, + "description": "Choose between showing latencies based on low 
precision classic or high precision native histogram metrics.", + "hide": 0, + "includeAll": false, + "label": "Latency metrics", + "multi": false, + "name": "latency_metrics", + "options": [ + { + "selected": false, + "text": "native", + "value": "-1" + }, + { + "selected": true, + "text": "classic", + "value": "1" + } + ], + "query": "native : -1,classic : 1", + "skipUrlSync": false, + "type": "custom", + "useTags": false } ] }, diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json index 1d05752d68f..8858735a47d 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json @@ -624,7 +624,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index c91b1b084bd..d5cb7699c18 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -623,7 +623,7 @@ "span": 4, "targets": [ { - "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", + "expr": "sum by (status) (\n label_replace(label_replace(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((distributor.*|cortex|mimir|mimir-write.*))\", route=~\"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push|otlp_v1_metrics\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n", "format": "time_series", "legendFormat": "{{status}}", "refId": "A" diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index ca75f19c490..30ffe12acc3 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -1,3 +1,5 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + { // This object contains common queries used in the Mimir dashboards. 
// These queries are NOT intended to be configurable or overriddeable via jsonnet, @@ -25,55 +27,102 @@ query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', gateway: { - writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}' % variables, - readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + local p = self, + writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', + writeRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"' % variables, + // deprecated, will be removed + writeRequestsPerSecond: '%s{%s}' % [p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector], + readRequestsPerSecondMetric: 'cortex_request_duration_seconds', + readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, + // deprecated, will be removed + readRequestsPerSecond: '%s{%s}' % [p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector], // Write failures rate as percentage of total requests. - writeFailuresRate: ||| + writeFailuresRate:: { + local template = ||| ( - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, + }, + native: template % { + countFailQuery: utils.nativeHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countQuery: utils.nativeHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, + }, + }, // Read failures rate as percentage of total requests. - readFailuresRate: ||| + readFailuresRate:: { + local template = ||| ( - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) + # gRPC errors are not tracked as 5xx but "error". + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. 
vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).classic, + }, + native: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native, + }, + }, }, distributor: { - writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables, + local p = self, + writeRequestsPerSecondMetric: 'cortex_request_duration_seconds', + writeRequestsPerSecondSelector: '%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"' % variables, + // deprecated, will be removed + writeRequestsPerSecond: '%s{%s}' % [p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector], samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables, exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables, // Write failures rate as percentage of total requests. - writeFailuresRate: ||| + writeFailuresRate:: { + local template = ||| ( # gRPC errors are not tracked as 5xx but "error". - sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s",status_code=~"5.*|error"}[$__rate_interval])) + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. 
vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic, + }, + native: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector).native, + }, + }, }, query_frontend: { - readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables, + local p = self, + readRequestsPerSecondMetric: 'cortex_request_duration_seconds', + readRequestsPerSecondSelector: '%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, + // deprecated, will be removed + readRequestsPerSecond: '%s{%s}' % [p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector], instantQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query"}[$__rate_interval]))' % variables, rangeQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query_range"}[$__rate_interval]))' % variables, labelNamesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_labels"}[$__rate_interval]))' % variables, @@ -88,16 +137,26 @@ otherQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~".*(query|query_range|label.*|series|read|metadata|query_exemplars|cardinality_.*)"}[$__rate_interval]))' % variables, // Read failures rate as percentage of total requests. - readFailuresRate: ||| + readFailuresRate:: { + local template = ||| ( - sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval])) + sum(%(countFailQuery)s) or # Handle the case no failure has been tracked yet. 
vector(0) ) / - sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval])) - ||| % variables, + sum(%(countQuery)s) + |||, + classic: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').classic, + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).classic, + }, + native: template % { + countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"').native, + countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native, + }, + }, }, ruler: { diff --git a/operations/mimir-mixin/dashboards/overview.libsonnet b/operations/mimir-mixin/dashboards/overview.libsonnet index 5b754d73b61..bc3a7def14a 100644 --- a/operations/mimir-mixin/dashboards/overview.libsonnet +++ b/operations/mimir-mixin/dashboards/overview.libsonnet @@ -33,6 +33,7 @@ local filename = 'mimir-overview.json'; assert std.md5(filename) == 'ffcd83628d7d4b5a03d1cafd159e6c9c' : 'UID of the dashboard has changed, please update references to dashboard.'; ($.dashboard('Overview') + { uid: std.md5(filename) }) .addClusterSelectorTemplates() + .addShowNativeLatencyVariable() .addRow( $.row('%(product)s cluster health' % $._config) @@ -53,9 +54,21 @@ local filename = 'mimir-overview.json'; 'Status', [ // Write failures. - if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate, + utils.showNativeHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate + ), + // Write failures but from classic histograms. + utils.showClassicHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate + ), // Read failures. - if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate, + utils.showNativeHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate, + ), + // Read failures but from classic histograms. + utils.showClassicHistogramQuery( + if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate, + ), // Rule evaluation failures. $.queries.ruler.evaluations.failuresRate, // Alerting notifications. @@ -84,7 +97,7 @@ local filename = 'mimir-overview.json'; // Object storage failures. 
$.queries.storage.failuresRate, ], - ['Writes', 'Reads', 'Rule evaluations', 'Alerting notifications', 'Object storage'] + ['Writes', 'Writes', 'Reads', 'Reads', 'Rule evaluations', 'Alerting notifications', 'Object storage'] ) ) .addPanel( diff --git a/operations/mimir-mixin/jsonnetfile.lock.json b/operations/mimir-mixin/jsonnetfile.lock.json index 2dde18000fd..dcbb4bcca96 100644 --- a/operations/mimir-mixin/jsonnetfile.lock.json +++ b/operations/mimir-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "0098700428a0a4ee7d884d332d137caff5c52497", - "sum": "B49EzIY2WZsFxNMJcgRxE/gcZ9ltnS8pkOOV6Q5qioc=" + "version": "87011e1ef43aa4cd6ae2777b41eb54877c7790e9", + "sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "0098700428a0a4ee7d884d332d137caff5c52497", - "sum": "EWPd0a5uU5x1vTuyyMbH+d41wrgem7v21c2p4jekkbA=" + "version": "87011e1ef43aa4cd6ae2777b41eb54877c7790e9", + "sum": "0jg7qc3N8FtMnnQbunYCGSNcjHr9Y1krZW9OSTmWcEQ=" } ], "legacyImports": false
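The mechanism behind the new "Latency metrics" switch, summarized for review: the `latency_metrics` custom variable resolves to `1` (classic) or `-1` (native), and every Status target now exists in two variants, the native one guarded by `< ($latency_metrics * -Inf)` and the classic one by `< ($latency_metrics * +Inf)`. With `latency_metrics = 1` the classic guard is `< +Inf` (always true) while the native guard is `< -Inf` (always false), so only the classic series are drawn; selecting native flips both. Below is a minimal jsonnet sketch of that guard. The helper names are local illustrations only; the mixin itself relies on `utils.showClassicHistogramQuery` and `utils.showNativeHistogramQuery` from mixin-utils.

// latency-toggle-sketch.libsonnet (illustration only, not part of the patch).
// "query" is expected to look like the new writeFailuresRate/readFailuresRate
// objects in dashboard-queries.libsonnet: { classic: '<PromQL>', native: '<PromQL>' }.
local showClassicHistogramQuery(query) = query.classic + '\n < ($latency_metrics * +Inf)';  // returns data when latency_metrics = 1
local showNativeHistogramQuery(query) = query.native + '\n < ($latency_metrics * -Inf)';    // returns data when latency_metrics = -1

// Hypothetical example pair; the real pairs are built in dashboard-queries.libsonnet.
local exampleQuery = {
  classic: 'sum(rate(cortex_request_duration_seconds_count{route=~"api_(v1|prom)_push"}[$__rate_interval]))',
  native: 'sum(histogram_count(rate(cortex_request_duration_seconds{route=~"api_(v1|prom)_push"}[$__rate_interval])))',
};

{
  // A panel lists both variants as separate targets with the same legend;
  // the $latency_metrics guard makes exactly one of them return series.
  targets: [
    showNativeHistogramQuery(exampleQuery),
    showClassicHistogramQuery(exampleQuery),
  ],
}

Because both variants share one legend label, the panel still shows a single "Writes" and a single "Reads" entry no matter which flavour is active, which is why the legend list in overview.libsonnet now repeats 'Writes' and 'Reads'.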
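The refactor in dashboard-queries.libsonnet splits each request-rate query into a `*Metric` name and a `*Selector` string (keeping the old concatenated `*RequestsPerSecond` fields only as deprecated aliases) so that the failure-rate template can be rendered once per histogram flavour. The sketch below approximates what `utils.nativeClassicHistogramCountRate(metric, selector)` is used for here, judged from the compiled dashboards above: the classic variant rates the separate `_count` series, the native variant wraps the histogram in `histogram_count()`. The `countRate` helper and the shortened selectors are illustrative assumptions, not the library's actual implementation.

// count-rate-sketch.libsonnet (illustration only, not part of the patch).
local countRate(metric, selector) = {
  // Classic histograms expose the request count as a separate <metric>_count series.
  classic: 'rate(%s_count{%s}[$__rate_interval])' % [metric, selector],
  // Native histograms carry the count inside the sample; histogram_count() extracts it.
  native: 'histogram_count(rate(%s{%s}[$__rate_interval]))' % [metric, selector],
};

// Selectors shortened for readability; the real ones also match cluster, namespace and job.
local requests = countRate('cortex_request_duration_seconds', 'route=~"api_(v1|prom)_push"');
local failures = countRate('cortex_request_duration_seconds', 'route=~"api_(v1|prom)_push", status_code=~"5.*|error"');

{
  // Both flavours of the failure-rate expression, matching the shape of the new
  // writeFailuresRate/readFailuresRate objects (gRPC errors are tracked as "error", not 5xx).
  classic: '(sum(%s) or vector(0)) / sum(%s)' % [failures.classic, requests.classic],
  native: '(sum(%s) or vector(0)) / sum(%s)' % [failures.native, requests.native],
}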