From 43fd96e5f729dbac7b0edeb40404728a93d45a69 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Tue, 23 Jul 2024 12:14:05 +0200 Subject: [PATCH 1/2] Remote ruler reads dashboard: allow using cortex_request_duration_seconds native histogram Signed-off-by: Yuri Nikolic --- CHANGELOG.md | 5 ++-- .../charts/mimir-distributed/CHANGELOG.md | 5 ++-- .../dashboards/mimir-remote-ruler-reads.json | 8 +++++- .../dashboards/mimir-remote-ruler-reads.json | 8 +++++- .../dashboards/dashboard-queries.libsonnet | 25 +++++++++++++------ .../dashboards/remote-ruler-reads.libsonnet | 19 ++------------ 6 files changed, 40 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 55d8cc3857a..fb8a583ea29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -71,10 +71,11 @@ * Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric. #7674 #8502 #8791 * Writes dashboard: `cortex_request_duration_seconds` metric. #8757 #8791 * Reads dashboard: `cortex_request_duration_seconds` metric. #8752 - * Rollout progress dashboard. #8779 - * Alertmanager dashboard. #8792 + * Rollout progress dashboard: `cortex_request_duration_seconds` metric. #8779 + * Alertmanager dashboard: `cortex_request_duration_seconds` metric. #8792 * Ruler dashboard: `cortex_request_duration_seconds` metric. #8795 * Queries dashboard: `cortex_request_duration_seconds` metric. #8800 + * Remote ruler reads dashboard: `cortex_request_duration_seconds` metric. #8801 * [ENHANCEMENT] Alerts: `MimirRunningIngesterReceiveDelayTooHigh` alert has been tuned to be more reactive to high receive delay. #8538 * [ENHANCEMENT] Dashboards: improve end-to-end latency and strong read consistency panels when experimental ingest storage is enabled. #8543 * [ENHANCEMENT] Dashboards: Add panels for monitoring ingester autoscaling when not using ingest-storage. 
These panels are disabled by default, but can be enabled using the `autoscaling.ingester.enabled: true` config option. #8484 diff --git a/operations/helm/charts/mimir-distributed/CHANGELOG.md b/operations/helm/charts/mimir-distributed/CHANGELOG.md index da82ea31308..2a7a93a09bf 100644 --- a/operations/helm/charts/mimir-distributed/CHANGELOG.md +++ b/operations/helm/charts/mimir-distributed/CHANGELOG.md @@ -33,10 +33,11 @@ Entries should include a reference to the Pull Request that introduced the chang * Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric. #7674 * Writes dashboard: `cortex_request_duration_seconds` metric. #8757 * Reads dashboard: `cortex_request_duration_seconds` metric. #8752 - * Rollout progress dashboard. #8779 - * Alertmanager dashboard. #8792 + * Rollout progress dashboard: `cortex_request_duration_seconds` metric. #8779 + * Alertmanager dashboard: `cortex_request_duration_seconds` metric. #8792 * Ruler dashboard: `cortex_request_duration_seconds` metric. #8795 * Queries dashboard: `cortex_request_duration_seconds` metric. #8800 + * Remote ruler reads dashboard: `cortex_request_duration_seconds` metric. #8801 * [ENHANCEMENT] Memcached: Update to Memcached 1.6.28 and memcached-exporter 0.14.4. #8557 * [ENHANCEMENT] Add missing fields in multiple topology spread constraints. #8533 * [ENHANCEMENT] Add support for setting the image pull secrets, node selectors, tolerations and topology spread constraints for the Grafana Agent pods used for metamonitoring. 
#8670 diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json index 4b3da1738f8..5a6c13faab8 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json @@ -91,7 +91,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(\n rate(\n cortex_request_duration_seconds_count{\n cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\",\n route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"\n }[$__rate_interval]\n )\n)\n", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\",route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\",route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json index c341dd1b9d4..b4defc0f56a 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json @@ -91,7 +91,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(\n rate(\n cortex_request_duration_seconds_count{\n cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\",\n route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"\n }[$__rate_interval]\n )\n)\n", + "expr": "sum 
(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\",route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\",route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A" diff --git a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet index 00eb29947a4..13a2a313d98 100644 --- a/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-queries.libsonnet @@ -82,6 +82,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; queries:: { // Define the supported replacement variables in a single place. Most of them are frequently used. 
local variables = { + requestsPerSecondMetric: $.queries.requests_per_second_metric, gatewayMatcher: $.jobMatcher($._config.job_names.gateway), distributorMatcher: $.jobMatcher($._config.job_names.distributor), ingesterMatcher: $.jobMatcher($._config.job_names.ingester), @@ -90,18 +91,21 @@ local utils = import 'mixin-utils/utils.libsonnet'; alertmanagerMatcher: $.jobMatcher($._config.job_names.alertmanager), namespaceMatcher: $.namespaceMatcher(), storeGatewayMatcher: $.jobMatcher($._config.job_names.store_gateway), + rulerQueryFrontendMatcher: $.jobMatcher($._config.job_names.ruler_query_frontend), writeHTTPRoutesRegex: $.queries.write_http_routes_regex, writeDistributorRoutesRegex: std.join('|', [$.queries.write_grpc_distributor_routes_regex, $.queries.write_http_routes_regex]), writeGRPCIngesterRoute: $.queries.write_grpc_ingester_route, readHTTPRoutesRegex: $.queries.read_http_routes_regex, readGRPCIngesterRoute: $.queries.read_grpc_ingester_route, readGRPCStoreGatewayRoute: $.queries.read_grpc_store_gateway_route, + rulerQueryFrontendRoutesRegex: $.queries.ruler_query_frontend_routes_regex, perClusterLabel: $._config.per_cluster_label, recordingRulePrefix: $.recordingRulePrefix($.jobSelector('any')), // The job name does not matter here. groupPrefixJobs: $._config.group_prefix_jobs, instance: $._config.per_instance_label, }, + requests_per_second_metric: 'cortex_request_duration_seconds', write_http_routes_regex: 'api_(v1|prom)_push|otlp_v1_metrics', write_grpc_distributor_routes_regex: '/distributor.Distributor/Push|/httpgrpc.*', write_grpc_ingester_route: '/cortex.Ingester/Push', @@ -111,10 +115,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?', alertmanager_http_routes_regex: 'api_v1_alerts|alertmanager', alertmanager_grpc_routes_regex: '/alertmanagerpb.Alertmanager/HandleRequest', + // Both support gRPC and HTTP requests.
HTTP request is used when rule evaluation query requests go through the query-tee. + ruler_query_frontend_routes_regex: '/httpgrpc.HTTP/Handle|.*api_v1_query', gateway: { local p = self, - requestsPerSecondMetric: 'cortex_request_duration_seconds', + requestsPerSecondMetric: $.queries.requests_per_second_metric, writeRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"' % variables, readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, @@ -127,7 +133,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; distributor: { local p = self, - requestsPerSecondMetric: 'cortex_request_duration_seconds', + requestsPerSecondMetric: $.queries.requests_per_second_metric, writeRequestsPerSecondRouteRegex: '%(writeDistributorRoutesRegex)s' % variables, writeRequestsPerSecondSelector: '%(distributorMatcher)s, route=~"%(writeDistributorRoutesRegex)s"' % variables, samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables, @@ -139,7 +145,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; query_frontend: { local p = self, - requestsPerSecondMetric: 'cortex_request_duration_seconds', + requestsPerSecondMetric: $.queries.requests_per_second_metric, readRequestsPerSecondSelector: '%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables, // These query routes are used in the overview and other dashboard, everythign else is considered "other" queries. // Has to be a list to keep the same colors as before, see overridesNonErrorColorsPalette. 
@@ -198,7 +204,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, ruler: { - requestsPerSecondMetric: 'cortex_request_duration_seconds', + requestsPerSecondMetric: $.queries.requests_per_second_metric, evaluations: { successPerSecond: ||| @@ -253,7 +259,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, alertmanager: { - requestsPerSecondMetric: 'cortex_request_duration_seconds', + requestsPerSecondMetric: $.queries.requests_per_second_metric, notifications: { // Notifications / sec attempted to deliver by the Alertmanager to the receivers. totalPerSecond: ||| @@ -293,7 +299,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, ingester: { - requestsPerSecondMetric: 'cortex_request_duration_seconds', + requestsPerSecondMetric: $.queries.requests_per_second_metric, readRequestsPerSecondSelector: '%(ingesterMatcher)s,route=~"%(readGRPCIngesterRoute)s"' % variables, writeRequestsPerSecondSelector: '%(ingesterMatcher)s, route="%(writeGRPCIngesterRoute)s"' % variables, @@ -324,8 +330,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, store_gateway: { - requestsPerSecondMetric: 'cortex_request_duration_seconds', + requestsPerSecondMetric: $.queries.requests_per_second_metric, readRequestsPerSecondSelector: '%(storeGatewayMatcher)s,route=~"%(readGRPCStoreGatewayRoute)s"' % variables, }, + + ruler_query_frontend: { + requestsPerSecondMetric: $.queries.requests_per_second_metric, + readRequestsPerSecondSelector: '%(rulerQueryFrontendMatcher)s,route=~"%(rulerQueryFrontendRoutesRegex)s"' % variables, + }, }, } diff --git a/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet b/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet index e706808800c..a2f364671d5 100644 --- a/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet +++ b/operations/mimir-mixin/dashboards/remote-ruler-reads.libsonnet @@ -3,9 +3,6 @@ local filename = 'mimir-remote-ruler-reads.json'; (import 'dashboard-utils.libsonnet') + (import 
'dashboard-queries.libsonnet') { - // Both support gRPC and HTTP requests. HTTP request is used when rule evaluation query requests go through the query-tee. - local rulerRoutesRegex = '/httpgrpc.HTTP/Handle|.*api_v1_query', - [filename]: assert std.md5(filename) == 'f103238f7f5ab2f1345ce650cbfbfe2f' : 'UID of the dashboard has changed, please update references to dashboard.'; ($.dashboard('Remote ruler reads') + { uid: std.md5(filename) }) @@ -33,19 +30,7 @@ local filename = 'mimir-remote-ruler-reads.json'; }) .addPanel( $.panel('Evaluations / sec') + - $.statPanel(||| - sum( - rate( - cortex_request_duration_seconds_count{ - %(queryFrontend)s, - route=~"%(rulerRoutesRegex)s" - }[$__rate_interval] - ) - ) - ||| % { - queryFrontend: $.jobMatcher($._config.job_names.ruler_query_frontend), - rulerRoutesRegex: rulerRoutesRegex, - }, format='reqps') + + $.statPanel(utils.ncHistogramSumBy(utils.ncHistogramCountRate($.queries.ruler_query_frontend.requestsPerSecondMetric, $.queries.ruler_query_frontend.readRequestsPerSecondSelector)), format='reqps') + $.panelDescription( 'Evaluations per second', ||| @@ -58,7 +43,7 @@ local filename = 'mimir-remote-ruler-reads.json'; queryFrontendJobName=$._config.job_names.ruler_query_frontend, querySchedulerJobName=$._config.job_names.ruler_query_scheduler, querierJobName=$._config.job_names.ruler_querier, - queryRoutesRegex=rulerRoutesRegex, + queryRoutesRegex=$.queries.ruler_query_frontend_routes_regex, rowTitlePrefix='Ruler-', )) From 029a3e7310d7ea8b2251fdda39cde8e576c2b658 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Tue, 23 Jul 2024 12:41:52 +0200 Subject: [PATCH 2/2] Fix failing lint checks Signed-off-by: Yuri Nikolic --- .../templates/metamonitoring/grafana-dashboards.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml 
b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml index 2787479de40..05a4206ec4d 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml @@ -24773,7 +24773,13 @@ data: "steppedLine": false, "targets": [ { - "expr": "sum(\n rate(\n cortex_request_duration_seconds_count{\n cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\",\n route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"\n }[$__rate_interval]\n )\n)\n", + "expr": "sum (rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\",route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval])) < ($latency_metrics * +Inf)", + "format": "time_series", + "instant": true, + "refId": "A_classic" + }, + { + "expr": "sum (histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((ruler-query-frontend.*))\",route=~\"/httpgrpc.HTTP/Handle|.*api_v1_query\"}[$__rate_interval]))) < ($latency_metrics * -Inf)", "format": "time_series", "instant": true, "refId": "A"