From 519934531ccd52a0862a8ded80b4bc39e34f89b4 Mon Sep 17 00:00:00 2001 From: David Grant Date: Thu, 25 Jul 2024 21:47:33 -0700 Subject: [PATCH] Add store-gateway autoscaling row to reads dashboard (#8824) * Add store-gateway autoscaling row. * Build helm. * Disable store-gateway autoscale row by default. Fix a typo (woudln't). * Changelog entry. --- CHANGELOG.md | 1 + .../metamonitoring/grafana-dashboards.yaml | 6 ++-- .../dashboards/mimir-reads.json | 2 +- .../dashboards/mimir-remote-ruler-reads.json | 2 +- .../dashboards/mimir-writes.json | 2 +- .../dashboards/mimir-reads.json | 2 +- .../dashboards/mimir-remote-ruler-reads.json | 2 +- .../dashboards/mimir-writes.json | 2 +- operations/mimir-mixin/config.libsonnet | 4 +++ .../dashboards/dashboard-utils.libsonnet | 2 +- .../mimir-mixin/dashboards/reads.libsonnet | 34 ++++++++++++++++++- 11 files changed, 48 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8707df8b6e6..90b90f6a548 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -83,6 +83,7 @@ * [ENHANCEMENT] Alerts: `MimirRunningIngesterReceiveDelayTooHigh` alert has been tuned to be more reactive to high receive delay. #8538 * [ENHANCEMENT] Dashboards: improve end-to-end latency and strong read consistency panels when experimental ingest storage is enabled. #8543 * [ENHANCEMENT] Dashboards: Add panels for monitoring ingester autoscaling when not using ingest-storage. These panels are disabled by default, but can be enabled using the `autoscaling.ingester.enabled: true` config option. #8484 +* [ENHANCEMENT] Dashboards: Add panels for monitoring store-gateway autoscaling. These panels are disabled by default, but can be enabled using the `autoscaling.store_gateway.enabled: true` config option. #8824 * [ENHANCEMENT] Dashboards: add panels to show writes to experimental ingest storage backend in the "Mimir / Ruler" dashboard, when `_config.show_ingest_storage_panels` is enabled. #8732 * [ENHANCEMENT] Dashboards: show all series in tooltips on time series dashboard panels. #8748 * [ENHANCEMENT] Dashboards: add compactor autoscaling panels to "Mimir / Compactor" dashboard. The panels are disabled by default, but can be enabled setting `_config.autoscaling.compactor.enabled` to `true`. #8777 diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml index 05a4206ec4d..9ad22cbf66b 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml @@ -20286,7 +20286,7 @@ data: }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { @@ -26080,7 +26080,7 @@ data: }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { @@ -40380,7 +40380,7 @@ data: }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json index 2b01367205a..dca7f232a96 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json @@ -2575,7 +2575,7 @@ }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json index 5a6c13faab8..cdd34971485 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json @@ -1398,7 +1398,7 @@ }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index 6ae1c98a496..7dc271175bb 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -1335,7 +1335,7 @@ }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json index 7e3243f4cff..6025c0773c1 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json @@ -2575,7 +2575,7 @@ }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json index b4defc0f56a..e02339afdc2 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json @@ -1398,7 +1398,7 @@ }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index 80226b0a8d1..abb26f92b41 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -1335,7 +1335,7 @@ }, { "datasource": "$datasource", - "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly.\n\n", + "description": "### Autoscaler failures rate\nThe rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom\nmetrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly.\n\n", "fieldConfig": { "defaults": { "custom": { diff --git a/operations/mimir-mixin/config.libsonnet b/operations/mimir-mixin/config.libsonnet index e166336b44c..6f37127a970 100644 --- a/operations/mimir-mixin/config.libsonnet +++ b/operations/mimir-mixin/config.libsonnet @@ -630,6 +630,10 @@ enabled: false, hpa_name: $._config.autoscaling_hpa_prefix + 'ruler-querier', }, + store_gateway: { + enabled: false, + hpa_name: $._config.autoscaling_hpa_prefix + 'store-gateway-zone-a', + }, distributor: { enabled: false, hpa_name: $._config.autoscaling_hpa_prefix + 'distributor', diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index 34c0d513a28..26dabc751e0 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -774,7 +774,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; title, ||| The rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom - metrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly. + metrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly. ||| ), diff --git a/operations/mimir-mixin/dashboards/reads.libsonnet b/operations/mimir-mixin/dashboards/reads.libsonnet index b079f2f2ee0..eb71f708c4e 100644 --- a/operations/mimir-mixin/dashboards/reads.libsonnet +++ b/operations/mimir-mixin/dashboards/reads.libsonnet @@ -296,11 +296,43 @@ local filename = 'mimir-reads.json'; title, ||| The rate of failures in the KEDA custom metrics API server. Whenever an error occurs, the KEDA custom - metrics server is unable to query the scaling metric from Prometheus so the autoscaler woudln't work properly. + metrics server is unable to query the scaling metric from Prometheus so the autoscaler wouldn't work properly. ||| ), ) ) + .addRowIf( + $._config.autoscaling.store_gateway.enabled, + $.row('Store-gateway – autoscaling') + .addPanel( + $.autoScalingActualReplicas('store_gateway') + { title: 'Replicas (leader zone)' } + + $.panelDescription( + 'Replicas (leader zone)', + ||| + The minimum, maximum, and current number of replicas for the leader zone of store-gateways. + Other zones scale to follow this zone (with delay for downscale). + ||| + ) + ) + .addPanel( + $.timeseriesPanel('Replicas') + + $.panelDescription('Replicas', 'Number of store-gateway replicas per zone.') + + $.queryPanel( + [ + 'sum by (%s) (up{%s})' % [$._config.per_job_label, $.jobMatcher($._config.job_names.store_gateway)], + ], + [ + '{{ %(per_job_label)s }}' % $._config.per_job_label, + ], + ), + ) + .addPanel( + $.autoScalingDesiredReplicasByValueScalingMetricPanel('store_gateway', '', '') + { title: 'Desired replicas (leader zone)' } + ) + .addPanel( + $.autoScalingFailuresPanel('store_gateway') + { title: 'Autoscaler failures rate' } + ), + ) .addRow( $.kvStoreRow('Store-gateway – key-value store for store-gateways ring', 'store_gateway', 'store-gateway') )