autoscaling: add querier predictive scaling #7775

Merged · 8 commits · Apr 26, 2024

Changes from all commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -66,6 +66,7 @@
* [ENHANCEMENT] Compactor: add `$._config.cortex_compactor_concurrent_rollout_enabled` option (disabled by default) that makes use of rollout-operator to speed up the rollout of compactors. #7783 #7878
* [ENHANCEMENT] Shuffle-sharding: add `$._config.shuffle_sharding.ingest_storage_partitions_enabled` and `$._config.shuffle_sharding.ingester_partitions_shard_size` options, that allow configuring partitions shard size in ingest-storage mode. #7804
* [ENHANCEMENT] Rollout-operator: upgrade to v0.14.0.
* [ENHANCEMENT] Add `_config.autoscaling_querier_predictive_scaling_enabled` to scale queriers based on the inflight queries observed 7 days ago. #7775
* [BUGFIX] Guard against missing samples in KEDA queries. #7691

### Mimirtool
1 change: 1 addition & 0 deletions operations/helm/charts/mimir-distributed/CHANGELOG.md
@@ -80,6 +80,7 @@ Entries should include a reference to the Pull Request that introduced the change.
* [ENHANCEMENT] Recording rules: add native histogram recording rules to `cortex_request_duration_seconds`. #7528
* [ENHANCEMENT] Make the port used in ServiceMonitor for kube-state-metrics configurable. #7507
* [ENHANCEMENT] Produce a clearer error messages when multiple X-Scope-OrgID headers are present. #7704
* [ENHANCEMENT] Add `querier.kedaAutoscaling.predictiveScalingEnabled` to scale queriers based on the inflight queries observed 7 days ago. #7775
* [BUGFIX] Metamonitoring: update dashboards to drop unsupported `step` parameter in targets. #7157
* [BUGFIX] Recording rules: drop rules for metrics removed in 2.0: `cortex_memcache_request_duration_seconds` and `cortex_cache_request_duration_seconds`. #7514
* [BUGFIX] Store-gateway: setting "resources.requests.memory" with a quantity that used power-of-ten SI suffix, caused an error. #7506
@@ -44,4 +44,16 @@ spec:
{{- end }}
name: cortex_querier_hpa_default_requests_duration
type: prometheus
{{- end }}
{{- $autoscaling := .Values.querier.kedaAutoscaling -}}
{{- if .Values.querier.kedaAutoscaling.predictiveScalingEnabled }}
- metadata:
query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="query-scheduler",namespace="{{ .Release.Namespace }}",quantile="0.5"}[{{$autoscaling.predictiveScalingLookback}}] offset {{$autoscaling.predictiveScalingPeriod}}))
serverAddress: {{ include "mimir.kedaPrometheusAddress" (dict "ctx" $) }}
threshold: {{ $autoscaling.querySchedulerInflightRequestsThreshold | quote }}
{{- if .Values.kedaAutoscaling.customHeaders }}
customHeaders: {{ (include "mimir.lib.mapToCSVString" (dict "map" .Values.kedaAutoscaling.customHeaders)) | quote }}
{{- end }}
name: cortex_querier_hpa_default
type: prometheus
{{- end }}
{{- end }}
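Taken together, a chart user opts in from their values file. A minimal sketch using only the keys introduced in this diff (the values shown are the chart defaults, with the feature flipped on):

```yaml
querier:
  kedaAutoscaling:
    # Also scale on the inflight queries observed just under 7 days ago.
    predictiveScalingEnabled: true
    predictiveScalingPeriod: 6d23h30m
    predictiveScalingLookback: 30m
```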
11 changes: 11 additions & 0 deletions operations/helm/charts/mimir-distributed/values.yaml
@@ -1289,6 +1289,17 @@ querier:
minReplicaCount: 1
maxReplicaCount: 10
querySchedulerInflightRequestsThreshold: 12
# -- predictiveScalingEnabled scales up the querier based on inflight requests observed in the past.
# This helps with scaling up ahead of predictable traffic patterns and minimizes HTTP 429 responses caused by a full query queue.
# False positives can increase the querier TCO.
predictiveScalingEnabled: false
# -- The offset into the past at which scheduler metrics are evaluated for predictive scaling.
# This is usually slightly shorter than the period of the repeating query events, to give scale-up some lead time.
predictiveScalingPeriod: 6d23h30m
# -- The time range over which scheduler metrics are aggregated for predictive scaling.
# For example: if the lookback is 30m and the period is 6d23h30m,
# the querier scales based on the maximum inflight queries between 6d23h30m and 7d ago.
predictiveScalingLookback: 30m
behavior:
scaleDown:
policies:
@@ -2140,6 +2140,14 @@ spec:
threshold: "7"
name: cortex_querier_hpa_default_requests_duration
type: prometheus
- metadata:
metricName: cortex_querier_hpa_default_7d_offset
query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="query-scheduler",namespace="default",quantile="0.5"}[30m]
offset 6d23h30m))
serverAddress: http://prometheus.default:9090/prometheus
threshold: "7"
name: cortex_querier_hpa_default_7d_offset
type: prometheus
---
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
8 changes: 8 additions & 0 deletions operations/mimir-tests/test-autoscaling-generated.yaml
@@ -2140,6 +2140,14 @@ spec:
threshold: "6"
name: cortex_querier_hpa_default_requests_duration
type: prometheus
- metadata:
metricName: cortex_querier_hpa_default_7d_offset
query: sum(max_over_time(cortex_query_scheduler_inflight_requests{container="query-scheduler",namespace="default",quantile="0.5"}[30m]
offset 6d23h30m))
serverAddress: http://prometheus.default:9090/prometheus
threshold: "6"
name: cortex_querier_hpa_default_7d_offset
type: prometheus
---
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
1 change: 1 addition & 0 deletions operations/mimir-tests/test-autoscaling.jsonnet
@@ -22,6 +22,7 @@ mimir {
autoscaling_querier_enabled: true,
autoscaling_querier_min_replicas: 3,
autoscaling_querier_max_replicas: 30,
autoscaling_querier_predictive_scaling_enabled: true,

autoscaling_ruler_querier_enabled: true,
autoscaling_ruler_querier_min_replicas: 3,
61 changes: 40 additions & 21 deletions operations/mimir/autoscaling.libsonnet
@@ -6,6 +6,9 @@
autoscaling_querier_min_replicas: error 'you must set autoscaling_querier_min_replicas in the _config',
autoscaling_querier_max_replicas: error 'you must set autoscaling_querier_max_replicas in the _config',
autoscaling_querier_target_utilization: 0.75, // Target to utilize 75% querier workers on peak traffic, so we have 25% room for higher peaks.
autoscaling_querier_predictive_scaling_enabled: false, // Use inflight queries from the past to predict the number of queriers needed.
autoscaling_querier_predictive_scaling_period: '6d23h30m',  // The offset into the past at which scheduler metrics are evaluated for predictive scaling. Usually slightly shorter than the period of the repeating query events, to give scale-up some lead time.
autoscaling_querier_predictive_scaling_lookback: '30m',  // The time range over which scheduler metrics are aggregated for predictive scaling. For example: if the lookback is 30m and the period is 6d23h30m, the querier scales based on the maximum inflight queries between 6d23h30m and 7d ago.

autoscaling_ruler_querier_enabled: false,
autoscaling_ruler_querier_min_replicas: error 'you must set autoscaling_ruler_querier_min_replicas in the _config',
@@ -170,31 +173,47 @@
min_replica_count: replicasWithWeight(min_replicas, weight),
max_replica_count: replicasWithWeight(max_replicas, weight),

triggers:
  [
    {
      metric_name: 'cortex_%s_hpa_%s' % [std.strReplace(name, '-', '_'), $._config.namespace],

      // Each query scheduler tracks *at regular intervals* the number of inflight requests
      // (both enqueued and processing queries) as a summary. With the following query we target
      // to have enough querier workers to run the max observed inflight requests 50% of time.
      //
      // This metric covers the case queries are piling up in the query-scheduler queue.
      query: metricWithWeight('sum(max_over_time(cortex_query_scheduler_inflight_requests{container="%s",namespace="%s",quantile="0.5"}[1m]))' % [query_scheduler_container_name, $._config.namespace], weight),

      threshold: '%d' % std.floor(querier_max_concurrent * target_utilization),
    },
    {
      metric_name: 'cortex_%s_hpa_%s_requests_duration' % [std.strReplace(name, '-', '_'), $._config.namespace],

      // The total requests duration / second is a good approximation of the number of querier workers used.
      //
      // This metric covers the case queries are not necessarily piling up in the query-scheduler queue,
      // but queriers are busy.
      query: metricWithWeight('sum(rate(cortex_querier_request_duration_seconds_sum{container="%s",namespace="%s"}[1m]))' % [querier_container_name, $._config.namespace], weight),

      threshold: '%d' % std.floor(querier_max_concurrent * target_utilization),
    },
  ]
+ if !$._config.autoscaling_querier_predictive_scaling_enabled then [] else [
{
metric_name: 'cortex_%s_hpa_%s_7d_offset' % [std.strReplace(name, '-', '_'), $._config.namespace],

// Scale queriers according to how many queriers would have been sufficient to handle the load $period ago.
// We use the query scheduler metric which includes active queries and queries in the queue.
query: metricWithWeight('sum(max_over_time(cortex_query_scheduler_inflight_requests{container="%(container)s",namespace="%(namespace)s",quantile="0.5"}[%(lookback)s] offset %(period)s))' % {
container: query_scheduler_container_name,
namespace: $._config.namespace,
lookback: $._config.autoscaling_querier_predictive_scaling_lookback,
period: $._config.autoscaling_querier_predictive_scaling_period,
}, weight),
threshold: '%d' % std.floor(querier_max_concurrent * target_utilization),
},
],
}) + {
spec+: {
advanced: {
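For intuition on how the extra trigger affects replica counts: KEDA exposes each trigger as an external metric, and the HPA roughly scales to the largest per-trigger demand, clamped to the configured replica bounds. A simplified sketch of that arithmetic, with illustrative numbers rather than Mimir code:

```python
import math

def desired_replicas(trigger_values, thresholds, min_replicas, max_replicas):
    """Sketch of how an HPA combines multiple KEDA triggers: each trigger
    proposes ceil(value / threshold) replicas, the largest proposal wins,
    and the result is clamped to [min_replicas, max_replicas]."""
    proposals = [math.ceil(v / t) for v, t in zip(trigger_values, thresholds)]
    return max(min_replicas, min(max_replicas, max(proposals)))

# Current inflight load only justifies ceil(10/6) = 2 queriers, but 7 days
# ago the 30m max of inflight requests was 60 against a threshold of 6:
print(desired_replicas([10, 60], [6, 6], min_replicas=3, max_replicas=30))  # 10
```

The predictive trigger therefore pre-scales to the capacity that was needed at this time last week, even while the live triggers report low demand.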