diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 550524c2700..16b020e5559 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -4377,14 +4377,14 @@ }, { "kind": "field", - "name": "ruler_max_concurrent_rule_evaluations_per_tenant", + "name": "ruler_max_independent_rule_evaluation_concurrency_per_tenant", "required": false, - "desc": "Maximum number of independent rules that can run concurrently for each tenant. Depends on `-ruler.max-global-rule-evaluation-concurrency` being more than 0 and ideally this flag should be a lower value. 0 to disable.", + "desc": "Maximum number of independent rules that can run concurrently for each tenant. Depends on ruler.max-independent-rule-evaluation-concurrency being greater than 0. Ideally this flag should be a lower value. 0 to disable.", "fieldValue": null, "fieldDefaultValue": 4, - "fieldFlag": "ruler.max-concurrent-rule-evaluations-per-tenant", + "fieldFlag": "ruler.max-independent-rule-evaluation-concurrency-per-tenant", "fieldType": "int", - "fieldCategory": "experimental" + "fieldCategory": "advanced" }, { "kind": "field", @@ -12570,12 +12570,12 @@ }, { "kind": "field", - "name": "max_global_rule_evaluation_concurrency", + "name": "max_independent_rule_evaluation_concurrency", "required": false, - "desc": "Allow rules that don't have dependencies to be evaluated concurrently. 0 to disable.", + "desc": "Number of rules rules that don't have dependencies that we allow to be evaluated concurrently across all tenants. 0 to disable.", "fieldValue": null, "fieldDefaultValue": 0, - "fieldFlag": "ruler.max-global-rule-evaluation-concurrency", + "fieldFlag": "ruler.max-independent-rule-evaluation-concurrency", "fieldType": "int", "fieldCategory": "experimental" } diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 15e52aad236..8e86c5cf83a 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2623,10 +2623,10 @@ Usage of ./cmd/mimir/mimir: This grace period controls which alerts the ruler restores after a restart. Alerts with "for" duration lower than this grace period are not restored after a ruler restart. This means that if the alerts have been firing before the ruler restarted, they will now go to pending state and then to firing again after their "for" duration expires. Alerts with "for" duration greater than or equal to this grace period that have been pending before the ruler restart will remain in pending state for at least this grace period. Alerts with "for" duration greater than or equal to this grace period that have been firing before the ruler restart will continue to be firing after the restart. (default 2m0s) -ruler.for-outage-tolerance duration Max time to tolerate outage for restoring "for" state of alert. (default 1h0m0s) - -ruler.max-concurrent-rule-evaluations-per-tenant int - [experimental] Maximum number of independent rules that can run concurrently for each tenant. Depends on `-ruler.max-global-rule-evaluation-concurrency` being more than 0 and ideally this flag should be a lower value. 0 to disable. (default 4) - -ruler.max-global-rule-evaluation-concurrency int - [experimental] Allow rules that don't have dependencies to be evaluated concurrently. 0 to disable. + -ruler.max-independent-rule-evaluation-concurrency int + [experimental] Number of rules rules that don't have dependencies that we allow to be evaluated concurrently across all tenants. 0 to disable. + -ruler.max-independent-rule-evaluation-concurrency-per-tenant int + Maximum number of independent rules that can run concurrently for each tenant. Depends on ruler.max-independent-rule-evaluation-concurrency being greater than 0. Ideally this flag should be a lower value. 0 to disable. (default 4) -ruler.max-rule-groups-per-tenant int Maximum number of rule groups per-tenant. 0 to disable. (default 70) -ruler.max-rule-groups-per-tenant-by-namespace value diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 46cb18a9adc..9118b9238fb 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -2082,10 +2082,17 @@ tenant_federation: # CLI flag: -ruler.tenant-federation.enabled [enabled: | default = false] -# (experimental) Allow rules that don't have dependencies to be evaluated -# concurrently. 0 to disable. -# CLI flag: -ruler.max-global-rule-evaluation-concurrency -[max_global_rule_evaluation_concurrency: | default = 0] +# (experimental) Number of rules rules that don't have dependencies that we +# allow to be evaluated concurrently across all tenants. 0 to disable. +# CLI flag: -ruler.max-independent-rule-evaluation-concurrency +[max_independent_rule_evaluation_concurrency: | default = 0] + +# (experimental) Threshold of the interval to last rule group runtime duration +# to allow a rule to be evaluated concurrency. Expressed in terms of percentage. +# By default, the rule group runtime duration must exceed 50% of the evaluation +# interval. +# CLI flag: -ruler.threshold-independent-rule-evaluation-concurrency +[threshold_independent_rule_evaluation_concurrency: | default = 50] ``` ### ruler_storage @@ -3527,11 +3534,11 @@ The `limits` block configures default and per-tenant limits imposed by component # CLI flag: -ruler.protected-namespaces [ruler_protected_namespaces: | default = ""] -# (experimental) Maximum number of independent rules that can run concurrently -# for each tenant. Depends on `-ruler.max-global-rule-evaluation-concurrency` -# being more than 0 and ideally this flag should be a lower value. 0 to disable. -# CLI flag: -ruler.max-concurrent-rule-evaluations-per-tenant -[ruler_max_concurrent_rule_evaluations_per_tenant: | default = 4] +# (advanced) Maximum number of independent rules that can run concurrently for +# each tenant. Depends on ruler.max-independent-rule-evaluation-concurrency +# being greater than 0. Ideally this flag should be a lower value. 0 to disable. +# CLI flag: -ruler.max-independent-rule-evaluation-concurrency-per-tenant +[ruler_max_independent_rule_evaluation_concurrency_per_tenant: | default = 4] # The tenant's shard size, used when store-gateway sharding is enabled. Value of # 0 disables shuffle sharding for the tenant, that is all tenant blocks are diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 55eea10976b..451772e3a7e 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -872,8 +872,8 @@ func (t *Mimir) initRuler() (serv services.Service, err error) { var concurrencyController ruler.MultiTenantRuleConcurrencyController concurrencyController = &ruler.NoopConcurrencyController{} - if t.Cfg.Ruler.MaxGlobalRuleEvaluationConcurrency > 0 { - concurrencyController = ruler.NewMultiTenantConcurrencyController(util_log.Logger, t.Cfg.Ruler.MaxGlobalRuleEvaluationConcurrency, t.Overrides, t.Registerer) + if t.Cfg.Ruler.MaxIndependentRuleEvaluationConcurrency > 0 { + concurrencyController = ruler.NewMultiTenantConcurrencyController(util_log.Logger, t.Cfg.Ruler.MaxIndependentRuleEvaluationConcurrency, t.Overrides, t.Registerer) } managerFactory := ruler.DefaultTenantManagerFactory( t.Cfg.Ruler, diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 69f0c67fb65..82538af17b5 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -151,7 +151,7 @@ type RulesLimits interface { RulerAlertingRulesEvaluationEnabled(userID string) bool RulerSyncRulesOnChangesEnabled(userID string) bool RulerProtectedNamespaces(userID string) []string - RulerMaxConcurrentRuleEvaluationsPerTenant(userID string) int64 + RulerMaxIndependentRuleEvaluationConcurrencyPerTenant(userID string) int64 } func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter, remoteQuerier bool) rules.QueryFunc { diff --git a/pkg/ruler/rule_concurrency.go b/pkg/ruler/rule_concurrency.go index 2333192276a..fbc320ebc30 100644 --- a/pkg/ruler/rule_concurrency.go +++ b/pkg/ruler/rule_concurrency.go @@ -32,9 +32,8 @@ type DynamicSemaphore struct { } // NewDynamicSemaphore creates a new DynamicSemaphore -func NewDynamicSemaphore(logger log.Logger, maxConcurrency func() int64) *DynamicSemaphore { +func NewDynamicSemaphore(maxConcurrency func() int64) *DynamicSemaphore { return &DynamicSemaphore{ - logger: logger, maxConcurrency: maxConcurrency, acquired: 0, } @@ -173,8 +172,8 @@ func (c *MultiTenantConcurrencyController) Allow(ctx context.Context, group *rul c.mtx.Lock() tc, ok := c.tenantConcurrency[userID] if !ok { - tc = NewDynamicSemaphore(c.logger, func() int64 { - return c.limits.RulerMaxConcurrentRuleEvaluationsPerTenant(userID) + tc = NewDynamicSemaphore(func() int64 { + return c.limits.RulerMaxIndependentRuleEvaluationConcurrencyPerTenant(userID) }) c.tenantConcurrency[userID] = tc } diff --git a/pkg/ruler/rule_concurrency_test.go b/pkg/ruler/rule_concurrency_test.go index 1c4d27fb7aa..fe4ab93ecb2 100644 --- a/pkg/ruler/rule_concurrency_test.go +++ b/pkg/ruler/rule_concurrency_test.go @@ -90,9 +90,9 @@ func TestMultiTenantConcurrencyController(t *testing.T) { reg := prometheus.NewPedanticRegistry() limits := validation.MockOverrides(func(_ *validation.Limits, tenantLimits map[string]*validation.Limits) { tenantLimits["user1"] = validation.MockDefaultLimits() - tenantLimits["user1"].RulerMaxConcurrentRuleEvaluationsPerTenant = 2 + tenantLimits["user1"].RulerMaxConcurrentRuleEvaluations = 2 tenantLimits["user2"] = validation.MockDefaultLimits() - tenantLimits["user2"].RulerMaxConcurrentRuleEvaluationsPerTenant = 2 + tenantLimits["user2"].RulerMaxConcurrentRuleEvaluations = 2 }) rg := rules.NewGroup(rules.GroupOptions{ diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 1facec0f1d4..7e958d4faa3 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -137,7 +137,7 @@ type Config struct { RingCheckPeriod time.Duration `yaml:"-"` rulerSyncQueuePollFrequency time.Duration `yaml:"-"` - MaxGlobalRuleEvaluationConcurrency int64 `yaml:"max_global_rule_evaluation_concurrency" category:"experimental"` + MaxIndependentRuleEvaluationConcurrency int64 `yaml:"max_independent_rule_evaluation_concurrency" category:"experimental"` } // Validate config and returns error on failure @@ -191,7 +191,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per-tenant metric and as an info level log message.") - f.Int64Var(&cfg.MaxGlobalRuleEvaluationConcurrency, "ruler.max-global-rule-evaluation-concurrency", 0, "Allow rules that don't have dependencies to be evaluated concurrently. 0 to disable.") + f.Int64Var(&cfg.MaxIndependentRuleEvaluationConcurrency, "ruler.max-independent-rule-evaluation-concurrency", 0, "Number of rules rules that don't have dependencies that we allow to be evaluated concurrently across all tenants. 0 to disable.") cfg.RingCheckPeriod = 5 * time.Second } diff --git a/pkg/util/validation/limits.go b/pkg/util/validation/limits.go index 7bfb0185e7a..20e55a2fe3d 100644 --- a/pkg/util/validation/limits.go +++ b/pkg/util/validation/limits.go @@ -178,17 +178,17 @@ type Limits struct { ActiveSeriesResultsMaxSizeBytes int `yaml:"active_series_results_max_size_bytes" json:"active_series_results_max_size_bytes" category:"experimental"` // Ruler defaults and limits. - RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` - RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` - RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"` - RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` - RulerRecordingRulesEvaluationEnabled bool `yaml:"ruler_recording_rules_evaluation_enabled" json:"ruler_recording_rules_evaluation_enabled"` - RulerAlertingRulesEvaluationEnabled bool `yaml:"ruler_alerting_rules_evaluation_enabled" json:"ruler_alerting_rules_evaluation_enabled"` - RulerSyncRulesOnChangesEnabled bool `yaml:"ruler_sync_rules_on_changes_enabled" json:"ruler_sync_rules_on_changes_enabled" category:"advanced"` - RulerMaxRulesPerRuleGroupByNamespace LimitsMap[int] `yaml:"ruler_max_rules_per_rule_group_by_namespace" json:"ruler_max_rules_per_rule_group_by_namespace" category:"experimental"` - RulerMaxRuleGroupsPerTenantByNamespace LimitsMap[int] `yaml:"ruler_max_rule_groups_per_tenant_by_namespace" json:"ruler_max_rule_groups_per_tenant_by_namespace" category:"experimental"` - RulerProtectedNamespaces flagext.StringSliceCSV `yaml:"ruler_protected_namespaces" json:"ruler_protected_namespaces" category:"experimental"` - RulerMaxConcurrentRuleEvaluationsPerTenant int64 `yaml:"ruler_max_concurrent_rule_evaluations_per_tenant" json:"ruler_max_concurrent_rule_evaluations_per_tenant" category:"experimental"` + RulerEvaluationDelay model.Duration `yaml:"ruler_evaluation_delay_duration" json:"ruler_evaluation_delay_duration"` + RulerTenantShardSize int `yaml:"ruler_tenant_shard_size" json:"ruler_tenant_shard_size"` + RulerMaxRulesPerRuleGroup int `yaml:"ruler_max_rules_per_rule_group" json:"ruler_max_rules_per_rule_group"` + RulerMaxRuleGroupsPerTenant int `yaml:"ruler_max_rule_groups_per_tenant" json:"ruler_max_rule_groups_per_tenant"` + RulerRecordingRulesEvaluationEnabled bool `yaml:"ruler_recording_rules_evaluation_enabled" json:"ruler_recording_rules_evaluation_enabled"` + RulerAlertingRulesEvaluationEnabled bool `yaml:"ruler_alerting_rules_evaluation_enabled" json:"ruler_alerting_rules_evaluation_enabled"` + RulerSyncRulesOnChangesEnabled bool `yaml:"ruler_sync_rules_on_changes_enabled" json:"ruler_sync_rules_on_changes_enabled" category:"advanced"` + RulerMaxRulesPerRuleGroupByNamespace LimitsMap[int] `yaml:"ruler_max_rules_per_rule_group_by_namespace" json:"ruler_max_rules_per_rule_group_by_namespace" category:"experimental"` + RulerMaxRuleGroupsPerTenantByNamespace LimitsMap[int] `yaml:"ruler_max_rule_groups_per_tenant_by_namespace" json:"ruler_max_rule_groups_per_tenant_by_namespace" category:"experimental"` + RulerProtectedNamespaces flagext.StringSliceCSV `yaml:"ruler_protected_namespaces" json:"ruler_protected_namespaces" category:"experimental"` + RulerMaxIndependentRuleEvaluationConcurrencyPerTenant int64 `yaml:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" json:"ruler_max_independent_rule_evaluation_concurrency_per_tenant" category:"advanced"` // Store-gateway. StoreGatewayTenantShardSize int `yaml:"store_gateway_tenant_shard_size" json:"store_gateway_tenant_shard_size"` @@ -321,7 +321,7 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { } f.Var(&l.RulerMaxRuleGroupsPerTenantByNamespace, "ruler.max-rule-groups-per-tenant-by-namespace", "Maximum number of rule groups per tenant by namespace. Value is a map, where each key is the namespace and value is the number of rule groups allowed in the namespace (int). On the command line, this map is given in a JSON format. The number of rule groups specified has the same meaning as -ruler.max-rule-groups-per-tenant, but only applies for the specific namespace. If specified, it supersedes -ruler.max-rule-groups-per-tenant.") f.Var(&l.RulerProtectedNamespaces, "ruler.protected-namespaces", "List of namespaces that are protected from modification unless a special HTTP header is used. If a namespace is protected, it can only be read, not modified via the ruler's configuration API. The value is a list of strings, where each string is a namespace name. On the command line, this list is given as a comma-separated list.") - f.Int64Var(&l.RulerMaxConcurrentRuleEvaluationsPerTenant, "ruler.max-concurrent-rule-evaluations-per-tenant", 4, "Maximum number of independent rules that can run concurrently for each tenant. Depends on `-ruler.max-global-rule-evaluation-concurrency` being more than 0 and ideally this flag should be a lower value. 0 to disable.") + f.Int64Var(&l.RulerMaxIndependentRuleEvaluationConcurrencyPerTenant, "ruler.max-independent-rule-evaluation-concurrency-per-tenant", 4, "Maximum number of independent rules that can run concurrently for each tenant. Depends on ruler.max-independent-rule-evaluation-concurrency being greater than 0. Ideally this flag should be a lower value. 0 to disable.") f.Var(&l.CompactorBlocksRetentionPeriod, "compactor.blocks-retention-period", "Delete blocks containing samples older than the specified retention period. Also used by query-frontend to avoid querying beyond the retention period by instant, range or remote read queries. 0 to disable.") f.IntVar(&l.CompactorSplitAndMergeShards, "compactor.split-and-merge-shards", 0, "The number of shards to use when splitting blocks. 0 to disable splitting.") @@ -901,9 +901,9 @@ func (o *Overrides) RulerSyncRulesOnChangesEnabled(userID string) bool { return o.getOverridesForUser(userID).RulerSyncRulesOnChangesEnabled } -// RulerMaxConcurrentRuleEvaluationsPerTenant returns the maximum number of independent rules that can run concurrently for a given user. -func (o *Overrides) RulerMaxConcurrentRuleEvaluationsPerTenant(userID string) int64 { - return o.getOverridesForUser(userID).RulerMaxConcurrentRuleEvaluationsPerTenant +// RulerMaxIndependentRuleEvaluationConcurrencyPerTenant returns the maximum number of independent rules that can run concurrently for a given user. +func (o *Overrides) RulerMaxIndependentRuleEvaluationConcurrencyPerTenant(userID string) int64 { + return o.getOverridesForUser(userID).RulerMaxIndependentRuleEvaluationConcurrencyPerTenant } // StoreGatewayTenantShardSize returns the store-gateway shard size for a given user. diff --git a/pkg/util/validation/limits_test.go b/pkg/util/validation/limits_test.go index 202faed1465..134b2e8449d 100644 --- a/pkg/util/validation/limits_test.go +++ b/pkg/util/validation/limits_test.go @@ -977,27 +977,27 @@ func TestRulerMaxConcurrentRuleEvaluationsPerTenantOverrides(t *testing.T) { }{ "no user specific concurrency": { inputYAML: ` -ruler_max_concurrent_rule_evaluations_per_tenant: 5 +ruler_max_independent_rule_evaluation_concurrency_per_tenant: 5 `, expectedPerTenantConcurrency: 5, }, "default limit for not specific user": { inputYAML: ` -ruler_max_concurrent_rule_evaluations_per_tenant: 5 +ruler_max_independent_rule_evaluation_concurrency_per_tenant: 5 `, overrides: ` randomuser: - ruler_max_concurrent_rule_evaluations_per_tenant: 10 + ruler_max_independent_rule_evaluation_concurrency_per_tenant: 10 `, expectedPerTenantConcurrency: 5, }, "overridden limit for specific user": { inputYAML: ` -ruler_max_concurrent_rule_evaluations_per_tenant: 5 +ruler_max_independent_rule_evaluation_concurrency_per_tenant: 5 `, overrides: ` user1: - ruler_max_concurrent_rule_evaluations_per_tenant: 15 + ruler_max_independent_rule_evaluation_concurrency_per_tenant: 15 `, expectedPerTenantConcurrency: 15, }, @@ -1019,7 +1019,7 @@ user1: ov, err := NewOverrides(LimitsYAML, tl) require.NoError(t, err) - require.Equal(t, tt.expectedPerTenantConcurrency, ov.RulerMaxConcurrentRuleEvaluationsPerTenant("user1")) + require.Equal(t, tt.expectedPerTenantConcurrency, ov.RulerMaxIndependentRuleEvaluationConcurrencyPerTenant("user1")) }) } }