From 661df870e924312fbb68170284055205a5fa1915 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Fri, 24 May 2024 13:41:30 +0200 Subject: [PATCH 01/16] Adding circuit breakers on ingester server side for write path Signed-off-by: Yuri Nikolic --- CHANGELOG.md | 1 + cmd/mimir/config-descriptor.json | 87 +++++++ cmd/mimir/help-all.txt.tmpl | 14 ++ cmd/mimir/help.txt.tmpl | 2 + .../mimir/configure/about-versioning.md | 10 +- .../configuration-parameters/index.md | 38 +++ pkg/ingester/circuitbreaker.go | 227 ++++++++++++++++++ pkg/ingester/circuitbreaker_test.go | 193 +++++++++++++++ pkg/ingester/errors.go | 19 ++ pkg/ingester/errors_test.go | 18 ++ pkg/ingester/ingester.go | 34 ++- pkg/ingester/metrics.go | 12 + 12 files changed, 653 insertions(+), 2 deletions(-) create mode 100644 pkg/ingester/circuitbreaker.go create mode 100644 pkg/ingester/circuitbreaker_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a0a113e423..9c27ab7f3aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ * [FEATURE] mimirtool: Add `runtime-config verify` sub-command, for verifying Mimir runtime config files. #8123 * [FEATURE] Query-frontend, querier: new experimental `/cardinality/active_native_histogram_metrics` API to get active native histogram metric names with statistics about active native histogram buckets. #7982 #7986 #8008 * [FEATURE] Alertmanager: Added `-alertmanager.max-silences-count` and `-alertmanager.max-silence-size-bytes` to set limits on per tenant silences. Disabled by default. #6898 +* [FEATURE] Ingester: add experimental support for the server-side circuit breakers when writing to ingesters via `-ingester.circuit-breaker.enabled`, `-ingester.circuit-breaker.failure-threshold`, or `-ingester.circuit-breaker.cooldown-period` or their corresponding YAML. Added metrics `cortex_ingester_circuit_breaker_results_total` and `cortex_ingester_circuit_breaker_transitions_total`. #8180 * [ENHANCEMENT] Reduced memory allocations in functions used to propagate contextual information between gRPC calls. #7529 * [ENHANCEMENT] Distributor: add experimental limit for exemplars per series per request, enabled with `-distributor.max-exemplars-per-series-per-request`, the number of discarded exemplars are tracked with `cortex_discarded_exemplars_total{reason="too_many_exemplars_per_series_per_request"}` #7989 #8010 * [ENHANCEMENT] Store-gateway: merge series from different blocks concurrently. #7456 diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index fdcd14adfc7..b47db34ed85 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -3140,6 +3140,93 @@ "fieldFlag": "ingester.owned-series-update-interval", "fieldType": "duration", "fieldCategory": "experimental" + }, + { + "kind": "block", + "name": "circuit_breaker", + "required": false, + "desc": "", + "blockEntries": [ + { + "kind": "field", + "name": "enabled", + "required": false, + "desc": "Enable circuit breaking when making requests to ingesters", + "fieldValue": null, + "fieldDefaultValue": false, + "fieldFlag": "ingester.circuit-breaker.enabled", + "fieldType": "boolean", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "failure_threshold", + "required": false, + "desc": "Max percentage of requests that can fail over period before the circuit breaker opens", + "fieldValue": null, + "fieldDefaultValue": 10, + "fieldFlag": "ingester.circuit-breaker.failure-threshold", + "fieldType": "int", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "failure_execution_threshold", + "required": false, + "desc": "How many requests must have been executed in period for the circuit breaker to be eligible to open for the rate of failures", + "fieldValue": null, + "fieldDefaultValue": 100, + "fieldFlag": "ingester.circuit-breaker.failure-execution-threshold", + "fieldType": "int", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "thresholding_period", + "required": false, + "desc": "Moving window of time that the percentage of failed requests is computed over", + "fieldValue": null, + "fieldDefaultValue": 60000000000, + "fieldFlag": "ingester.circuit-breaker.thresholding-period", + "fieldType": "duration", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "cooldown_period", + "required": false, + "desc": "How long the circuit breaker will stay in the open state before allowing some requests", + "fieldValue": null, + "fieldDefaultValue": 10000000000, + "fieldFlag": "ingester.circuit-breaker.cooldown-period", + "fieldType": "duration", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "initial_delay", + "required": false, + "desc": "How long the circuit breaker should wait between creation and starting up. During that time both failures and successes will not be counted.", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "ingester.circuit-breaker.initial-delay", + "fieldType": "duration", + "fieldCategory": "experimental" + }, + { + "kind": "field", + "name": "push_timeout", + "required": false, + "desc": "How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors", + "fieldValue": null, + "fieldDefaultValue": 0, + "fieldFlag": "ingester.circuit-breaker.push-timeout", + "fieldType": "duration", + "fieldCategory": "experiment" + } + ], + "fieldValue": null, + "fieldDefaultValue": null } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 0bbfbd3c8ff..da93d0dc327 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1307,6 +1307,20 @@ Usage of ./cmd/mimir/mimir: After what time a series is considered to be inactive. (default 10m0s) -ingester.active-series-metrics-update-period duration How often to update active series metrics. (default 1m0s) + -ingester.circuit-breaker.cooldown-period duration + [experimental] How long the circuit breaker will stay in the open state before allowing some requests (default 10s) + -ingester.circuit-breaker.enabled + [experimental] Enable circuit breaking when making requests to ingesters + -ingester.circuit-breaker.failure-execution-threshold uint + [experimental] How many requests must have been executed in period for the circuit breaker to be eligible to open for the rate of failures (default 100) + -ingester.circuit-breaker.failure-threshold uint + [experimental] Max percentage of requests that can fail over period before the circuit breaker opens (default 10) + -ingester.circuit-breaker.initial-delay duration + [experimental] How long the circuit breaker should wait between creation and starting up. During that time both failures and successes will not be counted. + -ingester.circuit-breaker.push-timeout duration + How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors + -ingester.circuit-breaker.thresholding-period duration + [experimental] Moving window of time that the percentage of failed requests is computed over (default 1m0s) -ingester.client.backoff-max-period duration Maximum delay when backing off. (default 10s) -ingester.client.backoff-min-period duration diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl index 5812fdc60f6..0309a3518e3 100644 --- a/cmd/mimir/help.txt.tmpl +++ b/cmd/mimir/help.txt.tmpl @@ -389,6 +389,8 @@ Usage of ./cmd/mimir/mimir: Print basic help. -help-all Print help, also including advanced and experimental parameters. + -ingester.circuit-breaker.push-timeout duration + How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors -ingester.max-global-metadata-per-metric int The maximum number of metadata per metric, across the cluster. 0 to disable. -ingester.max-global-metadata-per-user int diff --git a/docs/sources/mimir/configure/about-versioning.md b/docs/sources/mimir/configure/about-versioning.md index 4a8a16b318e..1e07c1fe988 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -117,12 +117,20 @@ The following features are currently experimental: - `-ingester.track-ingester-owned-series` - `-ingester.use-ingester-owned-series-for-limits` - `-ingester.owned-series-update-interval` + - Per-ingester circuit breaking based on requests timing out or hitting per-instance limits + - `-ingester.circuit-breaker.enabled` + - `-ingester.circuit-breaker.failure-threshold` + - `-ingester.circuit-breaker.failure-execution-threshold` + - `-ingester.circuit-breaker.thresholding-period` + - `-ingester.circuit-breaker.cooldown-period` + - `-ingester.circuit-breaker.initial-delay` + - `-ingester.circuit-breaker.push-timeout` - Ingester client - Per-ingester circuit breaking based on requests timing out or hitting per-instance limits - `-ingester.client.circuit-breaker.enabled` - `-ingester.client.circuit-breaker.failure-threshold` - `-ingester.client.circuit-breaker.failure-execution-threshold` - - `-ingester.client.circuit-breaker.period` + - `-ingester.client.circuit-breaker.thresholding-period` - `-ingester.client.circuit-breaker.cooldown-period` - Querier - Use of Redis cache backend (`-blocks-storage.bucket-store.metadata-cache.backend=redis`) diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 24c1d24c84b..423629ccefd 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -1217,6 +1217,44 @@ instance_limits: # owned series as a result of detected change. # CLI flag: -ingester.owned-series-update-interval [owned_series_update_interval: | default = 15s] + +circuit_breaker: + # (experimental) Enable circuit breaking when making requests to ingesters + # CLI flag: -ingester.circuit-breaker.enabled + [enabled: | default = false] + + # (experimental) Max percentage of requests that can fail over period before + # the circuit breaker opens + # CLI flag: -ingester.circuit-breaker.failure-threshold + [failure_threshold: | default = 10] + + # (experimental) How many requests must have been executed in period for the + # circuit breaker to be eligible to open for the rate of failures + # CLI flag: -ingester.circuit-breaker.failure-execution-threshold + [failure_execution_threshold: | default = 100] + + # (experimental) Moving window of time that the percentage of failed requests + # is computed over + # CLI flag: -ingester.circuit-breaker.thresholding-period + [thresholding_period: | default = 1m] + + # (experimental) How long the circuit breaker will stay in the open state + # before allowing some requests + # CLI flag: -ingester.circuit-breaker.cooldown-period + [cooldown_period: | default = 10s] + + # (experimental) How long the circuit breaker should wait between creation and + # starting up. During that time both failures and successes will not be + # counted. + # CLI flag: -ingester.circuit-breaker.initial-delay + [initial_delay: | default = 0s] + + # (experiment) How long is execution of ingester's Push supposed to last + # before it is reported as timeout in a circuit breaker. This configuration is + # used for circuit breakers only, and timeout expirations are not reported as + # errors + # CLI flag: -ingester.circuit-breaker.push-timeout + [push_timeout: | default = 0s] ``` ### querier diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go new file mode 100644 index 00000000000..852dacca54e --- /dev/null +++ b/pkg/ingester/circuitbreaker.go @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package ingester + +import ( + "context" + "flag" + "fmt" + "time" + + "github.com/failsafe-go/failsafe-go" + "github.com/failsafe-go/failsafe-go/circuitbreaker" + "github.com/go-kit/log" + "github.com/go-kit/log/level" + "github.com/grafana/dskit/middleware" + "github.com/pkg/errors" + "google.golang.org/grpc/codes" + + "github.com/grafana/mimir/pkg/mimirpb" +) + +type testCtxKey string + +const ( + resultSuccess = "success" + resultError = "error" + resultOpen = "circuit_breaker_open" + defaultTimeout = 2 * time.Second + testDelayKey testCtxKey = "test-delay" +) + +type CircuitBreakerConfig struct { + Enabled bool `yaml:"enabled" category:"experimental"` + FailureThreshold uint `yaml:"failure_threshold" category:"experimental"` + FailureExecutionThreshold uint `yaml:"failure_execution_threshold" category:"experimental"` + ThresholdingPeriod time.Duration `yaml:"thresholding_period" category:"experimental"` + CooldownPeriod time.Duration `yaml:"cooldown_period" category:"experimental"` + InitialDelay time.Duration `yaml:"initial_delay" category:"experimental"` + PushTimeout time.Duration `yaml:"push_timeout" category:"experiment"` + testModeEnabled bool `yaml:"-"` +} + +func (cfg *CircuitBreakerConfig) RegisterFlags(f *flag.FlagSet) { + prefix := "ingester.circuit-breaker." + f.BoolVar(&cfg.Enabled, prefix+"enabled", false, "Enable circuit breaking when making requests to ingesters") + f.UintVar(&cfg.FailureThreshold, prefix+"failure-threshold", 10, "Max percentage of requests that can fail over period before the circuit breaker opens") + f.UintVar(&cfg.FailureExecutionThreshold, prefix+"failure-execution-threshold", 100, "How many requests must have been executed in period for the circuit breaker to be eligible to open for the rate of failures") + f.DurationVar(&cfg.ThresholdingPeriod, prefix+"thresholding-period", time.Minute, "Moving window of time that the percentage of failed requests is computed over") + f.DurationVar(&cfg.CooldownPeriod, prefix+"cooldown-period", 10*time.Second, "How long the circuit breaker will stay in the open state before allowing some requests") + f.DurationVar(&cfg.InitialDelay, prefix+"initial-delay", 0, "How long the circuit breaker should wait between creation and starting up. During that time both failures and successes will not be counted.") + f.DurationVar(&cfg.PushTimeout, prefix+"push-timeout", 0, "How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors") +} + +func (cfg *CircuitBreakerConfig) Validate() error { + return nil +} + +type circuitBreaker struct { + circuitbreaker.CircuitBreaker[any] + ingester *Ingester + executor failsafe.Executor[any] + startTime time.Time +} + +func newCircuitBreaker(ingester *Ingester) *circuitBreaker { + ingesterID := ingester.cfg.IngesterRing.InstanceID + cfg := ingester.cfg.CircuitBreakerConfig + metrics := ingester.metrics + // Initialize each of the known labels for circuit breaker metrics for this particular ingester. + transitionOpen := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.OpenState.String()) + transitionHalfOpen := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.HalfOpenState.String()) + transitionClosed := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.ClosedState.String()) + countSuccess := metrics.circuitBreakerResults.WithLabelValues(ingesterID, resultSuccess) + countError := metrics.circuitBreakerResults.WithLabelValues(ingesterID, resultError) + + cbBuilder := circuitbreaker.Builder[any](). + WithFailureThreshold(cfg.FailureThreshold). + WithDelay(cfg.CooldownPeriod). + OnFailure(func(failsafe.ExecutionEvent[any]) { + countError.Inc() + }). + OnSuccess(func(failsafe.ExecutionEvent[any]) { + countSuccess.Inc() + }). + OnClose(func(event circuitbreaker.StateChangedEvent) { + transitionClosed.Inc() + level.Info(ingester.logger).Log("msg", "circuit breaker is closed", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) + }). + OnOpen(func(event circuitbreaker.StateChangedEvent) { + transitionOpen.Inc() + level.Info(ingester.logger).Log("msg", "circuit breaker is open", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) + }). + OnHalfOpen(func(event circuitbreaker.StateChangedEvent) { + transitionHalfOpen.Inc() + level.Info(ingester.logger).Log("msg", "circuit breaker is half-open", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) + }). + HandleIf(func(_ any, err error) bool { return isFailure(err) }) + + if cfg.testModeEnabled { + // In case of testing purposes, we initialize the circuit breaker with count based failure thresholding, + // since it is more deterministic, and therefore it is easier to predict the outcome. + cbBuilder = cbBuilder.WithFailureThreshold(cfg.FailureThreshold) + } else { + // In case of production code, we prefer time based failure thresholding. + cbBuilder = cbBuilder.WithFailureRateThreshold(cfg.FailureThreshold, cfg.FailureExecutionThreshold, cfg.ThresholdingPeriod) + } + + cb := cbBuilder.Build() + return &circuitBreaker{ + CircuitBreaker: cb, + ingester: ingester, + executor: failsafe.NewExecutor[any](cb), + startTime: time.Now().Add(cfg.InitialDelay), + } +} + +func isFailure(err error) bool { + if err == nil { + return false + } + + // We only consider timeouts or ingester hitting a per-instance limit + // to be errors worthy of tripping the circuit breaker since these + // are specific to a particular ingester, not a user or request. + + if errors.Is(err, context.DeadlineExceeded) { + return true + } + + var ingesterErr ingesterError + if errors.As(err, &ingesterErr) { + return ingesterErr.errorCause() == mimirpb.INSTANCE_LIMIT + } + + return false +} + +func (cb *circuitBreaker) isActive() bool { + return cb.startTime.Before(time.Now()) +} + +func (cb *circuitBreaker) ingesterID() string { + return cb.ingester.cfg.IngesterRing.InstanceID +} + +func (cb *circuitBreaker) logger() log.Logger { + return cb.ingester.logger +} + +func (cb *circuitBreaker) metrics() *ingesterMetrics { + return cb.ingester.metrics +} + +func (cb *circuitBreaker) config() CircuitBreakerConfig { + return cb.ingester.cfg.CircuitBreakerConfig +} + +func (cb *circuitBreaker) get(ctx context.Context, callbackFnName string, callbackFn func() (any, error)) (any, error) { + res, err := cb.executor.Get(callbackFn) + if err != nil && errors.Is(err, circuitbreaker.ErrOpen) { + cb.metrics().circuitBreakerResults.WithLabelValues(cb.ingesterID(), resultOpen).Inc() + cbOpenErr := middleware.DoNotLogError{Err: newCircuitBreakerOpenError(cb.RemainingDelay())} + return res, newErrorWithStatus(cbOpenErr, codes.Unavailable) + } + return res, cb.processError(ctx, err, callbackFnName) +} + +func (cb *circuitBreaker) processError(ctx context.Context, err error, callbackFnName string) error { + if err == nil { + return nil + } + if errors.Is(err, ctx.Err()) { + level.Error(cb.logger()).Log("msg", fmt.Sprintf("callback function %s completed with an error found in the context", callbackFnName), "ingester", cb.ingesterID(), "ctxErr", ctx.Err()) + + // ctx.Err() was registered with the circuit breaker's executor, but we don't propagate it + return nil + } + + level.Error(cb.logger()).Log("msg", fmt.Sprintf("callback function %s completed with an error", callbackFnName), "ingester", cb.ingesterID(), "err", err) + return err +} + +func (cb *circuitBreaker) contextWithTimeout(parent context.Context, timeout time.Duration) (context.Context, context.CancelFunc) { + if timeout == 0 { + timeout = defaultTimeout + } + ctx, cancel := context.WithTimeout(context.WithoutCancel(parent), timeout) + if cb.config().testModeEnabled { + if initialDelay, ok := parent.Value(testDelayKey).(string); ok { + if d, err := time.ParseDuration(initialDelay); err == nil { + time.Sleep(d) + } + } + } + return ctx, cancel +} + +func (cb *circuitBreaker) StartPushRequest(ctx context.Context, reqSize int64) (context.Context, error) { + callbackCtx, callbackErr := cb.get(ctx, "ingester.startPushRequest", func() (any, error) { + callbackCtx, _, callbackErr := cb.ingester.startPushRequest(ctx, reqSize) + return callbackCtx, callbackErr + }) + if callbackErr == nil { + return callbackCtx.(context.Context), nil + } + return nil, callbackErr +} + +func (cb *circuitBreaker) Push(parent context.Context, req *mimirpb.WriteRequest) (*mimirpb.WriteResponse, error) { + ctx, cancel := cb.contextWithTimeout(parent, cb.config().PushTimeout) + defer cancel() + + callbackResult, callbackErr := cb.get(ctx, "ingester.push", func() (any, error) { + callbackResult, callbackErr := cb.ingester.push(ctx, req) + if callbackErr != nil { + return callbackResult, callbackErr + } + + // We return ctx.Err() in order to register it with the circuit breaker's executor. + return callbackResult, ctx.Err() + }) + + if callbackResult == nil { + return nil, callbackErr + } + return callbackResult.(*mimirpb.WriteResponse), callbackErr +} diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go new file mode 100644 index 00000000000..aec3b3acdf3 --- /dev/null +++ b/pkg/ingester/circuitbreaker_test.go @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: AGPL-3.0-only + +package ingester + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + "github.com/grafana/dskit/grpcutil" + "github.com/grafana/dskit/middleware" + "github.com/grafana/dskit/services" + "github.com/grafana/dskit/test" + "github.com/grafana/dskit/user" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/grpc/codes" + + "github.com/grafana/mimir/pkg/mimirpb" +) + +func TestIngester_Push_CircuitBreaker(t *testing.T) { + pushTimeout := 100 * time.Millisecond + tests := map[string]struct { + expectedErrorWhenCircuitBreakerClosed error + ctx func(context.Context) context.Context + limits InstanceLimits + }{ + "deadline exceeded": { + expectedErrorWhenCircuitBreakerClosed: nil, + limits: InstanceLimits{MaxInMemoryTenants: 3}, + ctx: func(ctx context.Context) context.Context { + return context.WithValue(ctx, testDelayKey, (2 * pushTimeout).String()) + }, + }, + "instance limit hit": { + expectedErrorWhenCircuitBreakerClosed: instanceLimitReachedError{}, + limits: InstanceLimits{MaxInMemoryTenants: 1}, + }, + } + + for initialDelayEnabled, initialDelayStatus := range map[bool]string{false: "disabled", true: "enabled"} { + for testName, testCase := range tests { + t.Run(fmt.Sprintf("%s with initial delay %s", testName, initialDelayStatus), func(t *testing.T) { + metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} + metricNames := []string{ + "cortex_ingester_circuit_breaker_results_total", + "cortex_ingester_circuit_breaker_transitions_total", + } + + registry := prometheus.NewRegistry() + + // Create a mocked ingester + cfg := defaultIngesterTestConfig(t) + cfg.ActiveSeriesMetrics.IdleTimeout = 100 * time.Millisecond + cfg.InstanceLimitsFn = func() *InstanceLimits { + return &testCase.limits + } + failureThreshold := 2 + var initialDelay time.Duration + if initialDelayEnabled { + initialDelay = 200 * time.Millisecond + } + cfg.CircuitBreakerConfig = CircuitBreakerConfig{ + Enabled: true, + FailureThreshold: uint(failureThreshold), + CooldownPeriod: 10 * time.Second, + InitialDelay: initialDelay, + PushTimeout: pushTimeout, + testModeEnabled: true, + } + + i, err := prepareIngesterWithBlocksStorage(t, cfg, nil, registry) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), i)) + defer services.StopAndAwaitTerminated(context.Background(), i) //nolint:errcheck + + // Wait until the ingester is healthy + test.Poll(t, 100*time.Millisecond, 1, func() interface{} { + return i.lifecycler.HealthyInstancesCount() + }) + + // the first request is successful + ctx := user.InjectOrgID(context.Background(), "test-0") + req := mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 1, TimestampMs: 8}}, + nil, + nil, + mimirpb.API, + ) + _, err = i.Push(ctx, req) + require.NoError(t, err) + + count := 0 + + // Push timeseries for each user + for _, userID := range []string{"test-1", "test-2"} { + reqs := []*mimirpb.WriteRequest{ + mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 1, TimestampMs: 9}}, + nil, + nil, + mimirpb.API, + ), + mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 2, TimestampMs: 10}}, + nil, + nil, + mimirpb.API, + ), + } + + for _, req := range reqs { + ctx := user.InjectOrgID(context.Background(), userID) + count++ + if testCase.ctx != nil { + ctx = testCase.ctx(ctx) + } + _, err = i.Push(ctx, req) + if initialDelayEnabled { + if testCase.expectedErrorWhenCircuitBreakerClosed != nil { + require.ErrorAs(t, err, &testCase.expectedErrorWhenCircuitBreakerClosed) + } else { + require.NoError(t, err) + } + } else { + if count <= failureThreshold { + if testCase.expectedErrorWhenCircuitBreakerClosed != nil { + require.ErrorAs(t, err, &testCase.expectedErrorWhenCircuitBreakerClosed) + } + } else { + checkCircuitBreakerOpenErr(ctx, err, t) + } + } + } + } + + // Check tracked Prometheus metrics + var expectedMetrics string + if initialDelayEnabled { + expectedMetrics = ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 0 + cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="success"} 0 + # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state + # TYPE cortex_ingester_circuit_breaker_transitions_total counter + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 0 + ` + } else { + expectedMetrics = ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="circuit_breaker_open"} 2 + cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 2 + cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="success"} 1 + # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state + # TYPE cortex_ingester_circuit_breaker_transitions_total counter + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 1 + ` + } + assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), metricNames...)) + }) + } + } +} + +func checkCircuitBreakerOpenErr(ctx context.Context, err error, t *testing.T) { + var cbOpenErr circuitBreakerOpenError + require.ErrorAs(t, err, &cbOpenErr) + + var optional middleware.OptionalLogging + require.ErrorAs(t, err, &optional) + + shouldLog, _ := optional.ShouldLog(ctx) + require.False(t, shouldLog, "expected not to log via .ShouldLog()") + + s, ok := grpcutil.ErrorToStatus(err) + require.True(t, ok, "expected to be able to convert to gRPC status") + require.Equal(t, codes.Unavailable, s.Code()) +} diff --git a/pkg/ingester/errors.go b/pkg/ingester/errors.go index 56186be67ae..c39b00d0c2a 100644 --- a/pkg/ingester/errors.go +++ b/pkg/ingester/errors.go @@ -11,6 +11,7 @@ import ( "net/http" "time" + "github.com/failsafe-go/failsafe-go/circuitbreaker" "github.com/grafana/dskit/grpcutil" "github.com/grafana/dskit/httpgrpc" "github.com/grafana/dskit/middleware" @@ -489,6 +490,24 @@ func (e ingesterPushGrpcDisabledError) errorCause() mimirpb.ErrorCause { // Ensure that ingesterPushGrpcDisabledError is an ingesterError. var _ ingesterError = ingesterPushGrpcDisabledError{} +type circuitBreakerOpenError struct { + remainingDelay time.Duration +} + +func newCircuitBreakerOpenError(remainingDelay time.Duration) circuitBreakerOpenError { + return circuitBreakerOpenError{remainingDelay: remainingDelay} +} + +func (e circuitBreakerOpenError) Error() string { + return fmt.Sprintf("%s with remaining delay %s", circuitbreaker.ErrOpen.Error(), e.remainingDelay.String()) +} + +func (e circuitBreakerOpenError) errorCause() mimirpb.ErrorCause { + return mimirpb.CIRCUIT_BREAKER_OPEN +} + +var _ ingesterError = circuitBreakerOpenError{} + type ingesterErrSamplers struct { sampleTimestampTooOld *log.Sampler sampleTimestampTooOldOOOEnabled *log.Sampler diff --git a/pkg/ingester/errors_test.go b/pkg/ingester/errors_test.go index 6029e2e2734..b56f565ee75 100644 --- a/pkg/ingester/errors_test.go +++ b/pkg/ingester/errors_test.go @@ -246,6 +246,24 @@ func TestTooBusyError(t *testing.T) { checkIngesterError(t, wrappedErr, mimirpb.TOO_BUSY, false) } +func TestNewCircuitBreakerOpenError(t *testing.T) { + remainingDelay := 1 * time.Second + expectedMsg := fmt.Sprintf("circuit breaker open with remaining delay %s", remainingDelay.String()) + err := newCircuitBreakerOpenError(remainingDelay) + require.Error(t, err) + require.EqualError(t, err, expectedMsg) + checkIngesterError(t, err, mimirpb.CIRCUIT_BREAKER_OPEN, false) + + wrappedErr := fmt.Errorf("wrapped: %w", err) + require.ErrorIs(t, wrappedErr, err) + require.ErrorAs(t, wrappedErr, &circuitBreakerOpenError{}) + + wrappedWithUserErr := wrapOrAnnotateWithUser(err, userID) + require.ErrorIs(t, wrappedWithUserErr, err) + require.ErrorAs(t, wrappedWithUserErr, &circuitBreakerOpenError{}) + checkIngesterError(t, wrappedErr, mimirpb.CIRCUIT_BREAKER_OPEN, false) +} + func TestNewErrorWithStatus(t *testing.T) { errMsg := "this is an error" ingesterErr := mockIngesterErr(errMsg) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 240bee2cc6d..0f0c868146e 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -215,6 +215,8 @@ type Config struct { // This config can be overridden in tests. limitMetricsUpdatePeriod time.Duration `yaml:"-"` + + CircuitBreakerConfig CircuitBreakerConfig `yaml:"circuit_breaker" category:"experimental"` } // RegisterFlags adds the flags required to config this to the given FlagSet @@ -223,6 +225,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet, logger log.Logger) { cfg.IngesterPartitionRing.RegisterFlags(f) cfg.DefaultLimits.RegisterFlags(f) cfg.ActiveSeriesMetrics.RegisterFlags(f) + cfg.CircuitBreakerConfig.RegisterFlags(f) f.DurationVar(&cfg.MetadataRetainPeriod, "ingester.metadata-retain-period", 10*time.Minute, "Period at which metadata we have not seen will remain in memory before being deleted.") f.DurationVar(&cfg.RateUpdatePeriod, "ingester.rate-update-period", 15*time.Second, "Period with which to update the per-tenant ingestion rates.") @@ -258,7 +261,13 @@ func (cfg *Config) Validate(logger log.Logger) error { util.WarnDeprecatedConfig(deprecatedReturnOnlyGRPCErrorsFlag, logger) } - return cfg.IngesterRing.Validate() + err := cfg.IngesterRing.Validate() + + if err == nil { + return cfg.CircuitBreakerConfig.Validate() + } + + return err } func (cfg *Config) getIgnoreSeriesLimitForMetricNamesMap() map[string]struct{} { @@ -349,6 +358,8 @@ type Ingester struct { ingestReader *ingest.PartitionReader ingestPartitionID int32 ingestPartitionLifecycler *ring.PartitionInstanceLifecycler + + circuitBreaker *circuitBreaker } func newIngester(cfg Config, limits *validation.Overrides, registerer prometheus.Registerer, logger log.Logger) (*Ingester, error) { @@ -393,6 +404,10 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService + if cfg.CircuitBreakerConfig.Enabled { + i.circuitBreaker = newCircuitBreaker(i) + } + if registerer != nil { promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{ Name: "cortex_ingester_oldest_unshipped_block_timestamp_seconds", @@ -960,6 +975,9 @@ type pushRequestState struct { // StartPushRequest checks if ingester can start push request, and increments relevant counters. // If new push request cannot be started, errors convertible to gRPC status code are returned, and metrics are updated. func (i *Ingester) StartPushRequest(ctx context.Context, reqSize int64) (context.Context, error) { + if i.isCircuitBreakerActive() { + return i.circuitBreaker.StartPushRequest(ctx, reqSize) + } ctx, _, err := i.startPushRequest(ctx, reqSize) return ctx, err } @@ -3749,8 +3767,22 @@ func (i *Ingester) PushToStorage(ctx context.Context, req *mimirpb.WriteRequest) return nil } +func (i *Ingester) isCircuitBreakerActive() bool { + if i.circuitBreaker == nil { + return false + } + return i.circuitBreaker.isActive() +} + // Push implements client.IngesterServer, which is registered into gRPC server. func (i *Ingester) Push(ctx context.Context, req *mimirpb.WriteRequest) (*mimirpb.WriteResponse, error) { + if i.isCircuitBreakerActive() { + return i.circuitBreaker.Push(ctx, req) + } + return i.push(ctx, req) +} + +func (i *Ingester) push(ctx context.Context, req *mimirpb.WriteRequest) (*mimirpb.WriteResponse, error) { if !i.cfg.PushGrpcMethodEnabled { return nil, errPushGrpcDisabled } diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 833a802fb04..d756d4c86e9 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -83,6 +83,9 @@ type ingesterMetrics struct { // Count number of requests rejected due to utilization based limiting. utilizationLimitedRequests *prometheus.CounterVec + + circuitBreakerTransitions *prometheus.CounterVec + circuitBreakerResults *prometheus.CounterVec } func newIngesterMetrics( @@ -375,6 +378,15 @@ func newIngesterMetrics( Name: "cortex_ingester_prepare_shutdown_requested", Help: "If the ingester has been requested to prepare for shutdown via endpoint or marker file.", }), + + circuitBreakerTransitions: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ingester_circuit_breaker_transitions_total", + Help: "Number of times the circuit breaker has entered a state", + }, []string{"ingester", "state"}), + circuitBreakerResults: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ingester_circuit_breaker_results_total", + Help: "Results of executing requests via the circuit breaker", + }, []string{"ingester", "result"}), } // Initialize expected rejected request labels From 3a8279df2765df220509ac7dfe544c1a45e2fc6b Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Mon, 27 May 2024 20:48:50 +0200 Subject: [PATCH 02/16] Fixing review findings Signed-off-by: Yuri Nikolic --- pkg/ingester/circuitbreaker.go | 123 +++++++++++++++++----------- pkg/ingester/circuitbreaker_test.go | 21 +++-- pkg/ingester/ingester.go | 23 ++---- pkg/ingester/metrics.go | 12 --- 4 files changed, 99 insertions(+), 80 deletions(-) diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index 852dacca54e..a87c79e078f 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -5,7 +5,6 @@ package ingester import ( "context" "flag" - "fmt" "time" "github.com/failsafe-go/failsafe-go" @@ -14,6 +13,8 @@ import ( "github.com/go-kit/log/level" "github.com/grafana/dskit/middleware" "github.com/pkg/errors" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" "google.golang.org/grpc/codes" "github.com/grafana/mimir/pkg/mimirpb" @@ -29,6 +30,34 @@ const ( testDelayKey testCtxKey = "test-delay" ) +type circuitBreakerMetrics struct { + circuitBreakerCurrentState *prometheus.GaugeVec + + circuitBreakerTransitions *prometheus.CounterVec + circuitBreakerResults *prometheus.CounterVec +} + +type startPushRequestFn func(context.Context, int64) (context.Context, bool, error) + +type pushRequestFn func(ctx context.Context, req *mimirpb.WriteRequest) (*mimirpb.WriteResponse, error) + +func newCircuitBreakerMetrics(r prometheus.Registerer) *circuitBreakerMetrics { + return &circuitBreakerMetrics{ + circuitBreakerCurrentState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{ + Name: "cortex_ingester_circuit_breaker_current_state", + Help: "Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name.", + }, []string{"state"}), + circuitBreakerTransitions: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ingester_circuit_breaker_transitions_total", + Help: "Number of times the circuit breaker has entered a state.", + }, []string{"ingester", "state"}), + circuitBreakerResults: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ + Name: "cortex_ingester_circuit_breaker_results_total", + Help: "Results of executing requests via the circuit breaker.", + }, []string{"ingester", "result"}), + } +} + type CircuitBreakerConfig struct { Enabled bool `yaml:"enabled" category:"experimental"` FailureThreshold uint `yaml:"failure_threshold" category:"experimental"` @@ -56,22 +85,26 @@ func (cfg *CircuitBreakerConfig) Validate() error { } type circuitBreaker struct { + cfg CircuitBreakerConfig circuitbreaker.CircuitBreaker[any] - ingester *Ingester - executor failsafe.Executor[any] - startTime time.Time + executor failsafe.Executor[any] + ingesterID string + logger log.Logger + metrics *circuitBreakerMetrics + startTime time.Time } -func newCircuitBreaker(ingester *Ingester) *circuitBreaker { - ingesterID := ingester.cfg.IngesterRing.InstanceID - cfg := ingester.cfg.CircuitBreakerConfig - metrics := ingester.metrics +func newCircuitBreaker(cfg CircuitBreakerConfig, ingesterID string, logger log.Logger, registerer prometheus.Registerer) *circuitBreaker { + metrics := newCircuitBreakerMetrics(registerer) // Initialize each of the known labels for circuit breaker metrics for this particular ingester. transitionOpen := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.OpenState.String()) transitionHalfOpen := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.HalfOpenState.String()) transitionClosed := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.ClosedState.String()) countSuccess := metrics.circuitBreakerResults.WithLabelValues(ingesterID, resultSuccess) countError := metrics.circuitBreakerResults.WithLabelValues(ingesterID, resultError) + gaugeOpen := metrics.circuitBreakerCurrentState.WithLabelValues(circuitbreaker.OpenState.String()) + gaugeHalfOpen := metrics.circuitBreakerCurrentState.WithLabelValues(circuitbreaker.HalfOpenState.String()) + gaugeClosed := metrics.circuitBreakerCurrentState.WithLabelValues(circuitbreaker.ClosedState.String()) cbBuilder := circuitbreaker.Builder[any](). WithFailureThreshold(cfg.FailureThreshold). @@ -84,15 +117,24 @@ func newCircuitBreaker(ingester *Ingester) *circuitBreaker { }). OnClose(func(event circuitbreaker.StateChangedEvent) { transitionClosed.Inc() - level.Info(ingester.logger).Log("msg", "circuit breaker is closed", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) + gaugeOpen.Set(0) + gaugeHalfOpen.Set(0) + gaugeClosed.Set(1) + level.Info(logger).Log("msg", "circuit breaker is closed", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) }). OnOpen(func(event circuitbreaker.StateChangedEvent) { transitionOpen.Inc() - level.Info(ingester.logger).Log("msg", "circuit breaker is open", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) + gaugeOpen.Set(1) + gaugeHalfOpen.Set(0) + gaugeClosed.Set(0) + level.Warn(logger).Log("msg", "circuit breaker is open", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) }). OnHalfOpen(func(event circuitbreaker.StateChangedEvent) { transitionHalfOpen.Inc() - level.Info(ingester.logger).Log("msg", "circuit breaker is half-open", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) + gaugeOpen.Set(0) + gaugeHalfOpen.Set(1) + gaugeClosed.Set(0) + level.Info(logger).Log("msg", "circuit breaker is half-open", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) }). HandleIf(func(_ any, err error) bool { return isFailure(err) }) @@ -107,9 +149,12 @@ func newCircuitBreaker(ingester *Ingester) *circuitBreaker { cb := cbBuilder.Build() return &circuitBreaker{ + cfg: cfg, CircuitBreaker: cb, - ingester: ingester, executor: failsafe.NewExecutor[any](cb), + ingesterID: ingesterID, + logger: logger, + metrics: metrics, startTime: time.Now().Add(cfg.InitialDelay), } } @@ -136,47 +181,31 @@ func isFailure(err error) bool { } func (cb *circuitBreaker) isActive() bool { + if cb == nil { + return false + } return cb.startTime.Before(time.Now()) } -func (cb *circuitBreaker) ingesterID() string { - return cb.ingester.cfg.IngesterRing.InstanceID -} - -func (cb *circuitBreaker) logger() log.Logger { - return cb.ingester.logger -} - -func (cb *circuitBreaker) metrics() *ingesterMetrics { - return cb.ingester.metrics -} - -func (cb *circuitBreaker) config() CircuitBreakerConfig { - return cb.ingester.cfg.CircuitBreakerConfig -} - -func (cb *circuitBreaker) get(ctx context.Context, callbackFnName string, callbackFn func() (any, error)) (any, error) { - res, err := cb.executor.Get(callbackFn) +func (cb *circuitBreaker) get(ctx context.Context, op func() (any, error)) (any, error) { + res, err := cb.executor.Get(op) if err != nil && errors.Is(err, circuitbreaker.ErrOpen) { - cb.metrics().circuitBreakerResults.WithLabelValues(cb.ingesterID(), resultOpen).Inc() + cb.metrics.circuitBreakerResults.WithLabelValues(cb.ingesterID, resultOpen).Inc() cbOpenErr := middleware.DoNotLogError{Err: newCircuitBreakerOpenError(cb.RemainingDelay())} return res, newErrorWithStatus(cbOpenErr, codes.Unavailable) } - return res, cb.processError(ctx, err, callbackFnName) + return res, cb.processError(ctx, err) } -func (cb *circuitBreaker) processError(ctx context.Context, err error, callbackFnName string) error { +func (cb *circuitBreaker) processError(ctx context.Context, err error) error { if err == nil { return nil } if errors.Is(err, ctx.Err()) { - level.Error(cb.logger()).Log("msg", fmt.Sprintf("callback function %s completed with an error found in the context", callbackFnName), "ingester", cb.ingesterID(), "ctxErr", ctx.Err()) - // ctx.Err() was registered with the circuit breaker's executor, but we don't propagate it return nil } - level.Error(cb.logger()).Log("msg", fmt.Sprintf("callback function %s completed with an error", callbackFnName), "ingester", cb.ingesterID(), "err", err) return err } @@ -185,19 +214,17 @@ func (cb *circuitBreaker) contextWithTimeout(parent context.Context, timeout tim timeout = defaultTimeout } ctx, cancel := context.WithTimeout(context.WithoutCancel(parent), timeout) - if cb.config().testModeEnabled { - if initialDelay, ok := parent.Value(testDelayKey).(string); ok { - if d, err := time.ParseDuration(initialDelay); err == nil { - time.Sleep(d) - } + if cb.cfg.testModeEnabled { + if initialDelay, ok := parent.Value(testDelayKey).(time.Duration); ok { + time.Sleep(initialDelay) } } return ctx, cancel } -func (cb *circuitBreaker) StartPushRequest(ctx context.Context, reqSize int64) (context.Context, error) { - callbackCtx, callbackErr := cb.get(ctx, "ingester.startPushRequest", func() (any, error) { - callbackCtx, _, callbackErr := cb.ingester.startPushRequest(ctx, reqSize) +func (cb *circuitBreaker) StartPushRequest(ctx context.Context, reqSize int64, startPushRequest startPushRequestFn) (context.Context, error) { + callbackCtx, callbackErr := cb.get(ctx, func() (any, error) { + callbackCtx, _, callbackErr := startPushRequest(ctx, reqSize) return callbackCtx, callbackErr }) if callbackErr == nil { @@ -206,12 +233,12 @@ func (cb *circuitBreaker) StartPushRequest(ctx context.Context, reqSize int64) ( return nil, callbackErr } -func (cb *circuitBreaker) Push(parent context.Context, req *mimirpb.WriteRequest) (*mimirpb.WriteResponse, error) { - ctx, cancel := cb.contextWithTimeout(parent, cb.config().PushTimeout) +func (cb *circuitBreaker) Push(parent context.Context, req *mimirpb.WriteRequest, push pushRequestFn) (*mimirpb.WriteResponse, error) { + ctx, cancel := cb.contextWithTimeout(parent, cb.cfg.PushTimeout) defer cancel() - callbackResult, callbackErr := cb.get(ctx, "ingester.push", func() (any, error) { - callbackResult, callbackErr := cb.ingester.push(ctx, req) + callbackResult, callbackErr := cb.get(ctx, func() (any, error) { + callbackResult, callbackErr := push(ctx, req) if callbackErr != nil { return callbackResult, callbackErr } diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go index aec3b3acdf3..58cf861ce93 100644 --- a/pkg/ingester/circuitbreaker_test.go +++ b/pkg/ingester/circuitbreaker_test.go @@ -35,7 +35,7 @@ func TestIngester_Push_CircuitBreaker(t *testing.T) { expectedErrorWhenCircuitBreakerClosed: nil, limits: InstanceLimits{MaxInMemoryTenants: 3}, ctx: func(ctx context.Context) context.Context { - return context.WithValue(ctx, testDelayKey, (2 * pushTimeout).String()) + return context.WithValue(ctx, testDelayKey, 2*pushTimeout) }, }, "instance limit hit": { @@ -51,6 +51,7 @@ func TestIngester_Push_CircuitBreaker(t *testing.T) { metricNames := []string{ "cortex_ingester_circuit_breaker_results_total", "cortex_ingester_circuit_breaker_transitions_total", + "cortex_ingester_circuit_breaker_current_state", } registry := prometheus.NewRegistry() @@ -147,28 +148,38 @@ func TestIngester_Push_CircuitBreaker(t *testing.T) { var expectedMetrics string if initialDelayEnabled { expectedMetrics = ` - # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 0 cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="success"} 0 - # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state + # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. # TYPE cortex_ingester_circuit_breaker_transitions_total counter cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 0 + # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. + # TYPE cortex_ingester_circuit_breaker_current_state gauge + cortex_ingester_circuit_breaker_current_state{state="open"} 0 + cortex_ingester_circuit_breaker_current_state{state="half-open"} 0 + cortex_ingester_circuit_breaker_current_state{state="closed"} 0 ` } else { expectedMetrics = ` - # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="circuit_breaker_open"} 2 cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 2 cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="success"} 1 - # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state + # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. # TYPE cortex_ingester_circuit_breaker_transitions_total counter cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 1 + # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. + # TYPE cortex_ingester_circuit_breaker_current_state gauge + cortex_ingester_circuit_breaker_current_state{state="open"} 1 + cortex_ingester_circuit_breaker_current_state{state="half-open"} 0 + cortex_ingester_circuit_breaker_current_state{state="closed"} 0 ` } assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), metricNames...)) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 0f0c868146e..cbbab427d7a 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -263,11 +263,11 @@ func (cfg *Config) Validate(logger log.Logger) error { err := cfg.IngesterRing.Validate() - if err == nil { - return cfg.CircuitBreakerConfig.Validate() + if err != nil { + return err } - return err + return cfg.CircuitBreakerConfig.Validate() } func (cfg *Config) getIgnoreSeriesLimitForMetricNamesMap() map[string]struct{} { @@ -405,7 +405,7 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.activeGroups = activeGroupsCleanupService if cfg.CircuitBreakerConfig.Enabled { - i.circuitBreaker = newCircuitBreaker(i) + i.circuitBreaker = newCircuitBreaker(cfg.CircuitBreakerConfig, cfg.IngesterRing.InstanceID, logger, registerer) } if registerer != nil { @@ -975,8 +975,8 @@ type pushRequestState struct { // StartPushRequest checks if ingester can start push request, and increments relevant counters. // If new push request cannot be started, errors convertible to gRPC status code are returned, and metrics are updated. func (i *Ingester) StartPushRequest(ctx context.Context, reqSize int64) (context.Context, error) { - if i.isCircuitBreakerActive() { - return i.circuitBreaker.StartPushRequest(ctx, reqSize) + if i.circuitBreaker.isActive() { + return i.circuitBreaker.StartPushRequest(ctx, reqSize, i.startPushRequest) } ctx, _, err := i.startPushRequest(ctx, reqSize) return ctx, err @@ -3767,17 +3767,10 @@ func (i *Ingester) PushToStorage(ctx context.Context, req *mimirpb.WriteRequest) return nil } -func (i *Ingester) isCircuitBreakerActive() bool { - if i.circuitBreaker == nil { - return false - } - return i.circuitBreaker.isActive() -} - // Push implements client.IngesterServer, which is registered into gRPC server. func (i *Ingester) Push(ctx context.Context, req *mimirpb.WriteRequest) (*mimirpb.WriteResponse, error) { - if i.isCircuitBreakerActive() { - return i.circuitBreaker.Push(ctx, req) + if i.circuitBreaker.isActive() { + return i.circuitBreaker.Push(ctx, req, i.push) } return i.push(ctx, req) } diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index d756d4c86e9..833a802fb04 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -83,9 +83,6 @@ type ingesterMetrics struct { // Count number of requests rejected due to utilization based limiting. utilizationLimitedRequests *prometheus.CounterVec - - circuitBreakerTransitions *prometheus.CounterVec - circuitBreakerResults *prometheus.CounterVec } func newIngesterMetrics( @@ -378,15 +375,6 @@ func newIngesterMetrics( Name: "cortex_ingester_prepare_shutdown_requested", Help: "If the ingester has been requested to prepare for shutdown via endpoint or marker file.", }), - - circuitBreakerTransitions: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_ingester_circuit_breaker_transitions_total", - Help: "Number of times the circuit breaker has entered a state", - }, []string{"ingester", "state"}), - circuitBreakerResults: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_ingester_circuit_breaker_results_total", - Help: "Results of executing requests via the circuit breaker", - }, []string{"ingester", "result"}), } // Initialize expected rejected request labels From 186bc2b4e091830eb9828e29e9b60a67055716b7 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Tue, 28 May 2024 16:36:13 +0200 Subject: [PATCH 03/16] Fixing review findigs Signed-off-by: Yuri Nikolic --- pkg/ingester/circuitbreaker.go | 106 +++++++++------------------- pkg/ingester/circuitbreaker_test.go | 36 ++++------ pkg/ingester/ingester.go | 26 ++++--- 3 files changed, 63 insertions(+), 105 deletions(-) diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index a87c79e078f..ef24aa309d4 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -7,15 +7,12 @@ import ( "flag" "time" - "github.com/failsafe-go/failsafe-go" "github.com/failsafe-go/failsafe-go/circuitbreaker" "github.com/go-kit/log" "github.com/go-kit/log/level" - "github.com/grafana/dskit/middleware" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" - "google.golang.org/grpc/codes" "github.com/grafana/mimir/pkg/mimirpb" ) @@ -23,11 +20,10 @@ import ( type testCtxKey string const ( - resultSuccess = "success" - resultError = "error" - resultOpen = "circuit_breaker_open" - defaultTimeout = 2 * time.Second - testDelayKey testCtxKey = "test-delay" + resultSuccess = "success" + resultError = "error" + resultOpen = "circuit_breaker_open" + testDelayKey testCtxKey = "test-delay" ) type circuitBreakerMetrics struct { @@ -37,10 +33,6 @@ type circuitBreakerMetrics struct { circuitBreakerResults *prometheus.CounterVec } -type startPushRequestFn func(context.Context, int64) (context.Context, bool, error) - -type pushRequestFn func(ctx context.Context, req *mimirpb.WriteRequest) (*mimirpb.WriteResponse, error) - func newCircuitBreakerMetrics(r prometheus.Registerer) *circuitBreakerMetrics { return &circuitBreakerMetrics{ circuitBreakerCurrentState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{ @@ -85,9 +77,8 @@ func (cfg *CircuitBreakerConfig) Validate() error { } type circuitBreaker struct { - cfg CircuitBreakerConfig circuitbreaker.CircuitBreaker[any] - executor failsafe.Executor[any] + cfg CircuitBreakerConfig ingesterID string logger log.Logger metrics *circuitBreakerMetrics @@ -100,8 +91,6 @@ func newCircuitBreaker(cfg CircuitBreakerConfig, ingesterID string, logger log.L transitionOpen := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.OpenState.String()) transitionHalfOpen := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.HalfOpenState.String()) transitionClosed := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.ClosedState.String()) - countSuccess := metrics.circuitBreakerResults.WithLabelValues(ingesterID, resultSuccess) - countError := metrics.circuitBreakerResults.WithLabelValues(ingesterID, resultError) gaugeOpen := metrics.circuitBreakerCurrentState.WithLabelValues(circuitbreaker.OpenState.String()) gaugeHalfOpen := metrics.circuitBreakerCurrentState.WithLabelValues(circuitbreaker.HalfOpenState.String()) gaugeClosed := metrics.circuitBreakerCurrentState.WithLabelValues(circuitbreaker.ClosedState.String()) @@ -109,12 +98,6 @@ func newCircuitBreaker(cfg CircuitBreakerConfig, ingesterID string, logger log.L cbBuilder := circuitbreaker.Builder[any](). WithFailureThreshold(cfg.FailureThreshold). WithDelay(cfg.CooldownPeriod). - OnFailure(func(failsafe.ExecutionEvent[any]) { - countError.Inc() - }). - OnSuccess(func(failsafe.ExecutionEvent[any]) { - countSuccess.Inc() - }). OnClose(func(event circuitbreaker.StateChangedEvent) { transitionClosed.Inc() gaugeOpen.Set(0) @@ -151,7 +134,6 @@ func newCircuitBreaker(cfg CircuitBreakerConfig, ingesterID string, logger log.L return &circuitBreaker{ cfg: cfg, CircuitBreaker: cb, - executor: failsafe.NewExecutor[any](cb), ingesterID: ingesterID, logger: logger, metrics: metrics, @@ -187,68 +169,48 @@ func (cb *circuitBreaker) isActive() bool { return cb.startTime.Before(time.Now()) } -func (cb *circuitBreaker) get(ctx context.Context, op func() (any, error)) (any, error) { - res, err := cb.executor.Get(op) - if err != nil && errors.Is(err, circuitbreaker.ErrOpen) { +func (cb *circuitBreaker) tryAcquirePermit() error { + if !cb.isActive() { + return nil + } + if !cb.CircuitBreaker.TryAcquirePermit() { cb.metrics.circuitBreakerResults.WithLabelValues(cb.ingesterID, resultOpen).Inc() - cbOpenErr := middleware.DoNotLogError{Err: newCircuitBreakerOpenError(cb.RemainingDelay())} - return res, newErrorWithStatus(cbOpenErr, codes.Unavailable) + return newCircuitBreakerOpenError(cb.RemainingDelay()) } - return res, cb.processError(ctx, err) + return nil } -func (cb *circuitBreaker) processError(ctx context.Context, err error) error { - if err == nil { - return nil - } - if errors.Is(err, ctx.Err()) { - // ctx.Err() was registered with the circuit breaker's executor, but we don't propagate it - return nil +func (cb *circuitBreaker) recordSuccess() { + if !cb.isActive() { + return } + cb.CircuitBreaker.RecordSuccess() + cb.metrics.circuitBreakerResults.WithLabelValues(cb.ingesterID, resultSuccess).Inc() +} - return err +func (cb *circuitBreaker) recordError(err error) { + if !cb.isActive() { + return + } + cb.CircuitBreaker.RecordError(err) + cb.metrics.circuitBreakerResults.WithLabelValues(cb.ingesterID, resultError).Inc() } -func (cb *circuitBreaker) contextWithTimeout(parent context.Context, timeout time.Duration) (context.Context, context.CancelFunc) { - if timeout == 0 { - timeout = defaultTimeout +func (cb *circuitBreaker) finishPushRequest(ctx context.Context, startTimestamp time.Time, err error) { + if !cb.isActive() { + return } - ctx, cancel := context.WithTimeout(context.WithoutCancel(parent), timeout) if cb.cfg.testModeEnabled { - if initialDelay, ok := parent.Value(testDelayKey).(time.Duration); ok { + if initialDelay, ok := ctx.Value(testDelayKey).(time.Duration); ok { time.Sleep(initialDelay) } } - return ctx, cancel -} - -func (cb *circuitBreaker) StartPushRequest(ctx context.Context, reqSize int64, startPushRequest startPushRequestFn) (context.Context, error) { - callbackCtx, callbackErr := cb.get(ctx, func() (any, error) { - callbackCtx, _, callbackErr := startPushRequest(ctx, reqSize) - return callbackCtx, callbackErr - }) - if callbackErr == nil { - return callbackCtx.(context.Context), nil + if cb.cfg.PushTimeout < time.Since(startTimestamp) { + err = context.DeadlineExceeded } - return nil, callbackErr -} - -func (cb *circuitBreaker) Push(parent context.Context, req *mimirpb.WriteRequest, push pushRequestFn) (*mimirpb.WriteResponse, error) { - ctx, cancel := cb.contextWithTimeout(parent, cb.cfg.PushTimeout) - defer cancel() - - callbackResult, callbackErr := cb.get(ctx, func() (any, error) { - callbackResult, callbackErr := push(ctx, req) - if callbackErr != nil { - return callbackResult, callbackErr - } - - // We return ctx.Err() in order to register it with the circuit breaker's executor. - return callbackResult, ctx.Err() - }) - - if callbackResult == nil { - return nil, callbackErr + if err == nil { + cb.recordSuccess() + } else { + cb.recordError(err) } - return callbackResult.(*mimirpb.WriteResponse), callbackErr } diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go index 58cf861ce93..37f8e11bff9 100644 --- a/pkg/ingester/circuitbreaker_test.go +++ b/pkg/ingester/circuitbreaker_test.go @@ -9,7 +9,6 @@ import ( "testing" "time" - "github.com/grafana/dskit/grpcutil" "github.com/grafana/dskit/middleware" "github.com/grafana/dskit/services" "github.com/grafana/dskit/test" @@ -19,9 +18,9 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "google.golang.org/grpc/codes" "github.com/grafana/mimir/pkg/mimirpb" + "github.com/grafana/mimir/pkg/util/validation" ) func TestIngester_Push_CircuitBreaker(t *testing.T) { @@ -76,8 +75,9 @@ func TestIngester_Push_CircuitBreaker(t *testing.T) { testModeEnabled: true, } - i, err := prepareIngesterWithBlocksStorage(t, cfg, nil, registry) + overrides, err := validation.NewOverrides(defaultLimitsTestConfig(), nil) require.NoError(t, err) + i, _, _ := createTestIngesterWithIngestStorage(t, &cfg, overrides, registry) require.NoError(t, services.StartAndAwaitRunning(context.Background(), i)) defer services.StopAndAwaitTerminated(context.Background(), i) //nolint:errcheck @@ -95,7 +95,7 @@ func TestIngester_Push_CircuitBreaker(t *testing.T) { nil, mimirpb.API, ) - _, err = i.Push(ctx, req) + err = i.PushToStorage(ctx, req) require.NoError(t, err) count := 0 @@ -125,7 +125,7 @@ func TestIngester_Push_CircuitBreaker(t *testing.T) { if testCase.ctx != nil { ctx = testCase.ctx(ctx) } - _, err = i.Push(ctx, req) + err = i.PushToStorage(ctx, req) if initialDelayEnabled { if testCase.expectedErrorWhenCircuitBreakerClosed != nil { require.ErrorAs(t, err, &testCase.expectedErrorWhenCircuitBreakerClosed) @@ -148,15 +148,11 @@ func TestIngester_Push_CircuitBreaker(t *testing.T) { var expectedMetrics string if initialDelayEnabled { expectedMetrics = ` - # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. - # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 0 - cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="success"} 0 # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. # TYPE cortex_ingester_circuit_breaker_transitions_total counter - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="open"} 0 # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. # TYPE cortex_ingester_circuit_breaker_current_state gauge cortex_ingester_circuit_breaker_current_state{state="open"} 0 @@ -167,14 +163,14 @@ func TestIngester_Push_CircuitBreaker(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="circuit_breaker_open"} 2 - cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 2 - cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="success"} 1 + cortex_ingester_circuit_breaker_results_total{ingester="ingester-zone-a-0",result="circuit_breaker_open"} 2 + cortex_ingester_circuit_breaker_results_total{ingester="ingester-zone-a-0",result="error"} 2 + cortex_ingester_circuit_breaker_results_total{ingester="ingester-zone-a-0",result="success"} 1 # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. # TYPE cortex_ingester_circuit_breaker_transitions_total counter - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 1 + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="open"} 1 # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. # TYPE cortex_ingester_circuit_breaker_current_state gauge cortex_ingester_circuit_breaker_current_state{state="open"} 1 @@ -197,8 +193,4 @@ func checkCircuitBreakerOpenErr(ctx context.Context, err error, t *testing.T) { shouldLog, _ := optional.ShouldLog(ctx) require.False(t, shouldLog, "expected not to log via .ShouldLog()") - - s, ok := grpcutil.ErrorToStatus(err) - require.True(t, ok, "expected to be able to convert to gRPC status") - require.Equal(t, codes.Unavailable, s.Code()) } diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index cbbab427d7a..488dc676f31 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -975,9 +975,6 @@ type pushRequestState struct { // StartPushRequest checks if ingester can start push request, and increments relevant counters. // If new push request cannot be started, errors convertible to gRPC status code are returned, and metrics are updated. func (i *Ingester) StartPushRequest(ctx context.Context, reqSize int64) (context.Context, error) { - if i.circuitBreaker.isActive() { - return i.circuitBreaker.StartPushRequest(ctx, reqSize, i.startPushRequest) - } ctx, _, err := i.startPushRequest(ctx, reqSize) return ctx, err } @@ -997,6 +994,15 @@ func (i *Ingester) FinishPushRequest(ctx context.Context) { // // The shouldFinish flag tells if the caller must call finish on this request. If not, there is already someone in the call stack who will do that. func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ context.Context, shouldFinish bool, err error) { + if err := i.circuitBreaker.tryAcquirePermit(); err != nil { + return nil, false, err + } + defer func() { + if err != nil { + i.circuitBreaker.recordError(err) + } + }() + if err := i.checkAvailableForPush(); err != nil { return nil, false, err } @@ -1062,7 +1068,7 @@ func (i *Ingester) finishPushRequest(reqSize int64) { } // PushWithCleanup is the Push() implementation for blocks storage and takes a WriteRequest and adds it to the TSDB head. -func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteRequest, cleanUp func()) error { +func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteRequest, cleanUp func()) (err error) { // NOTE: because we use `unsafe` in deserialisation, we must not // retain anything from `req` past the exit from this function. defer cleanUp() @@ -1081,6 +1087,11 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques } } + start := time.Now() + defer func() { + i.circuitBreaker.finishPushRequest(ctx, start, err) + }() + userID, err := tenant.TenantID(ctx) if err != nil { return err @@ -3769,13 +3780,6 @@ func (i *Ingester) PushToStorage(ctx context.Context, req *mimirpb.WriteRequest) // Push implements client.IngesterServer, which is registered into gRPC server. func (i *Ingester) Push(ctx context.Context, req *mimirpb.WriteRequest) (*mimirpb.WriteResponse, error) { - if i.circuitBreaker.isActive() { - return i.circuitBreaker.Push(ctx, req, i.push) - } - return i.push(ctx, req) -} - -func (i *Ingester) push(ctx context.Context, req *mimirpb.WriteRequest) (*mimirpb.WriteResponse, error) { if !i.cfg.PushGrpcMethodEnabled { return nil, errPushGrpcDisabled } From 020db98cf884e16b4685b7f62832e2d55c9b4cca Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Tue, 28 May 2024 18:47:50 +0200 Subject: [PATCH 04/16] Implementing the gauge as NewGaugeFunc --- pkg/ingester/circuitbreaker.go | 89 +++++++++++++++++++---------- pkg/ingester/circuitbreaker_test.go | 2 +- 2 files changed, 60 insertions(+), 31 deletions(-) diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index ef24aa309d4..58cd1d2ced8 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -24,21 +24,58 @@ const ( resultError = "error" resultOpen = "circuit_breaker_open" testDelayKey testCtxKey = "test-delay" + + circuitBreakerCurrentStateGaugeName = "cortex_ingester_circuit_breaker_current_state" + circuitBreakerCurrentStateGaugeHelp = "Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name." + circuitBreakerCurrentStateGaugeLabel = "state" ) type circuitBreakerMetrics struct { circuitBreakerCurrentState *prometheus.GaugeVec + circuitBreakerOpenStateGauge prometheus.GaugeFunc + circuitBreakerHalfOpenStateGauge prometheus.GaugeFunc + circuitBreakerClosedStateGauge prometheus.GaugeFunc + circuitBreakerTransitions *prometheus.CounterVec circuitBreakerResults *prometheus.CounterVec } -func newCircuitBreakerMetrics(r prometheus.Registerer) *circuitBreakerMetrics { +func newCircuitBreakerMetrics(r prometheus.Registerer, currentStateFn func() circuitbreaker.State) *circuitBreakerMetrics { return &circuitBreakerMetrics{ - circuitBreakerCurrentState: promauto.With(r).NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_ingester_circuit_breaker_current_state", - Help: "Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name.", - }, []string{"state"}), + circuitBreakerOpenStateGauge: promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ + Name: circuitBreakerCurrentStateGaugeName, + Help: circuitBreakerCurrentStateGaugeHelp, + ConstLabels: map[string]string{circuitBreakerCurrentStateGaugeLabel: circuitbreaker.OpenState.String()}, + }, func() float64 { + if currentStateFn() == circuitbreaker.OpenState { + return 1 + } else { + return 0 + } + }), + circuitBreakerHalfOpenStateGauge: promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ + Name: circuitBreakerCurrentStateGaugeName, + Help: circuitBreakerCurrentStateGaugeHelp, + ConstLabels: map[string]string{circuitBreakerCurrentStateGaugeLabel: circuitbreaker.HalfOpenState.String()}, + }, func() float64 { + if currentStateFn() == circuitbreaker.HalfOpenState { + return 1 + } else { + return 0 + } + }), + circuitBreakerClosedStateGauge: promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ + Name: circuitBreakerCurrentStateGaugeName, + Help: circuitBreakerCurrentStateGaugeHelp, + ConstLabels: map[string]string{circuitBreakerCurrentStateGaugeLabel: circuitbreaker.ClosedState.String()}, + }, func() float64 { + if currentStateFn() == circuitbreaker.ClosedState { + return 1 + } else { + return 0 + } + }), circuitBreakerTransitions: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_ingester_circuit_breaker_transitions_total", Help: "Number of times the circuit breaker has entered a state.", @@ -86,37 +123,32 @@ type circuitBreaker struct { } func newCircuitBreaker(cfg CircuitBreakerConfig, ingesterID string, logger log.Logger, registerer prometheus.Registerer) *circuitBreaker { - metrics := newCircuitBreakerMetrics(registerer) + cb := circuitBreaker{ + cfg: cfg, + ingesterID: ingesterID, + logger: logger, + startTime: time.Now().Add(cfg.InitialDelay), + } + + cb.metrics = newCircuitBreakerMetrics(registerer, cb.State) // Initialize each of the known labels for circuit breaker metrics for this particular ingester. - transitionOpen := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.OpenState.String()) - transitionHalfOpen := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.HalfOpenState.String()) - transitionClosed := metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.ClosedState.String()) - gaugeOpen := metrics.circuitBreakerCurrentState.WithLabelValues(circuitbreaker.OpenState.String()) - gaugeHalfOpen := metrics.circuitBreakerCurrentState.WithLabelValues(circuitbreaker.HalfOpenState.String()) - gaugeClosed := metrics.circuitBreakerCurrentState.WithLabelValues(circuitbreaker.ClosedState.String()) + transitionOpen := cb.metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.OpenState.String()) + transitionHalfOpen := cb.metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.HalfOpenState.String()) + transitionClosed := cb.metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.ClosedState.String()) cbBuilder := circuitbreaker.Builder[any](). WithFailureThreshold(cfg.FailureThreshold). WithDelay(cfg.CooldownPeriod). OnClose(func(event circuitbreaker.StateChangedEvent) { transitionClosed.Inc() - gaugeOpen.Set(0) - gaugeHalfOpen.Set(0) - gaugeClosed.Set(1) level.Info(logger).Log("msg", "circuit breaker is closed", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) }). OnOpen(func(event circuitbreaker.StateChangedEvent) { transitionOpen.Inc() - gaugeOpen.Set(1) - gaugeHalfOpen.Set(0) - gaugeClosed.Set(0) level.Warn(logger).Log("msg", "circuit breaker is open", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) }). OnHalfOpen(func(event circuitbreaker.StateChangedEvent) { transitionHalfOpen.Inc() - gaugeOpen.Set(0) - gaugeHalfOpen.Set(1) - gaugeClosed.Set(0) level.Info(logger).Log("msg", "circuit breaker is half-open", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) }). HandleIf(func(_ any, err error) bool { return isFailure(err) }) @@ -130,15 +162,8 @@ func newCircuitBreaker(cfg CircuitBreakerConfig, ingesterID string, logger log.L cbBuilder = cbBuilder.WithFailureRateThreshold(cfg.FailureThreshold, cfg.FailureExecutionThreshold, cfg.ThresholdingPeriod) } - cb := cbBuilder.Build() - return &circuitBreaker{ - cfg: cfg, - CircuitBreaker: cb, - ingesterID: ingesterID, - logger: logger, - metrics: metrics, - startTime: time.Now().Add(cfg.InitialDelay), - } + cb.CircuitBreaker = cbBuilder.Build() + return &cb } func isFailure(err error) bool { @@ -162,6 +187,10 @@ func isFailure(err error) bool { return false } +func (cb *circuitBreaker) State() circuitbreaker.State { + return cb.CircuitBreaker.State() +} + func (cb *circuitBreaker) isActive() bool { if cb == nil { return false diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go index 37f8e11bff9..933b8af9437 100644 --- a/pkg/ingester/circuitbreaker_test.go +++ b/pkg/ingester/circuitbreaker_test.go @@ -157,7 +157,7 @@ func TestIngester_Push_CircuitBreaker(t *testing.T) { # TYPE cortex_ingester_circuit_breaker_current_state gauge cortex_ingester_circuit_breaker_current_state{state="open"} 0 cortex_ingester_circuit_breaker_current_state{state="half-open"} 0 - cortex_ingester_circuit_breaker_current_state{state="closed"} 0 + cortex_ingester_circuit_breaker_current_state{state="closed"} 1 ` } else { expectedMetrics = ` From ece7eebcdbdf141e9eab8ce7ef54c63fae15da46 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Tue, 28 May 2024 21:38:48 +0200 Subject: [PATCH 05/16] Fixing review findings Signed-off-by: Yuri Nikolic --- pkg/ingester/circuitbreaker.go | 19 +- pkg/ingester/circuitbreaker_test.go | 349 ++++++++++++++++++++++++++++ pkg/ingester/ingester.go | 60 +++-- 3 files changed, 401 insertions(+), 27 deletions(-) diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index 58cd1d2ced8..d3d83755c21 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -10,9 +10,12 @@ import ( "github.com/failsafe-go/failsafe-go/circuitbreaker" "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/grafana/dskit/grpcutil" + "github.com/grafana/dskit/middleware" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + "google.golang.org/grpc/codes" "github.com/grafana/mimir/pkg/mimirpb" ) @@ -179,6 +182,11 @@ func isFailure(err error) bool { return true } + statusCode := grpcutil.ErrorToStatusCode(err) + if statusCode == codes.DeadlineExceeded { + return true + } + var ingesterErr ingesterError if errors.As(err, &ingesterErr) { return ingesterErr.errorCause() == mimirpb.INSTANCE_LIMIT @@ -204,7 +212,7 @@ func (cb *circuitBreaker) tryAcquirePermit() error { } if !cb.CircuitBreaker.TryAcquirePermit() { cb.metrics.circuitBreakerResults.WithLabelValues(cb.ingesterID, resultOpen).Inc() - return newCircuitBreakerOpenError(cb.RemainingDelay()) + return middleware.DoNotLogError{Err: newCircuitBreakerOpenError(cb.RemainingDelay())} } return nil } @@ -225,16 +233,16 @@ func (cb *circuitBreaker) recordError(err error) { cb.metrics.circuitBreakerResults.WithLabelValues(cb.ingesterID, resultError).Inc() } -func (cb *circuitBreaker) finishPushRequest(ctx context.Context, startTimestamp time.Time, err error) { +func (cb *circuitBreaker) finishPushRequest(ctx context.Context, duration time.Duration, err error) error { if !cb.isActive() { - return + return nil } if cb.cfg.testModeEnabled { if initialDelay, ok := ctx.Value(testDelayKey).(time.Duration); ok { - time.Sleep(initialDelay) + duration += initialDelay } } - if cb.cfg.PushTimeout < time.Since(startTimestamp) { + if cb.cfg.PushTimeout < duration { err = context.DeadlineExceeded } if err == nil { @@ -242,4 +250,5 @@ func (cb *circuitBreaker) finishPushRequest(ctx context.Context, startTimestamp } else { cb.recordError(err) } + return err } diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go index 933b8af9437..bb527237c06 100644 --- a/pkg/ingester/circuitbreaker_test.go +++ b/pkg/ingester/circuitbreaker_test.go @@ -9,6 +9,8 @@ import ( "testing" "time" + "github.com/go-kit/log" + "github.com/gogo/status" "github.com/grafana/dskit/middleware" "github.com/grafana/dskit/services" "github.com/grafana/dskit/test" @@ -18,11 +20,236 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "google.golang.org/grpc/codes" "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/validation" ) +func TestIsFailure(t *testing.T) { + t.Run("no error", func(t *testing.T) { + require.False(t, isFailure(nil)) + }) + + t.Run("context cancelled", func(t *testing.T) { + require.False(t, isFailure(context.Canceled)) + require.False(t, isFailure(fmt.Errorf("%w", context.Canceled))) + }) + + t.Run("gRPC context cancelled", func(t *testing.T) { + err := status.Error(codes.Canceled, "cancelled!") + require.False(t, isFailure(err)) + require.False(t, isFailure(fmt.Errorf("%w", err))) + }) + + t.Run("gRPC deadline exceeded", func(t *testing.T) { + err := status.Error(codes.DeadlineExceeded, "broken!") + require.True(t, isFailure(err)) + require.True(t, isFailure(fmt.Errorf("%w", err))) + }) + + t.Run("gRPC unavailable with INSTANCE_LIMIT details", func(t *testing.T) { + err := newInstanceLimitReachedError("broken") + require.True(t, isFailure(err)) + require.True(t, isFailure(fmt.Errorf("%w", err))) + }) + + t.Run("gRPC unavailable with SERVICE_UNAVAILABLE details is not a failure", func(t *testing.T) { + stat := status.New(codes.Unavailable, "broken!") + stat, err := stat.WithDetails(&mimirpb.ErrorDetails{Cause: mimirpb.SERVICE_UNAVAILABLE}) + require.NoError(t, err) + err = stat.Err() + require.False(t, isFailure(err)) + require.False(t, isFailure(fmt.Errorf("%w", err))) + }) + + t.Run("gRPC unavailable without details is not a failure", func(t *testing.T) { + err := status.Error(codes.Unavailable, "broken!") + require.False(t, isFailure(err)) + require.False(t, isFailure(fmt.Errorf("%w", err))) + }) +} + +func TestCircuitBreaker_IsActive(t *testing.T) { + var cb *circuitBreaker + + require.False(t, cb.isActive()) + + registry := prometheus.NewRegistry() + cfg := CircuitBreakerConfig{Enabled: true, InitialDelay: 10 * time.Millisecond} + cb = newCircuitBreaker(cfg, "ingester", log.NewNopLogger(), registry) + // When InitialDelay is set, circuit breaker is not immediately active. + require.False(t, cb.isActive()) + + // After InitialDelay passed, circuit breaker becomes active. + time.Sleep(10 * time.Millisecond) + require.True(t, cb.isActive()) +} + +func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { + metricNames := []string{ + "cortex_ingester_circuit_breaker_results_total", + "cortex_ingester_circuit_breaker_transitions_total", + "cortex_ingester_circuit_breaker_current_state", + } + cfg := CircuitBreakerConfig{Enabled: true, CooldownPeriod: 10 * time.Second} + testCases := map[string]struct { + circuitBreakerSetup func(*circuitBreaker) + expectedCircuitBreakerError bool + expectedMetrics string + }{ + "if circuit breaker closed, no error returned": { + circuitBreakerSetup: func(cb *circuitBreaker) { cb.Close() }, + expectedCircuitBreakerError: false, + }, + "if circuit breaker open, a circuitBreakerErrorOpen is returned": { + circuitBreakerSetup: func(cb *circuitBreaker) { cb.Open() }, + expectedCircuitBreakerError: true, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="circuit_breaker_open"} 1 + # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. + # TYPE cortex_ingester_circuit_breaker_transitions_total counter + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="open"} 1 + # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. + # TYPE cortex_ingester_circuit_breaker_current_state gauge + cortex_ingester_circuit_breaker_current_state{state="open"} 1 + cortex_ingester_circuit_breaker_current_state{state="half-open"} 0 + cortex_ingester_circuit_breaker_current_state{state="closed"} 0 + `, + }, + "if circuit breaker half-open, a circuitBreakerErrorOpen is returned": { + circuitBreakerSetup: func(cb *circuitBreaker) { cb.HalfOpen() }, + expectedCircuitBreakerError: true, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="circuit_breaker_open"} 1 + # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. + # TYPE cortex_ingester_circuit_breaker_transitions_total counter + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="half-open"} 1 + cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="open"} 0 + # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. + # TYPE cortex_ingester_circuit_breaker_current_state gauge + cortex_ingester_circuit_breaker_current_state{state="open"} 0 + cortex_ingester_circuit_breaker_current_state{state="half-open"} 1 + cortex_ingester_circuit_breaker_current_state{state="closed"} 0 + `, + }, + } + ctx := context.Background() + for testName, testCase := range testCases { + t.Run(testName, func(t *testing.T) { + registry := prometheus.NewRegistry() + cb := newCircuitBreaker(cfg, "ingester", log.NewNopLogger(), registry) + testCase.circuitBreakerSetup(cb) + err := cb.tryAcquirePermit() + if testCase.expectedCircuitBreakerError { + checkCircuitBreakerOpenErr(ctx, err, t) + assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(testCase.expectedMetrics), metricNames...)) + } else { + require.NoError(t, err) + } + }) + } +} + +func TestCircuitBreaker_RecordSuccess(t *testing.T) { + registry := prometheus.NewRegistry() + metricNames := []string{ + "cortex_ingester_circuit_breaker_results_total", + } + cfg := CircuitBreakerConfig{Enabled: true, CooldownPeriod: 10 * time.Second} + cb := newCircuitBreaker(cfg, "ingester", log.NewNopLogger(), registry) + cb.recordSuccess() + expectedMetrics := ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="success"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), metricNames...)) +} + +func TestCircuitBreaker_RecordError(t *testing.T) { + registry := prometheus.NewRegistry() + metricNames := []string{ + "cortex_ingester_circuit_breaker_results_total", + } + cfg := CircuitBreakerConfig{Enabled: true, CooldownPeriod: 10 * time.Second} + cb := newCircuitBreaker(cfg, "ingester", log.NewNopLogger(), registry) + cb.recordError(context.DeadlineExceeded) + expectedMetrics := ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="error"} 1 + ` + assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), metricNames...)) +} + +func TestCircuitBreaker_FinishPushRequest(t *testing.T) { + metricNames := []string{ + "cortex_ingester_circuit_breaker_results_total", + } + cfg := CircuitBreakerConfig{ + Enabled: true, + PushTimeout: 2 * time.Second, + } + testCases := map[string]struct { + delay time.Duration + err error + expectedErr error + expectedMetrics string + }{ + "delay lower than PushTimeout and no input err give success": { + delay: 1 * time.Second, + err: nil, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="success"} 1 + `, + }, + "delay higher than PushTimeout and no input error give context deadline exceeded error": { + delay: 3 * time.Second, + err: nil, + expectedErr: context.DeadlineExceeded, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="error"} 1 + `, + }, + "delay higher than PushTimeout and an input error different from context deadline exceeded give context deadline exceeded error": { + delay: 3 * time.Second, + err: newInstanceLimitReachedError("error"), + expectedErr: context.DeadlineExceeded, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="error"} 1 + `, + }, + } + for testName, testCase := range testCases { + registry := prometheus.NewRegistry() + ctx := context.Background() + cb := newCircuitBreaker(cfg, "ingester", log.NewNopLogger(), registry) + t.Run(testName, func(t *testing.T) { + err := cb.finishPushRequest(ctx, testCase.delay, testCase.err) + if testCase.expectedErr == nil { + require.NoError(t, err) + } else { + require.ErrorIs(t, err, testCase.expectedErr) + } + assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(testCase.expectedMetrics), metricNames...)) + }) + } +} + func TestIngester_Push_CircuitBreaker(t *testing.T) { pushTimeout := 100 * time.Millisecond tests := map[string]struct { @@ -194,3 +421,125 @@ func checkCircuitBreakerOpenErr(ctx context.Context, err error, t *testing.T) { shouldLog, _ := optional.ShouldLog(ctx) require.False(t, shouldLog, "expected not to log via .ShouldLog()") } + +func TestIngester_StartPushRequest_CircuitBreakerOpen(t *testing.T) { + reg := prometheus.NewPedanticRegistry() + cfg := defaultIngesterTestConfig(t) + cfg.CircuitBreakerConfig = CircuitBreakerConfig{Enabled: true, CooldownPeriod: 10 * time.Second} + + i, err := prepareIngesterWithBlocksStorage(t, cfg, nil, reg) + require.NoError(t, err) + require.NoError(t, services.StartAndAwaitRunning(context.Background(), i)) + defer services.StopAndAwaitTerminated(context.Background(), i) //nolint:errcheck + + // Wait until the ingester is healthy + test.Poll(t, 100*time.Millisecond, 1, func() interface{} { + return i.lifecycler.HealthyInstancesCount() + }) + + ctx := user.InjectOrgID(context.Background(), "test") + + // If i's circuit breaker is closed, StartPushRequest is successful. + i.circuitBreaker.Close() + _, err = i.StartPushRequest(ctx, 0) + require.NoError(t, err) + + // If i's circuit breaker is open, StartPushRequest returns a circuitBreakerOpenError. + i.circuitBreaker.Open() + _, err = i.StartPushRequest(ctx, 0) + require.Error(t, err) + require.ErrorAs(t, err, &circuitBreakerOpenError{}) + + metricNames := []string{ + "cortex_ingester_circuit_breaker_results_total", + "cortex_ingester_circuit_breaker_transitions_total", + "cortex_ingester_circuit_breaker_current_state", + } + expectedMetrics := ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="circuit_breaker_open"} 1 + # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. + # TYPE cortex_ingester_circuit_breaker_transitions_total counter + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 1 + # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. + # TYPE cortex_ingester_circuit_breaker_current_state gauge + cortex_ingester_circuit_breaker_current_state{state="open"} 1 + cortex_ingester_circuit_breaker_current_state{state="half-open"} 0 + cortex_ingester_circuit_breaker_current_state{state="closed"} 0 + ` + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(expectedMetrics), metricNames...)) +} + +func TestIngester_FinishPushRequest(t *testing.T) { + metricNames := []string{ + "cortex_ingester_circuit_breaker_results_total", + } + testCases := map[string]struct { + delay time.Duration + err error + expectedMetrics string + }{ + "delay lower than PushTimeout and no input err give success": { + delay: 1 * time.Second, + err: nil, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="success"} 1 + `, + }, + "delay higher than PushTimeout and no input error give context deadline exceeded error": { + delay: 3 * time.Second, + err: nil, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 1 + `, + }, + "delay higher than PushTimeout and an input error different from context deadline exceeded give context deadline exceeded error": { + delay: 3 * time.Second, + err: newInstanceLimitReachedError("error"), + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 1 + `, + }, + } + for testName, testCase := range testCases { + reg := prometheus.NewPedanticRegistry() + cfg := defaultIngesterTestConfig(t) + cfg.CircuitBreakerConfig = CircuitBreakerConfig{ + Enabled: true, + PushTimeout: 2 * time.Second, + } + + i, err := prepareIngesterWithBlocksStorage(t, cfg, nil, reg) + require.NoError(t, err) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), i)) + defer services.StopAndAwaitTerminated(context.Background(), i) //nolint:errcheck + + // Wait until the ingester is healthy + test.Poll(t, 100*time.Millisecond, 1, func() interface{} { + return i.lifecycler.HealthyInstancesCount() + }) + + ctx := user.InjectOrgID(context.Background(), "test") + + st := &pushRequestState{ + requestDuration: testCase.delay, + err: testCase.err, + } + ctx = context.WithValue(ctx, pushReqCtxKey, st) + + t.Run(testName, func(t *testing.T) { + i.FinishPushRequest(ctx) + assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(testCase.expectedMetrics), metricNames...)) + }) + } +} diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 488dc676f31..b0eb9d297dc 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -969,7 +969,9 @@ type ctxKey int var pushReqCtxKey ctxKey = 1 type pushRequestState struct { - requestSize int64 + requestSize int64 + requestDuration time.Duration + err error } // StartPushRequest checks if ingester can start push request, and increments relevant counters. @@ -984,7 +986,7 @@ func (i *Ingester) FinishPushRequest(ctx context.Context) { if !ok { return } - i.finishPushRequest(st.requestSize) + i.finishPushRequest(ctx, st.requestSize, st.requestDuration, st.err) } // This method can be called in two ways: 1. Ingester.PushWithCleanup, or 2. Ingester.StartPushRequest via gRPC server's method limiter. @@ -994,15 +996,6 @@ func (i *Ingester) FinishPushRequest(ctx context.Context) { // // The shouldFinish flag tells if the caller must call finish on this request. If not, there is already someone in the call stack who will do that. func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ context.Context, shouldFinish bool, err error) { - if err := i.circuitBreaker.tryAcquirePermit(); err != nil { - return nil, false, err - } - defer func() { - if err != nil { - i.circuitBreaker.recordError(err) - } - }() - if err := i.checkAvailableForPush(); err != nil { return nil, false, err } @@ -1011,6 +1004,15 @@ func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ conte // If state is already in context, this means we already passed through StartPushRequest for this request. return ctx, false, nil } + + // We try to acquire a permit from the circuit breaker. + // If it is not possible, it is because the circuit breaker is open, and a circuitBreakerOpenError is returned. + // If it is possible, a permit has to be released by recording either a success or a failure with the circuit + // breaker. This is done in finishPushRequest(). + if err := i.circuitBreaker.tryAcquirePermit(); err != nil { + return nil, false, err + } + st := &pushRequestState{ requestSize: reqSize, } @@ -1030,7 +1032,11 @@ func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ conte defer func() { if finishRequestInDefer { - i.finishPushRequest(reqSize) + // If startPushRequest() require finishPushRequest() to be executed in defer, + // it is because one the per-instance limits has been hit. In this case, the + // corresponding error has to be passed to finishPushRequest() in order to + // record the failure with the circuit breaker. + i.finishPushRequest(ctx, reqSize, 0, err) } }() @@ -1060,11 +1066,12 @@ func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ conte return ctx, true, nil } -func (i *Ingester) finishPushRequest(reqSize int64) { +func (i *Ingester) finishPushRequest(ctx context.Context, reqSize int64, duration time.Duration, err error) { i.inflightPushRequests.Dec() if reqSize > 0 { i.inflightPushRequestsBytes.Sub(reqSize) } + i.circuitBreaker.finishPushRequest(ctx, duration, err) } // PushWithCleanup is the Push() implementation for blocks storage and takes a WriteRequest and adds it to the TSDB head. @@ -1073,25 +1080,34 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // retain anything from `req` past the exit from this function. defer cleanUp() + start := time.Now() // Only start/finish request here when the request comes NOT from grpc handlers (i.e., from ingest.Store). // NOTE: request coming from grpc handler may end up calling start multiple times during its lifetime (e.g., when migrating to ingest storage). // startPushRequest handles this. if i.cfg.IngestStorageConfig.Enabled || !i.cfg.LimitInflightRequestsUsingGrpcMethodLimiter { reqSize := int64(req.Size()) - _, shouldFinish, err := i.startPushRequest(ctx, reqSize) - if err != nil { - return middleware.DoNotLogError{Err: err} + _, shouldFinish, startPushErr := i.startPushRequest(ctx, reqSize) + if startPushErr != nil { + return middleware.DoNotLogError{Err: startPushErr} } if shouldFinish { - defer i.finishPushRequest(reqSize) + defer func() { + i.finishPushRequest(ctx, reqSize, time.Since(start), err) + }() } + } else { + defer func() { + // We enrich the pushRequestState contained in the context with this PushWithCleanUp() + // call duration, and a possible error it returns. These data are needed during a + // successive call to FinishPushRequest(). + if st, ok := ctx.Value(pushReqCtxKey).(*pushRequestState); ok { + st.requestDuration = time.Since(start) + st.err = err + ctx = context.WithValue(ctx, pushReqCtxKey, st) + } + }() } - start := time.Now() - defer func() { - i.circuitBreaker.finishPushRequest(ctx, start, err) - }() - userID, err := tenant.TenantID(ctx) if err != nil { return err From 8c5cabedd3a6f3fe58144f380d992b629ec71cb4 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Tue, 28 May 2024 22:06:12 +0200 Subject: [PATCH 06/16] Fixing lint issues Signed-off-by: Yuri Nikolic --- pkg/ingester/circuitbreaker.go | 11 +++-------- pkg/ingester/ingester.go | 2 +- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index d3d83755c21..765c637e511 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -34,8 +34,6 @@ const ( ) type circuitBreakerMetrics struct { - circuitBreakerCurrentState *prometheus.GaugeVec - circuitBreakerOpenStateGauge prometheus.GaugeFunc circuitBreakerHalfOpenStateGauge prometheus.GaugeFunc circuitBreakerClosedStateGauge prometheus.GaugeFunc @@ -53,9 +51,8 @@ func newCircuitBreakerMetrics(r prometheus.Registerer, currentStateFn func() cir }, func() float64 { if currentStateFn() == circuitbreaker.OpenState { return 1 - } else { - return 0 } + return 0 }), circuitBreakerHalfOpenStateGauge: promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ Name: circuitBreakerCurrentStateGaugeName, @@ -64,9 +61,8 @@ func newCircuitBreakerMetrics(r prometheus.Registerer, currentStateFn func() cir }, func() float64 { if currentStateFn() == circuitbreaker.HalfOpenState { return 1 - } else { - return 0 } + return 0 }), circuitBreakerClosedStateGauge: promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ Name: circuitBreakerCurrentStateGaugeName, @@ -75,9 +71,8 @@ func newCircuitBreakerMetrics(r prometheus.Registerer, currentStateFn func() cir }, func() float64 { if currentStateFn() == circuitbreaker.ClosedState { return 1 - } else { - return 0 } + return 0 }), circuitBreakerTransitions: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_ingester_circuit_breaker_transitions_total", diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index b0eb9d297dc..3a265a78237 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -1071,7 +1071,7 @@ func (i *Ingester) finishPushRequest(ctx context.Context, reqSize int64, duratio if reqSize > 0 { i.inflightPushRequestsBytes.Sub(reqSize) } - i.circuitBreaker.finishPushRequest(ctx, duration, err) + _ = i.circuitBreaker.finishPushRequest(ctx, duration, err) } // PushWithCleanup is the Push() implementation for blocks storage and takes a WriteRequest and adds it to the TSDB head. From f927f76ab123a2fe976a5458ac00446706d10c9d Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Tue, 28 May 2024 23:02:14 +0200 Subject: [PATCH 07/16] Adding test for hitting deadline when ingester.Push is used Signed-off-by: Yuri Nikolic --- pkg/ingester/circuitbreaker_test.go | 143 +++++++++++++++++++++++++++- 1 file changed, 142 insertions(+), 1 deletion(-) diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go index bb527237c06..d6b90613695 100644 --- a/pkg/ingester/circuitbreaker_test.go +++ b/pkg/ingester/circuitbreaker_test.go @@ -9,6 +9,7 @@ import ( "testing" "time" + "github.com/failsafe-go/failsafe-go/circuitbreaker" "github.com/go-kit/log" "github.com/gogo/status" "github.com/grafana/dskit/middleware" @@ -250,7 +251,7 @@ func TestCircuitBreaker_FinishPushRequest(t *testing.T) { } } -func TestIngester_Push_CircuitBreaker(t *testing.T) { +func TestIngester_PushToStorage_CircuitBreaker(t *testing.T) { pushTimeout := 100 * time.Millisecond tests := map[string]struct { expectedErrorWhenCircuitBreakerClosed error @@ -543,3 +544,143 @@ func TestIngester_FinishPushRequest(t *testing.T) { }) } } + +func TestIngester_Push_CircuitBreaker_DeadlineExceeded(t *testing.T) { + pushTimeout := 100 * time.Millisecond + for initialDelayEnabled, initialDelayStatus := range map[bool]string{false: "disabled", true: "enabled"} { + t.Run(fmt.Sprintf("test slow push with initial delay %s", initialDelayStatus), func(t *testing.T) { + metricLabelAdapters := [][]mimirpb.LabelAdapter{{{Name: labels.MetricName, Value: "test"}}} + metricNames := []string{ + "cortex_ingester_circuit_breaker_results_total", + "cortex_ingester_circuit_breaker_transitions_total", + "cortex_ingester_circuit_breaker_current_state", + } + + registry := prometheus.NewRegistry() + + // Create a mocked ingester + cfg := defaultIngesterTestConfig(t) + cfg.ActiveSeriesMetrics.IdleTimeout = 100 * time.Millisecond + failureThreshold := 2 + var initialDelay time.Duration + if initialDelayEnabled { + initialDelay = 200 * time.Millisecond + } + cfg.CircuitBreakerConfig = CircuitBreakerConfig{ + Enabled: true, + FailureThreshold: uint(failureThreshold), + CooldownPeriod: 10 * time.Second, + InitialDelay: initialDelay, + PushTimeout: pushTimeout, + testModeEnabled: true, + } + + i, err := prepareIngesterWithBlocksStorage(t, cfg, nil, registry) + require.NoError(t, err) + + require.NoError(t, services.StartAndAwaitRunning(context.Background(), i)) + defer services.StopAndAwaitTerminated(context.Background(), i) //nolint:errcheck + + // Wait until the ingester is healthy + test.Poll(t, 100*time.Millisecond, 1, func() interface{} { + return i.lifecycler.HealthyInstancesCount() + }) + + // the first request is successful + ctx := user.InjectOrgID(context.Background(), "test-0") + req := mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 1, TimestampMs: 8}}, + nil, + nil, + mimirpb.API, + ) + err = i.PushToStorage(ctx, req) + require.NoError(t, err) + + count := 0 + + // Push timeseries for each user + for _, userID := range []string{"test-1", "test-2"} { + reqs := []*mimirpb.WriteRequest{ + mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 1, TimestampMs: 9}}, + nil, + nil, + mimirpb.API, + ), + mimirpb.ToWriteRequest( + metricLabelAdapters, + []mimirpb.Sample{{Value: 2, TimestampMs: 10}}, + nil, + nil, + mimirpb.API, + ), + } + + for _, req := range reqs { + ctx := user.InjectOrgID(context.Background(), userID) + ctx = context.WithValue(ctx, testDelayKey, 2*pushTimeout) + count++ + + ctx, err = i.StartPushRequest(ctx, int64(req.Size())) + if initialDelayEnabled || count <= failureThreshold { + // If initial delay is enabled we expect no deadline exceeded errors + // to be registered with the circuit breaker. + // If initial delay is disabled, and the circuit breaker registered + // less than failureThreshold deadline exceeded errors, it is still + // closed. + require.NoError(t, err) + require.Equal(t, circuitbreaker.ClosedState, i.circuitBreaker.State()) + st, ok := ctx.Value(pushReqCtxKey).(*pushRequestState) + require.True(t, ok) + require.Equal(t, int64(req.Size()), st.requestSize) + _, err = i.Push(ctx, req) + require.NoError(t, err) + i.FinishPushRequest(ctx) + } else { + require.Equal(t, circuitbreaker.OpenState, i.circuitBreaker.State()) + require.Nil(t, ctx) + checkCircuitBreakerOpenErr(ctx, err, t) + } + } + } + + // Check tracked Prometheus metrics + var expectedMetrics string + if initialDelayEnabled { + expectedMetrics = ` + # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. + # TYPE cortex_ingester_circuit_breaker_transitions_total counter + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 0 + # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. + # TYPE cortex_ingester_circuit_breaker_current_state gauge + cortex_ingester_circuit_breaker_current_state{state="open"} 0 + cortex_ingester_circuit_breaker_current_state{state="half-open"} 0 + cortex_ingester_circuit_breaker_current_state{state="closed"} 1 + ` + } else { + expectedMetrics = ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="circuit_breaker_open"} 2 + cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 2 + # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. + # TYPE cortex_ingester_circuit_breaker_transitions_total counter + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 1 + # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. + # TYPE cortex_ingester_circuit_breaker_current_state gauge + cortex_ingester_circuit_breaker_current_state{state="open"} 1 + cortex_ingester_circuit_breaker_current_state{state="half-open"} 0 + cortex_ingester_circuit_breaker_current_state{state="closed"} 0 + ` + } + assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), metricNames...)) + }) + } +} From 0c3fbd48f39867c9fdd5910176da2c0ed67fdbbf Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Fri, 31 May 2024 14:28:02 +0200 Subject: [PATCH 08/16] Fixing review findings Signed-off-by: Yuri Nikolic --- cmd/mimir/config-descriptor.json | 6 +- cmd/mimir/help-all.txt.tmpl | 4 +- cmd/mimir/help.txt.tmpl | 2 +- .../configuration-parameters/index.md | 6 +- pkg/ingester/circuitbreaker.go | 210 +++++----- pkg/ingester/circuitbreaker_test.go | 386 ++++++++++++------ pkg/ingester/ingester.go | 78 ++-- 7 files changed, 406 insertions(+), 286 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index b47db34ed85..d35075dcd8d 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -3160,12 +3160,12 @@ }, { "kind": "field", - "name": "failure_threshold", + "name": "failure_threshold_percentage", "required": false, "desc": "Max percentage of requests that can fail over period before the circuit breaker opens", "fieldValue": null, "fieldDefaultValue": 10, - "fieldFlag": "ingester.circuit-breaker.failure-threshold", + "fieldFlag": "ingester.circuit-breaker.failure-threshold-percentage", "fieldType": "int", "fieldCategory": "experimental" }, @@ -3219,7 +3219,7 @@ "required": false, "desc": "How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors", "fieldValue": null, - "fieldDefaultValue": 0, + "fieldDefaultValue": 2000000000, "fieldFlag": "ingester.circuit-breaker.push-timeout", "fieldType": "duration", "fieldCategory": "experiment" diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index da93d0dc327..b66023e203a 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1313,12 +1313,12 @@ Usage of ./cmd/mimir/mimir: [experimental] Enable circuit breaking when making requests to ingesters -ingester.circuit-breaker.failure-execution-threshold uint [experimental] How many requests must have been executed in period for the circuit breaker to be eligible to open for the rate of failures (default 100) - -ingester.circuit-breaker.failure-threshold uint + -ingester.circuit-breaker.failure-threshold-percentage uint [experimental] Max percentage of requests that can fail over period before the circuit breaker opens (default 10) -ingester.circuit-breaker.initial-delay duration [experimental] How long the circuit breaker should wait between creation and starting up. During that time both failures and successes will not be counted. -ingester.circuit-breaker.push-timeout duration - How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors + How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors (default 2s) -ingester.circuit-breaker.thresholding-period duration [experimental] Moving window of time that the percentage of failed requests is computed over (default 1m0s) -ingester.client.backoff-max-period duration diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl index 0309a3518e3..ceafc3b62af 100644 --- a/cmd/mimir/help.txt.tmpl +++ b/cmd/mimir/help.txt.tmpl @@ -390,7 +390,7 @@ Usage of ./cmd/mimir/mimir: -help-all Print help, also including advanced and experimental parameters. -ingester.circuit-breaker.push-timeout duration - How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors + How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors (default 2s) -ingester.max-global-metadata-per-metric int The maximum number of metadata per metric, across the cluster. 0 to disable. -ingester.max-global-metadata-per-user int diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 423629ccefd..e45ebf41126 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -1225,8 +1225,8 @@ circuit_breaker: # (experimental) Max percentage of requests that can fail over period before # the circuit breaker opens - # CLI flag: -ingester.circuit-breaker.failure-threshold - [failure_threshold: | default = 10] + # CLI flag: -ingester.circuit-breaker.failure-threshold-percentage + [failure_threshold_percentage: | default = 10] # (experimental) How many requests must have been executed in period for the # circuit breaker to be eligible to open for the rate of failures @@ -1254,7 +1254,7 @@ circuit_breaker: # used for circuit breakers only, and timeout expirations are not reported as # errors # CLI flag: -ingester.circuit-breaker.push-timeout - [push_timeout: | default = 0s] + [push_timeout: | default = 2s] ``` ### querier diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index 765c637e511..a1f7b78f9fe 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -23,144 +23,122 @@ import ( type testCtxKey string const ( - resultSuccess = "success" - resultError = "error" - resultOpen = "circuit_breaker_open" - testDelayKey testCtxKey = "test-delay" - - circuitBreakerCurrentStateGaugeName = "cortex_ingester_circuit_breaker_current_state" - circuitBreakerCurrentStateGaugeHelp = "Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name." - circuitBreakerCurrentStateGaugeLabel = "state" + resultSuccess = "success" + resultError = "error" + resultOpen = "circuit_breaker_open" + defaultPushTimeout = 2 * time.Second + testDelayKey testCtxKey = "test-delay" ) type circuitBreakerMetrics struct { - circuitBreakerOpenStateGauge prometheus.GaugeFunc - circuitBreakerHalfOpenStateGauge prometheus.GaugeFunc - circuitBreakerClosedStateGauge prometheus.GaugeFunc - circuitBreakerTransitions *prometheus.CounterVec circuitBreakerResults *prometheus.CounterVec } func newCircuitBreakerMetrics(r prometheus.Registerer, currentStateFn func() circuitbreaker.State) *circuitBreakerMetrics { - return &circuitBreakerMetrics{ - circuitBreakerOpenStateGauge: promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ - Name: circuitBreakerCurrentStateGaugeName, - Help: circuitBreakerCurrentStateGaugeHelp, - ConstLabels: map[string]string{circuitBreakerCurrentStateGaugeLabel: circuitbreaker.OpenState.String()}, - }, func() float64 { - if currentStateFn() == circuitbreaker.OpenState { - return 1 - } - return 0 - }), - circuitBreakerHalfOpenStateGauge: promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ - Name: circuitBreakerCurrentStateGaugeName, - Help: circuitBreakerCurrentStateGaugeHelp, - ConstLabels: map[string]string{circuitBreakerCurrentStateGaugeLabel: circuitbreaker.HalfOpenState.String()}, - }, func() float64 { - if currentStateFn() == circuitbreaker.HalfOpenState { - return 1 - } - return 0 - }), - circuitBreakerClosedStateGauge: promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ - Name: circuitBreakerCurrentStateGaugeName, - Help: circuitBreakerCurrentStateGaugeHelp, - ConstLabels: map[string]string{circuitBreakerCurrentStateGaugeLabel: circuitbreaker.ClosedState.String()}, - }, func() float64 { - if currentStateFn() == circuitbreaker.ClosedState { - return 1 - } - return 0 - }), + cbMetrics := &circuitBreakerMetrics{ circuitBreakerTransitions: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_ingester_circuit_breaker_transitions_total", Help: "Number of times the circuit breaker has entered a state.", - }, []string{"ingester", "state"}), + }, []string{"state"}), circuitBreakerResults: promauto.With(r).NewCounterVec(prometheus.CounterOpts{ Name: "cortex_ingester_circuit_breaker_results_total", Help: "Results of executing requests via the circuit breaker.", - }, []string{"ingester", "result"}), + }, []string{"result"}), + } + circuitBreakerCurrentStateGaugeFn := func(state circuitbreaker.State) prometheus.GaugeFunc { + return promauto.With(r).NewGaugeFunc(prometheus.GaugeOpts{ + Name: "cortex_ingester_circuit_breaker_current_state", + Help: "Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name.", + ConstLabels: map[string]string{"state": state.String()}, + }, func() float64 { + if currentStateFn() == state { + return 1 + } + return 0 + }) + } + for _, s := range []circuitbreaker.State{circuitbreaker.OpenState, circuitbreaker.HalfOpenState, circuitbreaker.ClosedState} { + circuitBreakerCurrentStateGaugeFn(s) + // We initialize all possible states for the circuitBreakerTransitions metrics + cbMetrics.circuitBreakerTransitions.WithLabelValues(s.String()) + } + for _, r := range []string{resultSuccess, resultError, resultOpen} { + // We initialize all possible results for the circuitBreakerResults metrics + cbMetrics.circuitBreakerResults.WithLabelValues(r) } + return cbMetrics } type CircuitBreakerConfig struct { - Enabled bool `yaml:"enabled" category:"experimental"` - FailureThreshold uint `yaml:"failure_threshold" category:"experimental"` - FailureExecutionThreshold uint `yaml:"failure_execution_threshold" category:"experimental"` - ThresholdingPeriod time.Duration `yaml:"thresholding_period" category:"experimental"` - CooldownPeriod time.Duration `yaml:"cooldown_period" category:"experimental"` - InitialDelay time.Duration `yaml:"initial_delay" category:"experimental"` - PushTimeout time.Duration `yaml:"push_timeout" category:"experiment"` - testModeEnabled bool `yaml:"-"` + Enabled bool `yaml:"enabled" category:"experimental"` + FailureThresholdPercentage uint `yaml:"failure_threshold_percentage" category:"experimental"` + FailureExecutionThreshold uint `yaml:"failure_execution_threshold" category:"experimental"` + ThresholdingPeriod time.Duration `yaml:"thresholding_period" category:"experimental"` + CooldownPeriod time.Duration `yaml:"cooldown_period" category:"experimental"` + InitialDelay time.Duration `yaml:"initial_delay" category:"experimental"` + PushTimeout time.Duration `yaml:"push_timeout" category:"experiment"` + testModeEnabled bool `yaml:"-"` } func (cfg *CircuitBreakerConfig) RegisterFlags(f *flag.FlagSet) { prefix := "ingester.circuit-breaker." f.BoolVar(&cfg.Enabled, prefix+"enabled", false, "Enable circuit breaking when making requests to ingesters") - f.UintVar(&cfg.FailureThreshold, prefix+"failure-threshold", 10, "Max percentage of requests that can fail over period before the circuit breaker opens") + f.UintVar(&cfg.FailureThresholdPercentage, prefix+"failure-threshold-percentage", 10, "Max percentage of requests that can fail over period before the circuit breaker opens") f.UintVar(&cfg.FailureExecutionThreshold, prefix+"failure-execution-threshold", 100, "How many requests must have been executed in period for the circuit breaker to be eligible to open for the rate of failures") f.DurationVar(&cfg.ThresholdingPeriod, prefix+"thresholding-period", time.Minute, "Moving window of time that the percentage of failed requests is computed over") f.DurationVar(&cfg.CooldownPeriod, prefix+"cooldown-period", 10*time.Second, "How long the circuit breaker will stay in the open state before allowing some requests") f.DurationVar(&cfg.InitialDelay, prefix+"initial-delay", 0, "How long the circuit breaker should wait between creation and starting up. During that time both failures and successes will not be counted.") - f.DurationVar(&cfg.PushTimeout, prefix+"push-timeout", 0, "How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors") -} - -func (cfg *CircuitBreakerConfig) Validate() error { - return nil + f.DurationVar(&cfg.PushTimeout, prefix+"push-timeout", defaultPushTimeout, "How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors") } type circuitBreaker struct { - circuitbreaker.CircuitBreaker[any] - cfg CircuitBreakerConfig - ingesterID string - logger log.Logger - metrics *circuitBreakerMetrics - startTime time.Time + cfg CircuitBreakerConfig + logger log.Logger + metrics *circuitBreakerMetrics + startTime time.Time + cb circuitbreaker.CircuitBreaker[any] } -func newCircuitBreaker(cfg CircuitBreakerConfig, ingesterID string, logger log.Logger, registerer prometheus.Registerer) *circuitBreaker { +func newCircuitBreaker(cfg CircuitBreakerConfig, logger log.Logger, registerer prometheus.Registerer) *circuitBreaker { cb := circuitBreaker{ - cfg: cfg, - ingesterID: ingesterID, - logger: logger, - startTime: time.Now().Add(cfg.InitialDelay), + cfg: cfg, + logger: logger, + startTime: time.Now().Add(cfg.InitialDelay), } - cb.metrics = newCircuitBreakerMetrics(registerer, cb.State) - // Initialize each of the known labels for circuit breaker metrics for this particular ingester. - transitionOpen := cb.metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.OpenState.String()) - transitionHalfOpen := cb.metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.HalfOpenState.String()) - transitionClosed := cb.metrics.circuitBreakerTransitions.WithLabelValues(ingesterID, circuitbreaker.ClosedState.String()) + circuitBreakerTransitionsCounterFn := func(metrics *circuitBreakerMetrics, state circuitbreaker.State) prometheus.Counter { + return metrics.circuitBreakerTransitions.WithLabelValues(state.String()) + } cbBuilder := circuitbreaker.Builder[any](). - WithFailureThreshold(cfg.FailureThreshold). + WithFailureThreshold(cfg.FailureThresholdPercentage). WithDelay(cfg.CooldownPeriod). OnClose(func(event circuitbreaker.StateChangedEvent) { - transitionClosed.Inc() - level.Info(logger).Log("msg", "circuit breaker is closed", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) + circuitBreakerTransitionsCounterFn(cb.metrics, circuitbreaker.ClosedState).Inc() + level.Info(logger).Log("msg", "circuit breaker is closed", "previous", event.OldState, "current", event.NewState) }). OnOpen(func(event circuitbreaker.StateChangedEvent) { - transitionOpen.Inc() - level.Warn(logger).Log("msg", "circuit breaker is open", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) + circuitBreakerTransitionsCounterFn(cb.metrics, circuitbreaker.OpenState).Inc() + level.Warn(logger).Log("msg", "circuit breaker is open", "previous", event.OldState, "current", event.NewState) }). OnHalfOpen(func(event circuitbreaker.StateChangedEvent) { - transitionHalfOpen.Inc() - level.Info(logger).Log("msg", "circuit breaker is half-open", "ingester", ingesterID, "previous", event.OldState, "current", event.NewState) + circuitBreakerTransitionsCounterFn(cb.metrics, circuitbreaker.HalfOpenState).Inc() + level.Info(logger).Log("msg", "circuit breaker is half-open", "previous", event.OldState, "current", event.NewState) }). HandleIf(func(_ any, err error) bool { return isFailure(err) }) if cfg.testModeEnabled { // In case of testing purposes, we initialize the circuit breaker with count based failure thresholding, // since it is more deterministic, and therefore it is easier to predict the outcome. - cbBuilder = cbBuilder.WithFailureThreshold(cfg.FailureThreshold) + cbBuilder = cbBuilder.WithFailureThreshold(cfg.FailureThresholdPercentage) } else { // In case of production code, we prefer time based failure thresholding. - cbBuilder = cbBuilder.WithFailureRateThreshold(cfg.FailureThreshold, cfg.FailureExecutionThreshold, cfg.ThresholdingPeriod) + cbBuilder = cbBuilder.WithFailureRateThreshold(cfg.FailureThresholdPercentage, cfg.FailureExecutionThreshold, cfg.ThresholdingPeriod) } - cb.CircuitBreaker = cbBuilder.Build() + cb.cb = cbBuilder.Build() + cb.metrics = newCircuitBreakerMetrics(registerer, cb.cb.State) return &cb } @@ -190,10 +168,6 @@ func isFailure(err error) bool { return false } -func (cb *circuitBreaker) State() circuitbreaker.State { - return cb.CircuitBreaker.State() -} - func (cb *circuitBreaker) isActive() bool { if cb == nil { return false @@ -201,49 +175,55 @@ func (cb *circuitBreaker) isActive() bool { return cb.startTime.Before(time.Now()) } -func (cb *circuitBreaker) tryAcquirePermit() error { +// tryAcquirePermit tries to acquire a permit to use the circuit breaker and returns whether a permit was acquired. +// If the circuit breaker is not yet active, a status false and no error are returned. +// If it was not possible to acquire a permit, this means that the circuit breaker is open. In this case, a status +// false and an circuitBreakerOpenError are returned. +// If it was possible to acquire a permit, a status true and no error are returned. In this case, the permission +// will be automatically released once when a result is recorded by calling resultRecorded. +func (cb *circuitBreaker) tryAcquirePermit() (bool, error) { if !cb.isActive() { - return nil + return false, nil } - if !cb.CircuitBreaker.TryAcquirePermit() { - cb.metrics.circuitBreakerResults.WithLabelValues(cb.ingesterID, resultOpen).Inc() - return middleware.DoNotLogError{Err: newCircuitBreakerOpenError(cb.RemainingDelay())} + if !cb.cb.TryAcquirePermit() { + cb.metrics.circuitBreakerResults.WithLabelValues(resultOpen).Inc() + return false, middleware.DoNotLogError{Err: newCircuitBreakerOpenError(cb.cb.RemainingDelay())} } - return nil + return true, nil } -func (cb *circuitBreaker) recordSuccess() { +func (cb *circuitBreaker) recordResult(err error) { if !cb.isActive() { return } - cb.CircuitBreaker.RecordSuccess() - cb.metrics.circuitBreakerResults.WithLabelValues(cb.ingesterID, resultSuccess).Inc() -} - -func (cb *circuitBreaker) recordError(err error) { - if !cb.isActive() { - return + if err != nil && isFailure(err) { + cb.cb.RecordError(err) + cb.metrics.circuitBreakerResults.WithLabelValues(resultError).Inc() + } else { + cb.metrics.circuitBreakerResults.WithLabelValues(resultSuccess).Inc() + cb.cb.RecordSuccess() } - cb.CircuitBreaker.RecordError(err) - cb.metrics.circuitBreakerResults.WithLabelValues(cb.ingesterID, resultError).Inc() } -func (cb *circuitBreaker) finishPushRequest(ctx context.Context, duration time.Duration, err error) error { +// finishPushRequest records the result of a push request with this circuit breaker. +// If the circuit breaker is not active, finishPushRequest does nothing. +// If the passed duration of a push request exceeds the configured maximal push request duration, +// a context.DeadlineExceeded error is recorded and returned, independently of the passed pushErr, +// the actual error that occurred during the push request. +// Otherwise, the given pushErr is recorded and returned. +// The returned error is needed only for testing purposes. +func (cb *circuitBreaker) finishPushRequest(ctx context.Context, duration time.Duration, pushErr error) error { if !cb.isActive() { return nil } if cb.cfg.testModeEnabled { - if initialDelay, ok := ctx.Value(testDelayKey).(time.Duration); ok { - duration += initialDelay + if testDelay, ok := ctx.Value(testDelayKey).(time.Duration); ok { + duration += testDelay } } if cb.cfg.PushTimeout < duration { - err = context.DeadlineExceeded - } - if err == nil { - cb.recordSuccess() - } else { - cb.recordError(err) + pushErr = context.DeadlineExceeded } - return err + cb.recordResult(pushErr) + return pushErr } diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go index d6b90613695..4eb6e99e90e 100644 --- a/pkg/ingester/circuitbreaker_test.go +++ b/pkg/ingester/circuitbreaker_test.go @@ -78,7 +78,7 @@ func TestCircuitBreaker_IsActive(t *testing.T) { registry := prometheus.NewRegistry() cfg := CircuitBreakerConfig{Enabled: true, InitialDelay: 10 * time.Millisecond} - cb = newCircuitBreaker(cfg, "ingester", log.NewNopLogger(), registry) + cb = newCircuitBreaker(cfg, log.NewNopLogger(), registry) // When InitialDelay is set, circuit breaker is not immediately active. require.False(t, cb.isActive()) @@ -93,28 +93,39 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { "cortex_ingester_circuit_breaker_transitions_total", "cortex_ingester_circuit_breaker_current_state", } - cfg := CircuitBreakerConfig{Enabled: true, CooldownPeriod: 10 * time.Second} testCases := map[string]struct { + initialDelay time.Duration circuitBreakerSetup func(*circuitBreaker) + expectedStatus bool expectedCircuitBreakerError bool expectedMetrics string }{ - "if circuit breaker closed, no error returned": { - circuitBreakerSetup: func(cb *circuitBreaker) { cb.Close() }, + "if circuit breaker is not active, status false and no error are returned": { + initialDelay: 1 * time.Minute, + circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.Close() }, + expectedStatus: false, + expectedCircuitBreakerError: false, + }, + "if circuit breaker closed, status true and no error are returned": { + circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.Close() }, + expectedStatus: true, expectedCircuitBreakerError: false, }, - "if circuit breaker open, a circuitBreakerErrorOpen is returned": { - circuitBreakerSetup: func(cb *circuitBreaker) { cb.Open() }, + "if circuit breaker open, status false and a circuitBreakerErrorOpen are returned": { + circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.Open() }, + expectedStatus: false, expectedCircuitBreakerError: true, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="circuit_breaker_open"} 1 + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 1 # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. # TYPE cortex_ingester_circuit_breaker_transitions_total counter - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="closed"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="half-open"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="open"} 1 + cortex_ingester_circuit_breaker_transitions_total{state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="open"} 1 # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. # TYPE cortex_ingester_circuit_breaker_current_state gauge cortex_ingester_circuit_breaker_current_state{state="open"} 1 @@ -122,18 +133,21 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { cortex_ingester_circuit_breaker_current_state{state="closed"} 0 `, }, - "if circuit breaker half-open, a circuitBreakerErrorOpen is returned": { - circuitBreakerSetup: func(cb *circuitBreaker) { cb.HalfOpen() }, + "if circuit breaker half-open, status false and a circuitBreakerErrorOpen are returned": { + circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.HalfOpen() }, + expectedStatus: false, expectedCircuitBreakerError: true, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="circuit_breaker_open"} 1 + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 1 # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. # TYPE cortex_ingester_circuit_breaker_transitions_total counter - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="closed"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="half-open"} 1 - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester",state="open"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="half-open"} 1 + cortex_ingester_circuit_breaker_transitions_total{state="open"} 0 # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. # TYPE cortex_ingester_circuit_breaker_current_state gauge cortex_ingester_circuit_breaker_current_state{state="open"} 0 @@ -146,9 +160,11 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { registry := prometheus.NewRegistry() - cb := newCircuitBreaker(cfg, "ingester", log.NewNopLogger(), registry) + cfg := CircuitBreakerConfig{Enabled: true, CooldownPeriod: 10 * time.Second, InitialDelay: testCase.initialDelay} + cb := newCircuitBreaker(cfg, log.NewNopLogger(), registry) testCase.circuitBreakerSetup(cb) - err := cb.tryAcquirePermit() + status, err := cb.tryAcquirePermit() + require.Equal(t, testCase.expectedStatus, status) if testCase.expectedCircuitBreakerError { checkCircuitBreakerOpenErr(ctx, err, t) assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(testCase.expectedMetrics), metricNames...)) @@ -159,88 +175,152 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { } } -func TestCircuitBreaker_RecordSuccess(t *testing.T) { - registry := prometheus.NewRegistry() +func TestCircuitBreaker_RecordResult(t *testing.T) { metricNames := []string{ "cortex_ingester_circuit_breaker_results_total", } - cfg := CircuitBreakerConfig{Enabled: true, CooldownPeriod: 10 * time.Second} - cb := newCircuitBreaker(cfg, "ingester", log.NewNopLogger(), registry) - cb.recordSuccess() - expectedMetrics := ` - # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. - # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="success"} 1 - ` - assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), metricNames...)) -} - -func TestCircuitBreaker_RecordError(t *testing.T) { - registry := prometheus.NewRegistry() - metricNames := []string{ - "cortex_ingester_circuit_breaker_results_total", + testCases := map[string]struct { + err error + expectedMetrics string + }{ + "successful execution records a success": { + err: nil, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{result="success"} 1 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 + `, + }, + "erroneous execution not passing the failure check records a success": { + err: context.Canceled, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{result="success"} 1 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 + `, + }, + "erroneous execution passing the failure check records an error": { + err: context.DeadlineExceeded, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 1 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 + `, + }, } cfg := CircuitBreakerConfig{Enabled: true, CooldownPeriod: 10 * time.Second} - cb := newCircuitBreaker(cfg, "ingester", log.NewNopLogger(), registry) - cb.recordError(context.DeadlineExceeded) - expectedMetrics := ` - # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. - # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="error"} 1 - ` - assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(expectedMetrics), metricNames...)) + for testName, testCase := range testCases { + t.Run(testName, func(t *testing.T) { + registry := prometheus.NewRegistry() + cb := newCircuitBreaker(cfg, log.NewNopLogger(), registry) + cb.recordResult(testCase.err) + assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(testCase.expectedMetrics), metricNames...)) + }) + } } func TestCircuitBreaker_FinishPushRequest(t *testing.T) { metricNames := []string{ "cortex_ingester_circuit_breaker_results_total", } - cfg := CircuitBreakerConfig{ - Enabled: true, - PushTimeout: 2 * time.Second, - } testCases := map[string]struct { - delay time.Duration - err error - expectedErr error - expectedMetrics string + pushRequestDuration time.Duration + initialDelay time.Duration + err error + expectedErr error + expectedMetrics string }{ - "delay lower than PushTimeout and no input err give success": { - delay: 1 * time.Second, - err: nil, + "with a permit acquired, pushRequestDuration lower than PushTimeout and no input error, finishPushRequest gives success": { + pushRequestDuration: 1 * time.Second, + err: nil, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="success"} 1 + cortex_ingester_circuit_breaker_results_total{result="success"} 1 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 `, }, - "delay higher than PushTimeout and no input error give context deadline exceeded error": { - delay: 3 * time.Second, - err: nil, - expectedErr: context.DeadlineExceeded, + "with circuit breaker not active, pushRequestDuration lower than PushTimeout and no input error, finishPushRequest does nothing": { + pushRequestDuration: 1 * time.Second, + initialDelay: 1 * time.Minute, + err: nil, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="error"} 1 + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 `, }, - "delay higher than PushTimeout and an input error different from context deadline exceeded give context deadline exceeded error": { - delay: 3 * time.Second, - err: newInstanceLimitReachedError("error"), - expectedErr: context.DeadlineExceeded, + "with circuit breaker active, pushRequestDuration higher than PushTimeout and no input error, finishPushRequest gives context deadline exceeded error": { + pushRequestDuration: 3 * time.Second, + err: nil, + expectedErr: context.DeadlineExceeded, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="ingester",result="error"} 1 + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 1 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 + `, + }, + "with circuit breaker not active, pushRequestDuration higher than PushTimeout and no input error, finishPushRequest does nothing": { + pushRequestDuration: 3 * time.Second, + initialDelay: 1 * time.Minute, + err: nil, + expectedErr: nil, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 + `, + }, + "with circuit breaker active, pushRequestDuration higher than PushTimeout and an input error different from context deadline exceeded, finishPushRequest gives context deadline exceeded error": { + pushRequestDuration: 3 * time.Second, + err: newInstanceLimitReachedError("error"), + expectedErr: context.DeadlineExceeded, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 1 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 + `, + }, + "with circuit breaker not active, pushRequestDuration higher than PushTimeout and an input error different from context deadline exceeded, finishPushRequest does nothing": { + pushRequestDuration: 3 * time.Second, + initialDelay: 1 * time.Minute, + err: newInstanceLimitReachedError("error"), + expectedErr: nil, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 `, }, } for testName, testCase := range testCases { - registry := prometheus.NewRegistry() - ctx := context.Background() - cb := newCircuitBreaker(cfg, "ingester", log.NewNopLogger(), registry) t.Run(testName, func(t *testing.T) { - err := cb.finishPushRequest(ctx, testCase.delay, testCase.err) + registry := prometheus.NewRegistry() + ctx := context.Background() + cfg := CircuitBreakerConfig{ + Enabled: true, + InitialDelay: testCase.initialDelay, + PushTimeout: 2 * time.Second, + } + cb := newCircuitBreaker(cfg, log.NewNopLogger(), registry) + err := cb.finishPushRequest(ctx, testCase.pushRequestDuration, testCase.err) if testCase.expectedErr == nil { require.NoError(t, err) } else { @@ -295,12 +375,12 @@ func TestIngester_PushToStorage_CircuitBreaker(t *testing.T) { initialDelay = 200 * time.Millisecond } cfg.CircuitBreakerConfig = CircuitBreakerConfig{ - Enabled: true, - FailureThreshold: uint(failureThreshold), - CooldownPeriod: 10 * time.Second, - InitialDelay: initialDelay, - PushTimeout: pushTimeout, - testModeEnabled: true, + Enabled: true, + FailureThresholdPercentage: uint(failureThreshold), + CooldownPeriod: 10 * time.Second, + InitialDelay: initialDelay, + PushTimeout: pushTimeout, + testModeEnabled: true, } overrides, err := validation.NewOverrides(defaultLimitsTestConfig(), nil) @@ -376,11 +456,16 @@ func TestIngester_PushToStorage_CircuitBreaker(t *testing.T) { var expectedMetrics string if initialDelayEnabled { expectedMetrics = ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="success"} 0 # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. # TYPE cortex_ingester_circuit_breaker_transitions_total counter - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="closed"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="half-open"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="open"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="open"} 0 # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. # TYPE cortex_ingester_circuit_breaker_current_state gauge cortex_ingester_circuit_breaker_current_state{state="open"} 0 @@ -391,14 +476,14 @@ func TestIngester_PushToStorage_CircuitBreaker(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="ingester-zone-a-0",result="circuit_breaker_open"} 2 - cortex_ingester_circuit_breaker_results_total{ingester="ingester-zone-a-0",result="error"} 2 - cortex_ingester_circuit_breaker_results_total{ingester="ingester-zone-a-0",result="success"} 1 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 2 + cortex_ingester_circuit_breaker_results_total{result="error"} 2 + cortex_ingester_circuit_breaker_results_total{result="success"} 1 # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. # TYPE cortex_ingester_circuit_breaker_transitions_total counter - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="closed"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="half-open"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="ingester-zone-a-0",state="open"} 1 + cortex_ingester_circuit_breaker_transitions_total{state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="open"} 1 # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. # TYPE cortex_ingester_circuit_breaker_current_state gauge cortex_ingester_circuit_breaker_current_state{state="open"} 1 @@ -441,12 +526,12 @@ func TestIngester_StartPushRequest_CircuitBreakerOpen(t *testing.T) { ctx := user.InjectOrgID(context.Background(), "test") // If i's circuit breaker is closed, StartPushRequest is successful. - i.circuitBreaker.Close() + i.circuitBreaker.cb.Close() _, err = i.StartPushRequest(ctx, 0) require.NoError(t, err) // If i's circuit breaker is open, StartPushRequest returns a circuitBreakerOpenError. - i.circuitBreaker.Open() + i.circuitBreaker.cb.Open() _, err = i.StartPushRequest(ctx, 0) require.Error(t, err) require.ErrorAs(t, err, &circuitBreakerOpenError{}) @@ -459,12 +544,14 @@ func TestIngester_StartPushRequest_CircuitBreakerOpen(t *testing.T) { expectedMetrics := ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="circuit_breaker_open"} 1 + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 1 # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. # TYPE cortex_ingester_circuit_breaker_transitions_total counter - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 1 + cortex_ingester_circuit_breaker_transitions_total{state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="open"} 1 # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. # TYPE cortex_ingester_circuit_breaker_current_state gauge cortex_ingester_circuit_breaker_current_state{state="open"} 1 @@ -479,35 +566,81 @@ func TestIngester_FinishPushRequest(t *testing.T) { "cortex_ingester_circuit_breaker_results_total", } testCases := map[string]struct { - delay time.Duration - err error - expectedMetrics string + pushRequestDuration time.Duration + acquiredCircuitBreakerPermit bool + err error + expectedMetrics string }{ - "delay lower than PushTimeout and no input err give success": { - delay: 1 * time.Second, - err: nil, + "with a permit acquired, pushRequestDuration lower than PushTimeout and no input err, finishPushRequest gives success": { + pushRequestDuration: 1 * time.Second, + acquiredCircuitBreakerPermit: true, + err: nil, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="success"} 1 + cortex_ingester_circuit_breaker_results_total{result="success"} 1 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 `, }, - "delay higher than PushTimeout and no input error give context deadline exceeded error": { - delay: 3 * time.Second, - err: nil, + "when a permit not acquired, pushRequestDuration lower than PushTimeout and no input err, finishPusRequest does nothing": { + pushRequestDuration: 1 * time.Second, + acquiredCircuitBreakerPermit: false, + err: nil, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 1 + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 `, }, - "delay higher than PushTimeout and an input error different from context deadline exceeded give context deadline exceeded error": { - delay: 3 * time.Second, - err: newInstanceLimitReachedError("error"), + "with a permit acquired, pushRequestDuration higher than PushTimeout and no input error, finishPushRequest gives context deadline exceeded error": { + pushRequestDuration: 3 * time.Second, + acquiredCircuitBreakerPermit: true, + err: nil, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 1 + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 1 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 + `, + }, + "with a permit not acquired, pushRequestDuration higher than PushTimeout and no input error, finishPushRequest does nothing": { + pushRequestDuration: 3 * time.Second, + acquiredCircuitBreakerPermit: false, + err: nil, + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 + `, + }, + "with a permit acquired, pushRequestDuration higher than PushTimeout and an input error different from context deadline exceeded, finishPushRequest gives context deadline exceeded error": { + pushRequestDuration: 3 * time.Second, + acquiredCircuitBreakerPermit: true, + err: newInstanceLimitReachedError("error"), + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 1 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 + `, + }, + "with a permit not acquired, pushRequestDuration higher than PushTimeout and an input error different from context deadline exceeded, finishPushRequest does nothing": { + pushRequestDuration: 3 * time.Second, + acquiredCircuitBreakerPermit: false, + err: newInstanceLimitReachedError("error"), + expectedMetrics: ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 `, }, } @@ -533,8 +666,9 @@ func TestIngester_FinishPushRequest(t *testing.T) { ctx := user.InjectOrgID(context.Background(), "test") st := &pushRequestState{ - requestDuration: testCase.delay, - err: testCase.err, + requestDuration: testCase.pushRequestDuration, + acquiredCircuitBreakerPermit: testCase.acquiredCircuitBreakerPermit, + err: testCase.err, } ctx = context.WithValue(ctx, pushReqCtxKey, st) @@ -567,12 +701,12 @@ func TestIngester_Push_CircuitBreaker_DeadlineExceeded(t *testing.T) { initialDelay = 200 * time.Millisecond } cfg.CircuitBreakerConfig = CircuitBreakerConfig{ - Enabled: true, - FailureThreshold: uint(failureThreshold), - CooldownPeriod: 10 * time.Second, - InitialDelay: initialDelay, - PushTimeout: pushTimeout, - testModeEnabled: true, + Enabled: true, + FailureThresholdPercentage: uint(failureThreshold), + CooldownPeriod: 10 * time.Second, + InitialDelay: initialDelay, + PushTimeout: pushTimeout, + testModeEnabled: true, } i, err := prepareIngesterWithBlocksStorage(t, cfg, nil, registry) @@ -632,7 +766,7 @@ func TestIngester_Push_CircuitBreaker_DeadlineExceeded(t *testing.T) { // less than failureThreshold deadline exceeded errors, it is still // closed. require.NoError(t, err) - require.Equal(t, circuitbreaker.ClosedState, i.circuitBreaker.State()) + require.Equal(t, circuitbreaker.ClosedState, i.circuitBreaker.cb.State()) st, ok := ctx.Value(pushReqCtxKey).(*pushRequestState) require.True(t, ok) require.Equal(t, int64(req.Size()), st.requestSize) @@ -640,7 +774,7 @@ func TestIngester_Push_CircuitBreaker_DeadlineExceeded(t *testing.T) { require.NoError(t, err) i.FinishPushRequest(ctx) } else { - require.Equal(t, circuitbreaker.OpenState, i.circuitBreaker.State()) + require.Equal(t, circuitbreaker.OpenState, i.circuitBreaker.cb.State()) require.Nil(t, ctx) checkCircuitBreakerOpenErr(ctx, err, t) } @@ -651,11 +785,16 @@ func TestIngester_Push_CircuitBreaker_DeadlineExceeded(t *testing.T) { var expectedMetrics string if initialDelayEnabled { expectedMetrics = ` + # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. + # TYPE cortex_ingester_circuit_breaker_results_total counter + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 0 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 0 # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. # TYPE cortex_ingester_circuit_breaker_transitions_total counter - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="open"} 0 # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. # TYPE cortex_ingester_circuit_breaker_current_state gauge cortex_ingester_circuit_breaker_current_state{state="open"} 0 @@ -666,13 +805,14 @@ func TestIngester_Push_CircuitBreaker_DeadlineExceeded(t *testing.T) { expectedMetrics = ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. # TYPE cortex_ingester_circuit_breaker_results_total counter - cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="circuit_breaker_open"} 2 - cortex_ingester_circuit_breaker_results_total{ingester="localhost",result="error"} 2 + cortex_ingester_circuit_breaker_results_total{result="success"} 0 + cortex_ingester_circuit_breaker_results_total{result="error"} 2 + cortex_ingester_circuit_breaker_results_total{result="circuit_breaker_open"} 2 # HELP cortex_ingester_circuit_breaker_transitions_total Number of times the circuit breaker has entered a state. # TYPE cortex_ingester_circuit_breaker_transitions_total counter - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="closed"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="half-open"} 0 - cortex_ingester_circuit_breaker_transitions_total{ingester="localhost",state="open"} 1 + cortex_ingester_circuit_breaker_transitions_total{state="closed"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="half-open"} 0 + cortex_ingester_circuit_breaker_transitions_total{state="open"} 1 # HELP cortex_ingester_circuit_breaker_current_state Boolean set to 1 whenever the circuit breaker is in a state corresponding to the label name. # TYPE cortex_ingester_circuit_breaker_current_state gauge cortex_ingester_circuit_breaker_current_state{state="open"} 1 diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 3a265a78237..e3a0aa8f07a 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -261,13 +261,7 @@ func (cfg *Config) Validate(logger log.Logger) error { util.WarnDeprecatedConfig(deprecatedReturnOnlyGRPCErrorsFlag, logger) } - err := cfg.IngesterRing.Validate() - - if err != nil { - return err - } - - return cfg.CircuitBreakerConfig.Validate() + return cfg.IngesterRing.Validate() } func (cfg *Config) getIgnoreSeriesLimitForMetricNamesMap() map[string]struct{} { @@ -405,7 +399,7 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.activeGroups = activeGroupsCleanupService if cfg.CircuitBreakerConfig.Enabled { - i.circuitBreaker = newCircuitBreaker(cfg.CircuitBreakerConfig, cfg.IngesterRing.InstanceID, logger, registerer) + i.circuitBreaker = newCircuitBreaker(cfg.CircuitBreakerConfig, logger, registerer) } if registerer != nil { @@ -969,15 +963,16 @@ type ctxKey int var pushReqCtxKey ctxKey = 1 type pushRequestState struct { - requestSize int64 - requestDuration time.Duration - err error + requestSize int64 + requestDuration time.Duration + acquiredCircuitBreakerPermit bool + err error } // StartPushRequest checks if ingester can start push request, and increments relevant counters. // If new push request cannot be started, errors convertible to gRPC status code are returned, and metrics are updated. func (i *Ingester) StartPushRequest(ctx context.Context, reqSize int64) (context.Context, error) { - ctx, _, err := i.startPushRequest(ctx, reqSize) + ctx, _, _, err := i.startPushRequest(ctx, reqSize) return ctx, err } @@ -986,7 +981,7 @@ func (i *Ingester) FinishPushRequest(ctx context.Context) { if !ok { return } - i.finishPushRequest(ctx, st.requestSize, st.requestDuration, st.err) + i.finishPushRequest(ctx, st.requestSize, st.requestDuration, st.acquiredCircuitBreakerPermit, st.err) } // This method can be called in two ways: 1. Ingester.PushWithCleanup, or 2. Ingester.StartPushRequest via gRPC server's method limiter. @@ -995,26 +990,30 @@ func (i *Ingester) FinishPushRequest(ctx context.Context) { // In the second case, returned errors will not be logged, because request will not reach any middleware. // // The shouldFinish flag tells if the caller must call finish on this request. If not, there is already someone in the call stack who will do that. -func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ context.Context, shouldFinish bool, err error) { +// The acquiredCircuitBreakerPermit flag tells to the caller if this call has acquired a circuit breaker permit, +// and if the caller should take care of returning the permit back. +func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ context.Context, shouldFinish, acquiredCircuitBreakerPermit bool, err error) { if err := i.checkAvailableForPush(); err != nil { - return nil, false, err + return nil, false, false, err } if _, ok := ctx.Value(pushReqCtxKey).(*pushRequestState); ok { // If state is already in context, this means we already passed through StartPushRequest for this request. - return ctx, false, nil + return ctx, false, false, nil } // We try to acquire a permit from the circuit breaker. // If it is not possible, it is because the circuit breaker is open, and a circuitBreakerOpenError is returned. // If it is possible, a permit has to be released by recording either a success or a failure with the circuit // breaker. This is done in finishPushRequest(). - if err := i.circuitBreaker.tryAcquirePermit(); err != nil { - return nil, false, err + acquiredCircuitBreakerPermit, err = i.circuitBreaker.tryAcquirePermit() + if err != nil { + return nil, false, acquiredCircuitBreakerPermit, err } st := &pushRequestState{ - requestSize: reqSize, + requestSize: reqSize, + acquiredCircuitBreakerPermit: acquiredCircuitBreakerPermit, } ctx = context.WithValue(ctx, pushReqCtxKey, st) @@ -1036,7 +1035,7 @@ func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ conte // it is because one the per-instance limits has been hit. In this case, the // corresponding error has to be passed to finishPushRequest() in order to // record the failure with the circuit breaker. - i.finishPushRequest(ctx, reqSize, 0, err) + i.finishPushRequest(ctx, reqSize, 0, acquiredCircuitBreakerPermit, err) } }() @@ -1044,34 +1043,36 @@ func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ conte if il != nil { if il.MaxInflightPushRequests > 0 && inflight > il.MaxInflightPushRequests { i.metrics.rejected.WithLabelValues(reasonIngesterMaxInflightPushRequests).Inc() - return nil, false, errMaxInflightRequestsReached + return nil, false, acquiredCircuitBreakerPermit, errMaxInflightRequestsReached } if il.MaxInflightPushRequestsBytes > 0 { if (rejectEqualInflightBytes && inflightBytes >= il.MaxInflightPushRequestsBytes) || inflightBytes > il.MaxInflightPushRequestsBytes { i.metrics.rejected.WithLabelValues(reasonIngesterMaxInflightPushRequestsBytes).Inc() - return nil, false, errMaxInflightRequestsBytesReached + return nil, false, acquiredCircuitBreakerPermit, errMaxInflightRequestsBytesReached } } if il.MaxIngestionRate > 0 { if rate := i.ingestionRate.Rate(); rate >= il.MaxIngestionRate { i.metrics.rejected.WithLabelValues(reasonIngesterMaxIngestionRate).Inc() - return nil, false, errMaxIngestionRateReached + return nil, false, acquiredCircuitBreakerPermit, errMaxIngestionRateReached } } } finishRequestInDefer = false - return ctx, true, nil + return ctx, true, acquiredCircuitBreakerPermit, nil } -func (i *Ingester) finishPushRequest(ctx context.Context, reqSize int64, duration time.Duration, err error) { +func (i *Ingester) finishPushRequest(ctx context.Context, reqSize int64, duration time.Duration, acquiredCircuitBreakerPermit bool, err error) { i.inflightPushRequests.Dec() if reqSize > 0 { i.inflightPushRequestsBytes.Sub(reqSize) } - _ = i.circuitBreaker.finishPushRequest(ctx, duration, err) + if acquiredCircuitBreakerPermit { + _ = i.circuitBreaker.finishPushRequest(ctx, duration, err) + } } // PushWithCleanup is the Push() implementation for blocks storage and takes a WriteRequest and adds it to the TSDB head. @@ -1086,28 +1087,27 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // startPushRequest handles this. if i.cfg.IngestStorageConfig.Enabled || !i.cfg.LimitInflightRequestsUsingGrpcMethodLimiter { reqSize := int64(req.Size()) - _, shouldFinish, startPushErr := i.startPushRequest(ctx, reqSize) + _, shouldFinish, acquiredCircuitBreakerPermit, startPushErr := i.startPushRequest(ctx, reqSize) if startPushErr != nil { return middleware.DoNotLogError{Err: startPushErr} } if shouldFinish { defer func() { - i.finishPushRequest(ctx, reqSize, time.Since(start), err) + i.finishPushRequest(ctx, reqSize, time.Since(start), acquiredCircuitBreakerPermit, err) }() } - } else { - defer func() { - // We enrich the pushRequestState contained in the context with this PushWithCleanUp() - // call duration, and a possible error it returns. These data are needed during a - // successive call to FinishPushRequest(). - if st, ok := ctx.Value(pushReqCtxKey).(*pushRequestState); ok { - st.requestDuration = time.Since(start) - st.err = err - ctx = context.WithValue(ctx, pushReqCtxKey, st) - } - }() } + defer func() { + // We enrich the pushRequestState contained in the context with this PushWithCleanUp() + // call duration, and a possible error it returns. These data are needed during a + // successive call to FinishPushRequest(). + if st, ok := ctx.Value(pushReqCtxKey).(*pushRequestState); ok { + st.requestDuration = time.Since(start) + st.err = err + } + }() + userID, err := tenant.TenantID(ctx) if err != nil { return err From 45a8fe52d1c03eae8a6ea2309d9b4c883251341c Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Sat, 1 Jun 2024 14:38:54 +0200 Subject: [PATCH 09/16] Fix additional review findings Signed-off-by: Yuri Nikolic --- pkg/ingester/circuitbreaker.go | 7 ++- pkg/ingester/circuitbreaker_test.go | 21 ++----- pkg/ingester/ingester.go | 86 +++++++++++++++++------------ 3 files changed, 59 insertions(+), 55 deletions(-) diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index a1f7b78f9fe..3ff6c992498 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -11,7 +11,6 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/dskit/grpcutil" - "github.com/grafana/dskit/middleware" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" @@ -175,6 +174,7 @@ func (cb *circuitBreaker) isActive() bool { return cb.startTime.Before(time.Now()) } +// TODO: fix the comment // tryAcquirePermit tries to acquire a permit to use the circuit breaker and returns whether a permit was acquired. // If the circuit breaker is not yet active, a status false and no error are returned. // If it was not possible to acquire a permit, this means that the circuit breaker is open. In this case, a status @@ -187,7 +187,7 @@ func (cb *circuitBreaker) tryAcquirePermit() (bool, error) { } if !cb.cb.TryAcquirePermit() { cb.metrics.circuitBreakerResults.WithLabelValues(resultOpen).Inc() - return false, middleware.DoNotLogError{Err: newCircuitBreakerOpenError(cb.cb.RemainingDelay())} + return false, newCircuitBreakerOpenError(cb.cb.RemainingDelay()) } return true, nil } @@ -197,7 +197,7 @@ func (cb *circuitBreaker) recordResult(err error) { return } if err != nil && isFailure(err) { - cb.cb.RecordError(err) + cb.cb.RecordFailure() cb.metrics.circuitBreakerResults.WithLabelValues(resultError).Inc() } else { cb.metrics.circuitBreakerResults.WithLabelValues(resultSuccess).Inc() @@ -205,6 +205,7 @@ func (cb *circuitBreaker) recordResult(err error) { } } +// TODO: fix the comment // finishPushRequest records the result of a push request with this circuit breaker. // If the circuit breaker is not active, finishPushRequest does nothing. // If the passed duration of a push request exceeds the configured maximal push request duration, diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go index 4eb6e99e90e..48721f2ae9c 100644 --- a/pkg/ingester/circuitbreaker_test.go +++ b/pkg/ingester/circuitbreaker_test.go @@ -12,7 +12,6 @@ import ( "github.com/failsafe-go/failsafe-go/circuitbreaker" "github.com/go-kit/log" "github.com/gogo/status" - "github.com/grafana/dskit/middleware" "github.com/grafana/dskit/services" "github.com/grafana/dskit/test" "github.com/grafana/dskit/user" @@ -156,7 +155,6 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { `, }, } - ctx := context.Background() for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { registry := prometheus.NewRegistry() @@ -166,7 +164,7 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { status, err := cb.tryAcquirePermit() require.Equal(t, testCase.expectedStatus, status) if testCase.expectedCircuitBreakerError { - checkCircuitBreakerOpenErr(ctx, err, t) + require.ErrorAs(t, err, &circuitBreakerOpenError{}) assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(testCase.expectedMetrics), metricNames...)) } else { require.NoError(t, err) @@ -446,7 +444,7 @@ func TestIngester_PushToStorage_CircuitBreaker(t *testing.T) { require.ErrorAs(t, err, &testCase.expectedErrorWhenCircuitBreakerClosed) } } else { - checkCircuitBreakerOpenErr(ctx, err, t) + require.ErrorAs(t, err, &circuitBreakerOpenError{}) } } } @@ -497,17 +495,6 @@ func TestIngester_PushToStorage_CircuitBreaker(t *testing.T) { } } -func checkCircuitBreakerOpenErr(ctx context.Context, err error, t *testing.T) { - var cbOpenErr circuitBreakerOpenError - require.ErrorAs(t, err, &cbOpenErr) - - var optional middleware.OptionalLogging - require.ErrorAs(t, err, &optional) - - shouldLog, _ := optional.ShouldLog(ctx) - require.False(t, shouldLog, "expected not to log via .ShouldLog()") -} - func TestIngester_StartPushRequest_CircuitBreakerOpen(t *testing.T) { reg := prometheus.NewPedanticRegistry() cfg := defaultIngesterTestConfig(t) @@ -668,7 +655,7 @@ func TestIngester_FinishPushRequest(t *testing.T) { st := &pushRequestState{ requestDuration: testCase.pushRequestDuration, acquiredCircuitBreakerPermit: testCase.acquiredCircuitBreakerPermit, - err: testCase.err, + pushErr: testCase.err, } ctx = context.WithValue(ctx, pushReqCtxKey, st) @@ -776,7 +763,7 @@ func TestIngester_Push_CircuitBreaker_DeadlineExceeded(t *testing.T) { } else { require.Equal(t, circuitbreaker.OpenState, i.circuitBreaker.cb.State()) require.Nil(t, ctx) - checkCircuitBreakerOpenErr(ctx, err, t) + require.ErrorAs(t, err, &circuitBreakerOpenError{}) } } } diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index e3a0aa8f07a..b9d10159218 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -966,7 +966,7 @@ type pushRequestState struct { requestSize int64 requestDuration time.Duration acquiredCircuitBreakerPermit bool - err error + pushErr error } // StartPushRequest checks if ingester can start push request, and increments relevant counters. @@ -981,7 +981,7 @@ func (i *Ingester) FinishPushRequest(ctx context.Context) { if !ok { return } - i.finishPushRequest(ctx, st.requestSize, st.requestDuration, st.acquiredCircuitBreakerPermit, st.err) + i.finishPushRequest(ctx, st.requestSize, st.requestDuration, st.acquiredCircuitBreakerPermit, st.pushErr) } // This method can be called in two ways: 1. Ingester.PushWithCleanup, or 2. Ingester.StartPushRequest via gRPC server's method limiter. @@ -1008,7 +1008,7 @@ func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ conte // breaker. This is done in finishPushRequest(). acquiredCircuitBreakerPermit, err = i.circuitBreaker.tryAcquirePermit() if err != nil { - return nil, false, acquiredCircuitBreakerPermit, err + return nil, false, false, err } st := &pushRequestState{ @@ -1027,42 +1027,42 @@ func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ conte rejectEqualInflightBytes = true // if inflightBytes == limit, reject new request } - finishRequestInDefer := true + err = i.checkInstanceLimits(inflight, inflightBytes, rejectEqualInflightBytes) + if err == nil { + return ctx, true, acquiredCircuitBreakerPermit, nil + } - defer func() { - if finishRequestInDefer { - // If startPushRequest() require finishPushRequest() to be executed in defer, - // it is because one the per-instance limits has been hit. In this case, the - // corresponding error has to be passed to finishPushRequest() in order to - // record the failure with the circuit breaker. - i.finishPushRequest(ctx, reqSize, 0, acquiredCircuitBreakerPermit, err) - } - }() + // In this case a per-instance limits has been hit, and the corresponding error has to be + // passed to finishPushRequest(), which finishes the push request, records the error with + // the circuit breaker, and gives it a possibly acquired permit back. + i.finishPushRequest(ctx, reqSize, 0, acquiredCircuitBreakerPermit, err) + return nil, false, false, err +} +func (i *Ingester) checkInstanceLimits(inflight int64, inflightBytes int64, rejectEqualInflightBytes bool) error { il := i.getInstanceLimits() - if il != nil { - if il.MaxInflightPushRequests > 0 && inflight > il.MaxInflightPushRequests { - i.metrics.rejected.WithLabelValues(reasonIngesterMaxInflightPushRequests).Inc() - return nil, false, acquiredCircuitBreakerPermit, errMaxInflightRequestsReached - } + if il == nil { + return nil + } + if il.MaxInflightPushRequests > 0 && inflight > il.MaxInflightPushRequests { + i.metrics.rejected.WithLabelValues(reasonIngesterMaxInflightPushRequests).Inc() + return errMaxInflightRequestsReached + } - if il.MaxInflightPushRequestsBytes > 0 { - if (rejectEqualInflightBytes && inflightBytes >= il.MaxInflightPushRequestsBytes) || inflightBytes > il.MaxInflightPushRequestsBytes { - i.metrics.rejected.WithLabelValues(reasonIngesterMaxInflightPushRequestsBytes).Inc() - return nil, false, acquiredCircuitBreakerPermit, errMaxInflightRequestsBytesReached - } + if il.MaxInflightPushRequestsBytes > 0 { + if (rejectEqualInflightBytes && inflightBytes >= il.MaxInflightPushRequestsBytes) || inflightBytes > il.MaxInflightPushRequestsBytes { + i.metrics.rejected.WithLabelValues(reasonIngesterMaxInflightPushRequestsBytes).Inc() + return errMaxInflightRequestsBytesReached } + } - if il.MaxIngestionRate > 0 { - if rate := i.ingestionRate.Rate(); rate >= il.MaxIngestionRate { - i.metrics.rejected.WithLabelValues(reasonIngesterMaxIngestionRate).Inc() - return nil, false, acquiredCircuitBreakerPermit, errMaxIngestionRateReached - } + if il.MaxIngestionRate > 0 { + if rate := i.ingestionRate.Rate(); rate >= il.MaxIngestionRate { + i.metrics.rejected.WithLabelValues(reasonIngesterMaxIngestionRate).Inc() + return errMaxIngestionRateReached } } - - finishRequestInDefer = false - return ctx, true, acquiredCircuitBreakerPermit, nil + return nil } func (i *Ingester) finishPushRequest(ctx context.Context, reqSize int64, duration time.Duration, acquiredCircuitBreakerPermit bool, err error) { @@ -1081,19 +1081,28 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // retain anything from `req` past the exit from this function. defer cleanUp() - start := time.Now() + var ( + start = time.Now() + acquiredCircuitBreakerPermit bool + ) // Only start/finish request here when the request comes NOT from grpc handlers (i.e., from ingest.Store). // NOTE: request coming from grpc handler may end up calling start multiple times during its lifetime (e.g., when migrating to ingest storage). // startPushRequest handles this. if i.cfg.IngestStorageConfig.Enabled || !i.cfg.LimitInflightRequestsUsingGrpcMethodLimiter { reqSize := int64(req.Size()) - _, shouldFinish, acquiredCircuitBreakerPermit, startPushErr := i.startPushRequest(ctx, reqSize) + var ( + shouldFinish bool + startPushErr error + ) + // We need to replace the original context with the context returned by startPushRequest, + // because the latter might store a new pushRequestState in the context. + ctx, shouldFinish, acquiredCircuitBreakerPermit, startPushErr = i.startPushRequest(ctx, reqSize) if startPushErr != nil { return middleware.DoNotLogError{Err: startPushErr} } if shouldFinish { defer func() { - i.finishPushRequest(ctx, reqSize, time.Since(start), acquiredCircuitBreakerPermit, err) + i.FinishPushRequest(ctx) }() } } @@ -1104,7 +1113,14 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // successive call to FinishPushRequest(). if st, ok := ctx.Value(pushReqCtxKey).(*pushRequestState); ok { st.requestDuration = time.Since(start) - st.err = err + st.pushErr = err + // We change the value of st.acquiredCircuitBreakerPermit only if + // the permit was acquired during PushWithCleanup. Otherwise, we + // keep its previous value, because it might have already been + // set to true (for example during a call to StartPushRequest). + if acquiredCircuitBreakerPermit { + st.acquiredCircuitBreakerPermit = true + } } }() From bd686a5c670dc369e54c1ccf9cfa6c9c52971498 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Sat, 1 Jun 2024 22:43:07 +0200 Subject: [PATCH 10/16] Get rid of finishPushRequest Signed-off-by: Yuri Nikolic --- pkg/ingester/circuitbreaker.go | 3 +- pkg/ingester/ingester.go | 88 +++++++++++++++++----------------- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index 3ff6c992498..820e0e8f9e0 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -124,8 +124,7 @@ func newCircuitBreaker(cfg CircuitBreakerConfig, logger log.Logger, registerer p OnHalfOpen(func(event circuitbreaker.StateChangedEvent) { circuitBreakerTransitionsCounterFn(cb.metrics, circuitbreaker.HalfOpenState).Inc() level.Info(logger).Log("msg", "circuit breaker is half-open", "previous", event.OldState, "current", event.NewState) - }). - HandleIf(func(_ any, err error) bool { return isFailure(err) }) + }) if cfg.testModeEnabled { // In case of testing purposes, we initialize the circuit breaker with count based failure thresholding, diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index b9d10159218..5d3e293b87b 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -969,19 +969,32 @@ type pushRequestState struct { pushErr error } +func getPushRequestState(ctx context.Context) *pushRequestState { + if st, ok := ctx.Value(pushReqCtxKey).(*pushRequestState); ok { + return st + } + return nil +} + // StartPushRequest checks if ingester can start push request, and increments relevant counters. // If new push request cannot be started, errors convertible to gRPC status code are returned, and metrics are updated. func (i *Ingester) StartPushRequest(ctx context.Context, reqSize int64) (context.Context, error) { - ctx, _, _, err := i.startPushRequest(ctx, reqSize) + ctx, _, err := i.startPushRequest(ctx, reqSize) return ctx, err } func (i *Ingester) FinishPushRequest(ctx context.Context) { - st, ok := ctx.Value(pushReqCtxKey).(*pushRequestState) - if !ok { + st := getPushRequestState(ctx) + if st == nil { return } - i.finishPushRequest(ctx, st.requestSize, st.requestDuration, st.acquiredCircuitBreakerPermit, st.pushErr) + i.inflightPushRequests.Dec() + if st.requestSize > 0 { + i.inflightPushRequestsBytes.Sub(st.requestSize) + } + if st.acquiredCircuitBreakerPermit { + _ = i.circuitBreaker.finishPushRequest(ctx, st.requestDuration, st.pushErr) + } } // This method can be called in two ways: 1. Ingester.PushWithCleanup, or 2. Ingester.StartPushRequest via gRPC server's method limiter. @@ -989,26 +1002,26 @@ func (i *Ingester) FinishPushRequest(ctx context.Context) { // In the first case, returned errors can be inspected/logged by middleware. Ingester.PushWithCleanup will wrap the error in util_log.DoNotLogError wrapper. // In the second case, returned errors will not be logged, because request will not reach any middleware. // -// The shouldFinish flag tells if the caller must call finish on this request. If not, there is already someone in the call stack who will do that. -// The acquiredCircuitBreakerPermit flag tells to the caller if this call has acquired a circuit breaker permit, -// and if the caller should take care of returning the permit back. -func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ context.Context, shouldFinish, acquiredCircuitBreakerPermit bool, err error) { +// If startPushRequest ends with no error, the resulting context includes a *pushRequestState object +// containing relevant information about the push request started by this method. +// The resulting boolean flag tells if the caller must call finish on this request. If not, there is already someone in the call stack who will do that. +func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (context.Context, bool, error) { if err := i.checkAvailableForPush(); err != nil { - return nil, false, false, err + return nil, false, err } - if _, ok := ctx.Value(pushReqCtxKey).(*pushRequestState); ok { + if st := getPushRequestState(ctx); st != nil { // If state is already in context, this means we already passed through StartPushRequest for this request. - return ctx, false, false, nil + return ctx, false, nil } // We try to acquire a permit from the circuit breaker. // If it is not possible, it is because the circuit breaker is open, and a circuitBreakerOpenError is returned. // If it is possible, a permit has to be released by recording either a success or a failure with the circuit - // breaker. This is done in finishPushRequest(). - acquiredCircuitBreakerPermit, err = i.circuitBreaker.tryAcquirePermit() + // breaker. This is done by FinishPushRequest(). + acquiredCircuitBreakerPermit, err := i.circuitBreaker.tryAcquirePermit() if err != nil { - return nil, false, false, err + return nil, false, err } st := &pushRequestState{ @@ -1026,17 +1039,22 @@ func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (_ conte inflightBytes = i.inflightPushRequestsBytes.Load() rejectEqualInflightBytes = true // if inflightBytes == limit, reject new request } + var startPushRequestErr error + defer func() { + if startPushRequestErr != nil { + // In this case a per-instance limits has been hit, and the corresponding error has to be + // passed to FinishPushRequest(), which finishes the push request, records the error with + // the circuit breaker, and gives it a possibly acquired permit back. + i.FinishPushRequest(ctx) + } + }() - err = i.checkInstanceLimits(inflight, inflightBytes, rejectEqualInflightBytes) - if err == nil { - return ctx, true, acquiredCircuitBreakerPermit, nil + startPushRequestErr = i.checkInstanceLimits(inflight, inflightBytes, rejectEqualInflightBytes) + if startPushRequestErr == nil { + return ctx, true, nil } - // In this case a per-instance limits has been hit, and the corresponding error has to be - // passed to finishPushRequest(), which finishes the push request, records the error with - // the circuit breaker, and gives it a possibly acquired permit back. - i.finishPushRequest(ctx, reqSize, 0, acquiredCircuitBreakerPermit, err) - return nil, false, false, err + return nil, false, startPushRequestErr } func (i *Ingester) checkInstanceLimits(inflight int64, inflightBytes int64, rejectEqualInflightBytes bool) error { @@ -1065,26 +1083,13 @@ func (i *Ingester) checkInstanceLimits(inflight int64, inflightBytes int64, reje return nil } -func (i *Ingester) finishPushRequest(ctx context.Context, reqSize int64, duration time.Duration, acquiredCircuitBreakerPermit bool, err error) { - i.inflightPushRequests.Dec() - if reqSize > 0 { - i.inflightPushRequestsBytes.Sub(reqSize) - } - if acquiredCircuitBreakerPermit { - _ = i.circuitBreaker.finishPushRequest(ctx, duration, err) - } -} - // PushWithCleanup is the Push() implementation for blocks storage and takes a WriteRequest and adds it to the TSDB head. func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteRequest, cleanUp func()) (err error) { // NOTE: because we use `unsafe` in deserialisation, we must not // retain anything from `req` past the exit from this function. defer cleanUp() - var ( - start = time.Now() - acquiredCircuitBreakerPermit bool - ) + start := time.Now() // Only start/finish request here when the request comes NOT from grpc handlers (i.e., from ingest.Store). // NOTE: request coming from grpc handler may end up calling start multiple times during its lifetime (e.g., when migrating to ingest storage). // startPushRequest handles this. @@ -1096,7 +1101,7 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques ) // We need to replace the original context with the context returned by startPushRequest, // because the latter might store a new pushRequestState in the context. - ctx, shouldFinish, acquiredCircuitBreakerPermit, startPushErr = i.startPushRequest(ctx, reqSize) + ctx, shouldFinish, startPushErr = i.startPushRequest(ctx, reqSize) if startPushErr != nil { return middleware.DoNotLogError{Err: startPushErr} } @@ -1111,16 +1116,9 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // We enrich the pushRequestState contained in the context with this PushWithCleanUp() // call duration, and a possible error it returns. These data are needed during a // successive call to FinishPushRequest(). - if st, ok := ctx.Value(pushReqCtxKey).(*pushRequestState); ok { + if st := getPushRequestState(ctx); st != nil { st.requestDuration = time.Since(start) st.pushErr = err - // We change the value of st.acquiredCircuitBreakerPermit only if - // the permit was acquired during PushWithCleanup. Otherwise, we - // keep its previous value, because it might have already been - // set to true (for example during a call to StartPushRequest). - if acquiredCircuitBreakerPermit { - st.acquiredCircuitBreakerPermit = true - } } }() From 48e8d6280327c7692a8826915a9b60d1637a3f5c Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Sun, 2 Jun 2024 10:05:24 +0200 Subject: [PATCH 11/16] Add unit test for startPushRequest Signed-off-by: Yuri Nikolic --- pkg/ingester/circuitbreaker.go | 22 ++--- pkg/ingester/ingester.go | 31 ++++--- pkg/ingester/ingester_test.go | 144 +++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+), 26 deletions(-) diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index 820e0e8f9e0..00b47093fb6 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -173,13 +173,11 @@ func (cb *circuitBreaker) isActive() bool { return cb.startTime.Before(time.Now()) } -// TODO: fix the comment // tryAcquirePermit tries to acquire a permit to use the circuit breaker and returns whether a permit was acquired. -// If the circuit breaker is not yet active, a status false and no error are returned. -// If it was not possible to acquire a permit, this means that the circuit breaker is open. In this case, a status -// false and an circuitBreakerOpenError are returned. -// If it was possible to acquire a permit, a status true and no error are returned. In this case, the permission -// will be automatically released once when a result is recorded by calling resultRecorded. +// If it was possible to acquire a permit, a status true and no error are returned. The acquired permit must be +// returned by a call to finishPushRequest. +// If it was not possible to acquire a permit, a status false is returned. In this case no call to finishPushRequest +// is needed. If the permit was not acquired because of an error, that causing error is returned as well. func (cb *circuitBreaker) tryAcquirePermit() (bool, error) { if !cb.isActive() { return false, nil @@ -204,14 +202,10 @@ func (cb *circuitBreaker) recordResult(err error) { } } -// TODO: fix the comment -// finishPushRequest records the result of a push request with this circuit breaker. -// If the circuit breaker is not active, finishPushRequest does nothing. -// If the passed duration of a push request exceeds the configured maximal push request duration, -// a context.DeadlineExceeded error is recorded and returned, independently of the passed pushErr, -// the actual error that occurred during the push request. -// Otherwise, the given pushErr is recorded and returned. -// The returned error is needed only for testing purposes. +// finishPushRequest should be called to complete the push request executed upon a +// successfully acquired circuit breaker permit. +// It records the result of the push request with the circuit breaker. Push requests +// that lasted longer than the configured timeout are treated as a failure. func (cb *circuitBreaker) finishPushRequest(ctx context.Context, duration time.Duration, pushErr error) error { if !cb.isActive() { return nil diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 5d3e293b87b..cf8b29333d7 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -979,7 +979,7 @@ func getPushRequestState(ctx context.Context) *pushRequestState { // StartPushRequest checks if ingester can start push request, and increments relevant counters. // If new push request cannot be started, errors convertible to gRPC status code are returned, and metrics are updated. func (i *Ingester) StartPushRequest(ctx context.Context, reqSize int64) (context.Context, error) { - ctx, _, err := i.startPushRequest(ctx, reqSize) + ctx, _, err := i.startPushRequest(ctx, reqSize, nil) return ctx, err } @@ -1002,10 +1002,13 @@ func (i *Ingester) FinishPushRequest(ctx context.Context) { // In the first case, returned errors can be inspected/logged by middleware. Ingester.PushWithCleanup will wrap the error in util_log.DoNotLogError wrapper. // In the second case, returned errors will not be logged, because request will not reach any middleware. // +// Parameter finishPushRequestFn is used for testing purposes, and represents the function that finishes the push request that startPushRequest initialize, +// in case of an error during the execution of the latter. It defaults to FinishPushRequest. +// // If startPushRequest ends with no error, the resulting context includes a *pushRequestState object // containing relevant information about the push request started by this method. // The resulting boolean flag tells if the caller must call finish on this request. If not, there is already someone in the call stack who will do that. -func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (context.Context, bool, error) { +func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64, finishPushRequestFn func(ctx context.Context)) (context.Context, bool, error) { if err := i.checkAvailableForPush(); err != nil { return nil, false, err } @@ -1039,22 +1042,26 @@ func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (context inflightBytes = i.inflightPushRequestsBytes.Load() rejectEqualInflightBytes = true // if inflightBytes == limit, reject new request } - var startPushRequestErr error + var instanceLimitsErr error defer func() { - if startPushRequestErr != nil { - // In this case a per-instance limits has been hit, and the corresponding error has to be - // passed to FinishPushRequest(), which finishes the push request, records the error with - // the circuit breaker, and gives it a possibly acquired permit back. - i.FinishPushRequest(ctx) + if instanceLimitsErr != nil { + // In this case a per-instance limit has been hit, and the corresponding error has to be passed + // to the function which finishes the push request, records the error with the circuit breaker, + // and gives it a possibly acquired permit back. + if finishPushRequestFn != nil { + finishPushRequestFn(ctx) + } else { + i.FinishPushRequest(ctx) + } } }() - startPushRequestErr = i.checkInstanceLimits(inflight, inflightBytes, rejectEqualInflightBytes) - if startPushRequestErr == nil { + instanceLimitsErr = i.checkInstanceLimits(inflight, inflightBytes, rejectEqualInflightBytes) + if instanceLimitsErr == nil { return ctx, true, nil } - return nil, false, startPushRequestErr + return nil, false, instanceLimitsErr } func (i *Ingester) checkInstanceLimits(inflight int64, inflightBytes int64, rejectEqualInflightBytes bool) error { @@ -1101,7 +1108,7 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques ) // We need to replace the original context with the context returned by startPushRequest, // because the latter might store a new pushRequestState in the context. - ctx, shouldFinish, startPushErr = i.startPushRequest(ctx, reqSize) + ctx, shouldFinish, startPushErr = i.startPushRequest(ctx, reqSize, nil) if startPushErr != nil { return middleware.DoNotLogError{Err: startPushErr} } diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 1d078067300..3ab22153bb5 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -80,6 +80,150 @@ func mustNewActiveSeriesCustomTrackersConfigFromMap(t *testing.T, source map[str return m } +func TestIngester_StartPushRequest(t *testing.T) { + instanceLimits := &InstanceLimits{} + pushReqState := &pushRequestState{ + requestSize: 10, + acquiredCircuitBreakerPermit: true, + } + ctx := context.Background() + ctxWithPushReqState := context.WithValue(ctx, pushReqCtxKey, pushReqState) + + type testCase struct { + ctx context.Context + failingIngester bool + cbOpen bool + cbInitialDelay time.Duration + instanceLimitReached bool + expectedStatus bool + expectedFinishPushRequestCalls int + verifyCtxFn func(context.Context, context.Context) + verifyErr func(error) + } + + setupIngester := func(tc testCase) *failingIngester { + cfg := defaultIngesterTestConfig(t) + cfg.CircuitBreakerConfig = CircuitBreakerConfig{ + Enabled: true, + InitialDelay: tc.cbInitialDelay, + CooldownPeriod: 10 * time.Second, + } + cfg.InstanceLimitsFn = func() *InstanceLimits { + return instanceLimits + } + var ( + failingCause error + expectedState services.State + ) + if tc.failingIngester { + expectedState = services.Terminated + failingCause = newUnavailableError(expectedState) + } else { + expectedState = services.Running + } + failingIng := setupFailingIngester(t, cfg, failingCause) + failingIng.startWaitAndCheck(ctx, t) + require.Equal(t, expectedState, failingIng.lifecycler.State()) + + if tc.cbOpen { + failingIng.circuitBreaker.cb.Open() + } else { + failingIng.circuitBreaker.cb.Close() + } + + if tc.instanceLimitReached { + instanceLimits.MaxInflightPushRequestsBytes = 1 + } else { + instanceLimits.MaxInflightPushRequestsBytes = 0 + } + + return failingIng + } + + testCases := map[string]testCase{ + "fail if ingester is not available for push": { + failingIngester: true, + ctx: ctx, + expectedStatus: false, + expectedFinishPushRequestCalls: 0, + verifyErr: func(err error) { + require.ErrorAs(t, err, &unavailableError{}) + }, + }, + "fail if circuit breaker is open": { + ctx: ctx, + cbOpen: true, + expectedStatus: false, + expectedFinishPushRequestCalls: 0, + verifyErr: func(err error) { + require.ErrorAs(t, err, &circuitBreakerOpenError{}) + }, + }, + "fail if instance limit is reached": { + ctx: ctx, + instanceLimitReached: true, + expectedStatus: false, + expectedFinishPushRequestCalls: 1, + verifyErr: func(err error) { + require.ErrorAs(t, err, &instanceLimitReachedError{}) + }, + }, + "do not fail if circuit breaker is not active": { + ctx: ctx, + cbInitialDelay: 1 * time.Minute, + expectedStatus: true, + expectedFinishPushRequestCalls: 0, + verifyCtxFn: func(inCtx, outCtx context.Context) { + require.NotEqual(t, inCtx, outCtx) + }, + }, + "do not fail and return the same context if it already contains a pushRequestState": { + ctx: ctxWithPushReqState, + expectedStatus: false, + expectedFinishPushRequestCalls: 0, + verifyCtxFn: func(inCtx, outCtx context.Context) { + require.Equal(t, inCtx, outCtx) + }, + }, + "do not fail and add pushRequestState to the context if everything is ok": { + ctx: ctx, + expectedStatus: true, + expectedFinishPushRequestCalls: 0, + verifyCtxFn: func(inCtx, outCtx context.Context) { + require.NotEqual(t, inCtx, outCtx) + }, + }, + } + for testName, tc := range testCases { + t.Run(testName, func(t *testing.T) { + failingIng := setupIngester(tc) + defer services.StopAndAwaitTerminated(context.Background(), failingIng) //nolint:errcheck + + finishPushRequestCount := 0 + finishPushRequestFn := func(ctx context.Context) { + failingIng.FinishPushRequest(ctx) + finishPushRequestCount++ + } + + ctx, shouldFinish, err := failingIng.startPushRequest(tc.ctx, 10, finishPushRequestFn) + require.Equal(t, tc.expectedStatus, shouldFinish) + + if err == nil { + pushReqState := getPushRequestState(ctx) + require.NotNil(t, pushReqState) + + if tc.verifyCtxFn != nil { + tc.verifyCtxFn(tc.ctx, ctx) + } + } else { + require.Nil(t, ctx) + require.NotNil(t, tc.verifyErr) + tc.verifyErr(err) + } + }) + } +} + func TestIngester_Push(t *testing.T) { metricLabelAdapters := []mimirpb.LabelAdapter{{Name: labels.MetricName, Value: "test"}} metricLabelSet := mimirpb.FromLabelAdaptersToMetric(metricLabelAdapters) From d374c8304cfc3fde022279a0eabb41eca7b92a97 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Sun, 2 Jun 2024 11:43:33 +0200 Subject: [PATCH 12/16] Activate circuit breaker on a successful completion of ingester.starting Signed-off-by: Yuri Nikolic --- pkg/ingester/circuitbreaker.go | 30 +++++++++++++++++++---------- pkg/ingester/circuitbreaker_test.go | 13 ++++++++----- pkg/ingester/ingester.go | 13 +++++++++++-- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index 00b47093fb6..777c594b8f2 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -14,6 +14,7 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" + "go.uber.org/atomic" "google.golang.org/grpc/codes" "github.com/grafana/mimir/pkg/mimirpb" @@ -92,18 +93,19 @@ func (cfg *CircuitBreakerConfig) RegisterFlags(f *flag.FlagSet) { } type circuitBreaker struct { - cfg CircuitBreakerConfig - logger log.Logger - metrics *circuitBreakerMetrics - startTime time.Time - cb circuitbreaker.CircuitBreaker[any] + cfg CircuitBreakerConfig + logger log.Logger + metrics *circuitBreakerMetrics + active atomic.Bool + cb circuitbreaker.CircuitBreaker[any] } -func newCircuitBreaker(cfg CircuitBreakerConfig, logger log.Logger, registerer prometheus.Registerer) *circuitBreaker { +func newCircuitBreaker(cfg CircuitBreakerConfig, isActive bool, logger log.Logger, registerer prometheus.Registerer) *circuitBreaker { + active := atomic.NewBool(isActive) cb := circuitBreaker{ - cfg: cfg, - logger: logger, - startTime: time.Now().Add(cfg.InitialDelay), + cfg: cfg, + logger: logger, + active: *active, } circuitBreakerTransitionsCounterFn := func(metrics *circuitBreakerMetrics, state circuitbreaker.State) prometheus.Counter { @@ -170,7 +172,14 @@ func (cb *circuitBreaker) isActive() bool { if cb == nil { return false } - return cb.startTime.Before(time.Now()) + return cb.active.Load() +} + +func (cb *circuitBreaker) setActive() { + if cb == nil { + return + } + cb.active.Store(true) } // tryAcquirePermit tries to acquire a permit to use the circuit breaker and returns whether a permit was acquired. @@ -206,6 +215,7 @@ func (cb *circuitBreaker) recordResult(err error) { // successfully acquired circuit breaker permit. // It records the result of the push request with the circuit breaker. Push requests // that lasted longer than the configured timeout are treated as a failure. +// The returned error is only used for testing purposes. func (cb *circuitBreaker) finishPushRequest(ctx context.Context, duration time.Duration, pushErr error) error { if !cb.isActive() { return nil diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go index 48721f2ae9c..d3d2712d28a 100644 --- a/pkg/ingester/circuitbreaker_test.go +++ b/pkg/ingester/circuitbreaker_test.go @@ -77,12 +77,15 @@ func TestCircuitBreaker_IsActive(t *testing.T) { registry := prometheus.NewRegistry() cfg := CircuitBreakerConfig{Enabled: true, InitialDelay: 10 * time.Millisecond} - cb = newCircuitBreaker(cfg, log.NewNopLogger(), registry) + cb = newCircuitBreaker(cfg, false, log.NewNopLogger(), registry) + time.AfterFunc(cfg.InitialDelay, func() { + cb.setActive() + }) // When InitialDelay is set, circuit breaker is not immediately active. require.False(t, cb.isActive()) // After InitialDelay passed, circuit breaker becomes active. - time.Sleep(10 * time.Millisecond) + time.Sleep(2 * cfg.InitialDelay) require.True(t, cb.isActive()) } @@ -159,7 +162,7 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { t.Run(testName, func(t *testing.T) { registry := prometheus.NewRegistry() cfg := CircuitBreakerConfig{Enabled: true, CooldownPeriod: 10 * time.Second, InitialDelay: testCase.initialDelay} - cb := newCircuitBreaker(cfg, log.NewNopLogger(), registry) + cb := newCircuitBreaker(cfg, cfg.InitialDelay == 0, log.NewNopLogger(), registry) testCase.circuitBreakerSetup(cb) status, err := cb.tryAcquirePermit() require.Equal(t, testCase.expectedStatus, status) @@ -216,7 +219,7 @@ func TestCircuitBreaker_RecordResult(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { registry := prometheus.NewRegistry() - cb := newCircuitBreaker(cfg, log.NewNopLogger(), registry) + cb := newCircuitBreaker(cfg, true, log.NewNopLogger(), registry) cb.recordResult(testCase.err) assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(testCase.expectedMetrics), metricNames...)) }) @@ -317,7 +320,7 @@ func TestCircuitBreaker_FinishPushRequest(t *testing.T) { InitialDelay: testCase.initialDelay, PushTimeout: 2 * time.Second, } - cb := newCircuitBreaker(cfg, log.NewNopLogger(), registry) + cb := newCircuitBreaker(cfg, cfg.InitialDelay == 0, log.NewNopLogger(), registry) err := cb.finishPushRequest(ctx, testCase.pushRequestDuration, testCase.err) if testCase.expectedErr == nil { require.NoError(t, err) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index cf8b29333d7..cbb5e57cd8a 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -399,7 +399,8 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.activeGroups = activeGroupsCleanupService if cfg.CircuitBreakerConfig.Enabled { - i.circuitBreaker = newCircuitBreaker(cfg.CircuitBreakerConfig, logger, registerer) + // We create an inactive circuit breaker, which will be activated on a successful completion of starting. + i.circuitBreaker = newCircuitBreaker(cfg.CircuitBreakerConfig, false, logger, registerer) } if registerer != nil { @@ -543,7 +544,15 @@ func (i *Ingester) startingForFlusher(ctx context.Context) error { func (i *Ingester) starting(ctx context.Context) (err error) { defer func() { - if err != nil { + if err == nil { + if i.cfg.CircuitBreakerConfig.InitialDelay == 0 { + i.circuitBreaker.setActive() + } else { + time.AfterFunc(i.cfg.CircuitBreakerConfig.InitialDelay, func() { + i.circuitBreaker.setActive() + }) + } + } else { // if starting() fails for any reason (e.g., context canceled), // the lifecycler must be stopped. _ = services.StopAndAwaitTerminated(context.Background(), i.lifecycler) From 335d21ab4355d1f22017c3a40c952c88d4559de0 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Sun, 2 Jun 2024 12:02:11 +0200 Subject: [PATCH 13/16] Rename the output error of ingester.PushWithCleanup() Signed-off-by: Yuri Nikolic --- pkg/ingester/ingester.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index cbb5e57cd8a..338d37b9522 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -1100,7 +1100,7 @@ func (i *Ingester) checkInstanceLimits(inflight int64, inflightBytes int64, reje } // PushWithCleanup is the Push() implementation for blocks storage and takes a WriteRequest and adds it to the TSDB head. -func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteRequest, cleanUp func()) (err error) { +func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteRequest, cleanUp func()) (returnErr error) { // NOTE: because we use `unsafe` in deserialisation, we must not // retain anything from `req` past the exit from this function. defer cleanUp() @@ -1134,7 +1134,7 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques // successive call to FinishPushRequest(). if st := getPushRequestState(ctx); st != nil { st.requestDuration = time.Since(start) - st.pushErr = err + st.pushErr = returnErr } }() @@ -1204,13 +1204,12 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques minAppendTime, minAppendTimeAvailable := db.Head().AppendableMinValidTime() - err = i.pushSamplesToAppender(userID, req.Timeseries, app, startAppend, &stats, updateFirstPartial, activeSeries, i.limits.OutOfOrderTimeWindow(userID), minAppendTimeAvailable, minAppendTime) - if err != nil { + if pushSamplesToAppenderErr := i.pushSamplesToAppender(userID, req.Timeseries, app, startAppend, &stats, updateFirstPartial, activeSeries, i.limits.OutOfOrderTimeWindow(userID), minAppendTimeAvailable, minAppendTime); pushSamplesToAppenderErr != nil { if err := app.Rollback(); err != nil { level.Warn(i.logger).Log("msg", "failed to rollback appender on error", "user", userID, "err", err) } - return wrapOrAnnotateWithUser(err, userID) + return wrapOrAnnotateWithUser(pushSamplesToAppenderErr, userID) } // At this point all samples have been added to the appender, so we can track the time it took. From e3b0849f708a38e1d70cc4eab70c7e4879bdc38d Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Mon, 3 Jun 2024 16:55:38 +0200 Subject: [PATCH 14/16] Fixing review findings Signed-off-by: Yuri Nikolic --- CHANGELOG.md | 2 +- cmd/mimir/config-descriptor.json | 2 +- cmd/mimir/help-all.txt.tmpl | 2 +- .../mimir/configure/about-versioning.md | 2 +- .../configuration-parameters/index.md | 6 +- pkg/ingester/circuitbreaker.go | 6 +- pkg/ingester/circuitbreaker_test.go | 61 +++++++------- pkg/ingester/ingester.go | 51 +++++------- pkg/ingester/ingester_test.go | 80 ++++++++++--------- 9 files changed, 104 insertions(+), 108 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c27ab7f3aa..102c3031b65 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,7 @@ * [FEATURE] mimirtool: Add `runtime-config verify` sub-command, for verifying Mimir runtime config files. #8123 * [FEATURE] Query-frontend, querier: new experimental `/cardinality/active_native_histogram_metrics` API to get active native histogram metric names with statistics about active native histogram buckets. #7982 #7986 #8008 * [FEATURE] Alertmanager: Added `-alertmanager.max-silences-count` and `-alertmanager.max-silence-size-bytes` to set limits on per tenant silences. Disabled by default. #6898 -* [FEATURE] Ingester: add experimental support for the server-side circuit breakers when writing to ingesters via `-ingester.circuit-breaker.enabled`, `-ingester.circuit-breaker.failure-threshold`, or `-ingester.circuit-breaker.cooldown-period` or their corresponding YAML. Added metrics `cortex_ingester_circuit_breaker_results_total` and `cortex_ingester_circuit_breaker_transitions_total`. #8180 +* [FEATURE] Ingester: add experimental support for the server-side circuit breakers when writing to ingesters. This can be enabled using `-ingester.circuit-breaker.enabled` option. Further `-ingester.circuit-breaker.*` options for configuring circuit-breaker are available. Added metrics `cortex_ingester_circuit_breaker_results_total`, `cortex_ingester_circuit_breaker_transitions_total` and `cortex_ingester_circuit_breaker_current_state`. #8180 * [ENHANCEMENT] Reduced memory allocations in functions used to propagate contextual information between gRPC calls. #7529 * [ENHANCEMENT] Distributor: add experimental limit for exemplars per series per request, enabled with `-distributor.max-exemplars-per-series-per-request`, the number of discarded exemplars are tracked with `cortex_discarded_exemplars_total{reason="too_many_exemplars_per_series_per_request"}` #7989 #8010 * [ENHANCEMENT] Store-gateway: merge series from different blocks concurrently. #7456 diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index d35075dcd8d..cb6785a4ae4 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -3206,7 +3206,7 @@ "kind": "field", "name": "initial_delay", "required": false, - "desc": "How long the circuit breaker should wait between creation and starting up. During that time both failures and successes will not be counted.", + "desc": "How long the circuit breaker should wait to start up after the corresponding ingester started. During that time both failures and successes will not be counted.", "fieldValue": null, "fieldDefaultValue": 0, "fieldFlag": "ingester.circuit-breaker.initial-delay", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index b66023e203a..5df86e53bfc 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1316,7 +1316,7 @@ Usage of ./cmd/mimir/mimir: -ingester.circuit-breaker.failure-threshold-percentage uint [experimental] Max percentage of requests that can fail over period before the circuit breaker opens (default 10) -ingester.circuit-breaker.initial-delay duration - [experimental] How long the circuit breaker should wait between creation and starting up. During that time both failures and successes will not be counted. + [experimental] How long the circuit breaker should wait to start up after the corresponding ingester started. During that time both failures and successes will not be counted. -ingester.circuit-breaker.push-timeout duration How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors (default 2s) -ingester.circuit-breaker.thresholding-period duration diff --git a/docs/sources/mimir/configure/about-versioning.md b/docs/sources/mimir/configure/about-versioning.md index 1e07c1fe988..79ba25c3805 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -119,7 +119,7 @@ The following features are currently experimental: - `-ingester.owned-series-update-interval` - Per-ingester circuit breaking based on requests timing out or hitting per-instance limits - `-ingester.circuit-breaker.enabled` - - `-ingester.circuit-breaker.failure-threshold` + - `-ingester.circuit-breaker.failure-threshold-percentage` - `-ingester.circuit-breaker.failure-execution-threshold` - `-ingester.circuit-breaker.thresholding-period` - `-ingester.circuit-breaker.cooldown-period` diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index e45ebf41126..58eaf184db7 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -1243,9 +1243,9 @@ circuit_breaker: # CLI flag: -ingester.circuit-breaker.cooldown-period [cooldown_period: | default = 10s] - # (experimental) How long the circuit breaker should wait between creation and - # starting up. During that time both failures and successes will not be - # counted. + # (experimental) How long the circuit breaker should wait to start up after + # the corresponding ingester started. During that time both failures and + # successes will not be counted. # CLI flag: -ingester.circuit-breaker.initial-delay [initial_delay: | default = 0s] diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index 777c594b8f2..da9477fc643 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -88,7 +88,7 @@ func (cfg *CircuitBreakerConfig) RegisterFlags(f *flag.FlagSet) { f.UintVar(&cfg.FailureExecutionThreshold, prefix+"failure-execution-threshold", 100, "How many requests must have been executed in period for the circuit breaker to be eligible to open for the rate of failures") f.DurationVar(&cfg.ThresholdingPeriod, prefix+"thresholding-period", time.Minute, "Moving window of time that the percentage of failed requests is computed over") f.DurationVar(&cfg.CooldownPeriod, prefix+"cooldown-period", 10*time.Second, "How long the circuit breaker will stay in the open state before allowing some requests") - f.DurationVar(&cfg.InitialDelay, prefix+"initial-delay", 0, "How long the circuit breaker should wait between creation and starting up. During that time both failures and successes will not be counted.") + f.DurationVar(&cfg.InitialDelay, prefix+"initial-delay", 0, "How long the circuit breaker should wait to start up after the corresponding ingester started. During that time both failures and successes will not be counted.") f.DurationVar(&cfg.PushTimeout, prefix+"push-timeout", defaultPushTimeout, "How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors") } @@ -183,9 +183,9 @@ func (cb *circuitBreaker) setActive() { } // tryAcquirePermit tries to acquire a permit to use the circuit breaker and returns whether a permit was acquired. -// If it was possible to acquire a permit, a status true and no error are returned. The acquired permit must be +// If it was possible to acquire a permit, success flag true and no error are returned. The acquired permit must be // returned by a call to finishPushRequest. -// If it was not possible to acquire a permit, a status false is returned. In this case no call to finishPushRequest +// If it was not possible to acquire a permit, success flag false is returned. In this case no call to finishPushRequest // is needed. If the permit was not acquired because of an error, that causing error is returned as well. func (cb *circuitBreaker) tryAcquirePermit() (bool, error) { if !cb.isActive() { diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go index d3d2712d28a..f84e2008437 100644 --- a/pkg/ingester/circuitbreaker_test.go +++ b/pkg/ingester/circuitbreaker_test.go @@ -85,8 +85,9 @@ func TestCircuitBreaker_IsActive(t *testing.T) { require.False(t, cb.isActive()) // After InitialDelay passed, circuit breaker becomes active. - time.Sleep(2 * cfg.InitialDelay) - require.True(t, cb.isActive()) + require.Eventually(t, func() bool { + return cb.isActive() + }, time.Second, 10*time.Millisecond) } func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { @@ -98,24 +99,24 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { testCases := map[string]struct { initialDelay time.Duration circuitBreakerSetup func(*circuitBreaker) - expectedStatus bool + expectedSuccess bool expectedCircuitBreakerError bool expectedMetrics string }{ "if circuit breaker is not active, status false and no error are returned": { initialDelay: 1 * time.Minute, circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.Close() }, - expectedStatus: false, + expectedSuccess: false, expectedCircuitBreakerError: false, }, "if circuit breaker closed, status true and no error are returned": { circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.Close() }, - expectedStatus: true, + expectedSuccess: true, expectedCircuitBreakerError: false, }, "if circuit breaker open, status false and a circuitBreakerErrorOpen are returned": { circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.Open() }, - expectedStatus: false, + expectedSuccess: false, expectedCircuitBreakerError: true, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. @@ -137,7 +138,7 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { }, "if circuit breaker half-open, status false and a circuitBreakerErrorOpen are returned": { circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.HalfOpen() }, - expectedStatus: false, + expectedSuccess: false, expectedCircuitBreakerError: true, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. @@ -165,7 +166,7 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { cb := newCircuitBreaker(cfg, cfg.InitialDelay == 0, log.NewNopLogger(), registry) testCase.circuitBreakerSetup(cb) status, err := cb.tryAcquirePermit() - require.Equal(t, testCase.expectedStatus, status) + require.Equal(t, testCase.expectedSuccess, status) if testCase.expectedCircuitBreakerError { require.ErrorAs(t, err, &circuitBreakerOpenError{}) assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(testCase.expectedMetrics), metricNames...)) @@ -635,34 +636,34 @@ func TestIngester_FinishPushRequest(t *testing.T) { }, } for testName, testCase := range testCases { - reg := prometheus.NewPedanticRegistry() - cfg := defaultIngesterTestConfig(t) - cfg.CircuitBreakerConfig = CircuitBreakerConfig{ - Enabled: true, - PushTimeout: 2 * time.Second, - } + t.Run(testName, func(t *testing.T) { + reg := prometheus.NewPedanticRegistry() + cfg := defaultIngesterTestConfig(t) + cfg.CircuitBreakerConfig = CircuitBreakerConfig{ + Enabled: true, + PushTimeout: 2 * time.Second, + } - i, err := prepareIngesterWithBlocksStorage(t, cfg, nil, reg) - require.NoError(t, err) + i, err := prepareIngesterWithBlocksStorage(t, cfg, nil, reg) + require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), i)) - defer services.StopAndAwaitTerminated(context.Background(), i) //nolint:errcheck + require.NoError(t, services.StartAndAwaitRunning(context.Background(), i)) + defer services.StopAndAwaitTerminated(context.Background(), i) //nolint:errcheck - // Wait until the ingester is healthy - test.Poll(t, 100*time.Millisecond, 1, func() interface{} { - return i.lifecycler.HealthyInstancesCount() - }) + // Wait until the ingester is healthy + test.Poll(t, 100*time.Millisecond, 1, func() interface{} { + return i.lifecycler.HealthyInstancesCount() + }) - ctx := user.InjectOrgID(context.Background(), "test") + ctx := user.InjectOrgID(context.Background(), "test") - st := &pushRequestState{ - requestDuration: testCase.pushRequestDuration, - acquiredCircuitBreakerPermit: testCase.acquiredCircuitBreakerPermit, - pushErr: testCase.err, - } - ctx = context.WithValue(ctx, pushReqCtxKey, st) + st := &pushRequestState{ + requestDuration: testCase.pushRequestDuration, + acquiredCircuitBreakerPermit: testCase.acquiredCircuitBreakerPermit, + pushErr: testCase.err, + } + ctx = context.WithValue(ctx, pushReqCtxKey, st) - t.Run(testName, func(t *testing.T) { i.FinishPushRequest(ctx) assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(testCase.expectedMetrics), metricNames...)) }) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 338d37b9522..0767ca861db 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -544,15 +544,7 @@ func (i *Ingester) startingForFlusher(ctx context.Context) error { func (i *Ingester) starting(ctx context.Context) (err error) { defer func() { - if err == nil { - if i.cfg.CircuitBreakerConfig.InitialDelay == 0 { - i.circuitBreaker.setActive() - } else { - time.AfterFunc(i.cfg.CircuitBreakerConfig.InitialDelay, func() { - i.circuitBreaker.setActive() - }) - } - } else { + if err != nil { // if starting() fails for any reason (e.g., context canceled), // the lifecycler must be stopped. _ = services.StopAndAwaitTerminated(context.Background(), i.lifecycler) @@ -648,7 +640,18 @@ func (i *Ingester) starting(ctx context.Context) (err error) { } i.subservicesAfterIngesterRingLifecycler, err = createManagerThenStartAndAwaitHealthy(ctx, servs...) - return errors.Wrap(err, "failed to start ingester subservices after ingester ring lifecycler") + if err != nil { + return errors.Wrap(err, "failed to start ingester subservices after ingester ring lifecycler") + } + + if i.cfg.CircuitBreakerConfig.InitialDelay == 0 { + i.circuitBreaker.setActive() + } else { + time.AfterFunc(i.cfg.CircuitBreakerConfig.InitialDelay, func() { + i.circuitBreaker.setActive() + }) + } + return nil } func (i *Ingester) stoppingForFlusher(_ error) error { @@ -988,7 +991,7 @@ func getPushRequestState(ctx context.Context) *pushRequestState { // StartPushRequest checks if ingester can start push request, and increments relevant counters. // If new push request cannot be started, errors convertible to gRPC status code are returned, and metrics are updated. func (i *Ingester) StartPushRequest(ctx context.Context, reqSize int64) (context.Context, error) { - ctx, _, err := i.startPushRequest(ctx, reqSize, nil) + ctx, _, err := i.startPushRequest(ctx, reqSize) return ctx, err } @@ -1011,13 +1014,10 @@ func (i *Ingester) FinishPushRequest(ctx context.Context) { // In the first case, returned errors can be inspected/logged by middleware. Ingester.PushWithCleanup will wrap the error in util_log.DoNotLogError wrapper. // In the second case, returned errors will not be logged, because request will not reach any middleware. // -// Parameter finishPushRequestFn is used for testing purposes, and represents the function that finishes the push request that startPushRequest initialize, -// in case of an error during the execution of the latter. It defaults to FinishPushRequest. -// // If startPushRequest ends with no error, the resulting context includes a *pushRequestState object // containing relevant information about the push request started by this method. // The resulting boolean flag tells if the caller must call finish on this request. If not, there is already someone in the call stack who will do that. -func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64, finishPushRequestFn func(ctx context.Context)) (context.Context, bool, error) { +func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (context.Context, bool, error) { if err := i.checkAvailableForPush(); err != nil { return nil, false, err } @@ -1051,25 +1051,16 @@ func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64, finishPu inflightBytes = i.inflightPushRequestsBytes.Load() rejectEqualInflightBytes = true // if inflightBytes == limit, reject new request } - var instanceLimitsErr error - defer func() { - if instanceLimitsErr != nil { - // In this case a per-instance limit has been hit, and the corresponding error has to be passed - // to the function which finishes the push request, records the error with the circuit breaker, - // and gives it a possibly acquired permit back. - if finishPushRequestFn != nil { - finishPushRequestFn(ctx) - } else { - i.FinishPushRequest(ctx) - } - } - }() - instanceLimitsErr = i.checkInstanceLimits(inflight, inflightBytes, rejectEqualInflightBytes) + instanceLimitsErr := i.checkInstanceLimits(inflight, inflightBytes, rejectEqualInflightBytes) if instanceLimitsErr == nil { return ctx, true, nil } + // In this case a per-instance limit has been hit, and the corresponding error has to be passed + // to FinishPushRequest, which finishes the push request, records the error with the circuit breaker, + // and gives it a possibly acquired permit back. + i.FinishPushRequest(ctx) return nil, false, instanceLimitsErr } @@ -1117,7 +1108,7 @@ func (i *Ingester) PushWithCleanup(ctx context.Context, req *mimirpb.WriteReques ) // We need to replace the original context with the context returned by startPushRequest, // because the latter might store a new pushRequestState in the context. - ctx, shouldFinish, startPushErr = i.startPushRequest(ctx, reqSize, nil) + ctx, shouldFinish, startPushErr = i.startPushRequest(ctx, reqSize) if startPushErr != nil { return middleware.DoNotLogError{Err: startPushErr} } diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index 3ab22153bb5..ee9d2ef35cf 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -81,6 +81,7 @@ func mustNewActiveSeriesCustomTrackersConfigFromMap(t *testing.T, source map[str } func TestIngester_StartPushRequest(t *testing.T) { + const reqSize = 10 instanceLimits := &InstanceLimits{} pushReqState := &pushRequestState{ requestSize: 10, @@ -90,15 +91,16 @@ func TestIngester_StartPushRequest(t *testing.T) { ctxWithPushReqState := context.WithValue(ctx, pushReqCtxKey, pushReqState) type testCase struct { - ctx context.Context - failingIngester bool - cbOpen bool - cbInitialDelay time.Duration - instanceLimitReached bool - expectedStatus bool - expectedFinishPushRequestCalls int - verifyCtxFn func(context.Context, context.Context) - verifyErr func(error) + ctx context.Context + failingIngester bool + cbOpen bool + cbInitialDelay time.Duration + instanceLimitReached bool + expectedStatus bool + expectedInFlightPushRequests int64 + expectedInflightPushRequestsBytes int64 + verifyCtxFn func(context.Context, context.Context) + verifyErr func(error) } setupIngester := func(tc testCase) *failingIngester { @@ -142,53 +144,59 @@ func TestIngester_StartPushRequest(t *testing.T) { testCases := map[string]testCase{ "fail if ingester is not available for push": { - failingIngester: true, - ctx: ctx, - expectedStatus: false, - expectedFinishPushRequestCalls: 0, + failingIngester: true, + ctx: ctx, + expectedStatus: false, + expectedInFlightPushRequests: 0, + expectedInflightPushRequestsBytes: 0, verifyErr: func(err error) { require.ErrorAs(t, err, &unavailableError{}) }, }, "fail if circuit breaker is open": { - ctx: ctx, - cbOpen: true, - expectedStatus: false, - expectedFinishPushRequestCalls: 0, + ctx: ctx, + cbOpen: true, + expectedStatus: false, + expectedInFlightPushRequests: 0, + expectedInflightPushRequestsBytes: 0, verifyErr: func(err error) { require.ErrorAs(t, err, &circuitBreakerOpenError{}) }, }, "fail if instance limit is reached": { - ctx: ctx, - instanceLimitReached: true, - expectedStatus: false, - expectedFinishPushRequestCalls: 1, + ctx: ctx, + instanceLimitReached: true, + expectedStatus: false, + expectedInFlightPushRequests: 0, + expectedInflightPushRequestsBytes: 0, verifyErr: func(err error) { require.ErrorAs(t, err, &instanceLimitReachedError{}) }, }, "do not fail if circuit breaker is not active": { - ctx: ctx, - cbInitialDelay: 1 * time.Minute, - expectedStatus: true, - expectedFinishPushRequestCalls: 0, + ctx: ctx, + cbInitialDelay: 1 * time.Minute, + expectedStatus: true, + expectedInFlightPushRequests: 1, + expectedInflightPushRequestsBytes: reqSize, verifyCtxFn: func(inCtx, outCtx context.Context) { require.NotEqual(t, inCtx, outCtx) }, }, "do not fail and return the same context if it already contains a pushRequestState": { - ctx: ctxWithPushReqState, - expectedStatus: false, - expectedFinishPushRequestCalls: 0, + ctx: ctxWithPushReqState, + expectedStatus: false, + expectedInFlightPushRequests: 0, + expectedInflightPushRequestsBytes: 0, verifyCtxFn: func(inCtx, outCtx context.Context) { require.Equal(t, inCtx, outCtx) }, }, "do not fail and add pushRequestState to the context if everything is ok": { - ctx: ctx, - expectedStatus: true, - expectedFinishPushRequestCalls: 0, + ctx: ctx, + expectedStatus: true, + expectedInFlightPushRequests: 1, + expectedInflightPushRequestsBytes: reqSize, verifyCtxFn: func(inCtx, outCtx context.Context) { require.NotEqual(t, inCtx, outCtx) }, @@ -199,14 +207,10 @@ func TestIngester_StartPushRequest(t *testing.T) { failingIng := setupIngester(tc) defer services.StopAndAwaitTerminated(context.Background(), failingIng) //nolint:errcheck - finishPushRequestCount := 0 - finishPushRequestFn := func(ctx context.Context) { - failingIng.FinishPushRequest(ctx) - finishPushRequestCount++ - } - - ctx, shouldFinish, err := failingIng.startPushRequest(tc.ctx, 10, finishPushRequestFn) + ctx, shouldFinish, err := failingIng.startPushRequest(tc.ctx, reqSize) require.Equal(t, tc.expectedStatus, shouldFinish) + require.Equal(t, tc.expectedInFlightPushRequests, failingIng.inflightPushRequests.Load()) + require.Equal(t, tc.expectedInflightPushRequestsBytes, failingIng.inflightPushRequestsBytes.Load()) if err == nil { pushReqState := getPushRequestState(ctx) From 1d3a8bb22563f8711749023aecc205ea6584eef4 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Mon, 3 Jun 2024 23:56:18 +0200 Subject: [PATCH 15/16] Get rid of test-delay key from in context Signed-off-by: Yuri Nikolic --- pkg/ingester/circuitbreaker.go | 20 +++++++++----------- pkg/ingester/circuitbreaker_test.go | 16 ++++++---------- pkg/ingester/ingester.go | 2 +- 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index da9477fc643..b4684b75f7a 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -20,14 +20,11 @@ import ( "github.com/grafana/mimir/pkg/mimirpb" ) -type testCtxKey string - const ( - resultSuccess = "success" - resultError = "error" - resultOpen = "circuit_breaker_open" - defaultPushTimeout = 2 * time.Second - testDelayKey testCtxKey = "test-delay" + resultSuccess = "success" + resultError = "error" + resultOpen = "circuit_breaker_open" + defaultPushTimeout = 2 * time.Second ) type circuitBreakerMetrics struct { @@ -98,6 +95,9 @@ type circuitBreaker struct { metrics *circuitBreakerMetrics active atomic.Bool cb circuitbreaker.CircuitBreaker[any] + + // testRequestDelay is needed for testing purposes to simulate long lasting requests + testRequestDelay time.Duration } func newCircuitBreaker(cfg CircuitBreakerConfig, isActive bool, logger log.Logger, registerer prometheus.Registerer) *circuitBreaker { @@ -216,14 +216,12 @@ func (cb *circuitBreaker) recordResult(err error) { // It records the result of the push request with the circuit breaker. Push requests // that lasted longer than the configured timeout are treated as a failure. // The returned error is only used for testing purposes. -func (cb *circuitBreaker) finishPushRequest(ctx context.Context, duration time.Duration, pushErr error) error { +func (cb *circuitBreaker) finishPushRequest(duration time.Duration, pushErr error) error { if !cb.isActive() { return nil } if cb.cfg.testModeEnabled { - if testDelay, ok := ctx.Value(testDelayKey).(time.Duration); ok { - duration += testDelay - } + duration += cb.testRequestDelay } if cb.cfg.PushTimeout < duration { pushErr = context.DeadlineExceeded diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go index f84e2008437..4d65214859a 100644 --- a/pkg/ingester/circuitbreaker_test.go +++ b/pkg/ingester/circuitbreaker_test.go @@ -315,14 +315,13 @@ func TestCircuitBreaker_FinishPushRequest(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { registry := prometheus.NewRegistry() - ctx := context.Background() cfg := CircuitBreakerConfig{ Enabled: true, InitialDelay: testCase.initialDelay, PushTimeout: 2 * time.Second, } cb := newCircuitBreaker(cfg, cfg.InitialDelay == 0, log.NewNopLogger(), registry) - err := cb.finishPushRequest(ctx, testCase.pushRequestDuration, testCase.err) + err := cb.finishPushRequest(testCase.pushRequestDuration, testCase.err) if testCase.expectedErr == nil { require.NoError(t, err) } else { @@ -337,15 +336,13 @@ func TestIngester_PushToStorage_CircuitBreaker(t *testing.T) { pushTimeout := 100 * time.Millisecond tests := map[string]struct { expectedErrorWhenCircuitBreakerClosed error - ctx func(context.Context) context.Context + pushRequestDelay time.Duration limits InstanceLimits }{ "deadline exceeded": { expectedErrorWhenCircuitBreakerClosed: nil, limits: InstanceLimits{MaxInMemoryTenants: 3}, - ctx: func(ctx context.Context) context.Context { - return context.WithValue(ctx, testDelayKey, 2*pushTimeout) - }, + pushRequestDelay: pushTimeout, }, "instance limit hit": { expectedErrorWhenCircuitBreakerClosed: instanceLimitReachedError{}, @@ -432,9 +429,7 @@ func TestIngester_PushToStorage_CircuitBreaker(t *testing.T) { for _, req := range reqs { ctx := user.InjectOrgID(context.Background(), userID) count++ - if testCase.ctx != nil { - ctx = testCase.ctx(ctx) - } + i.circuitBreaker.testRequestDelay = testCase.pushRequestDelay err = i.PushToStorage(ctx, req) if initialDelayEnabled { if testCase.expectedErrorWhenCircuitBreakerClosed != nil { @@ -746,7 +741,8 @@ func TestIngester_Push_CircuitBreaker_DeadlineExceeded(t *testing.T) { for _, req := range reqs { ctx := user.InjectOrgID(context.Background(), userID) - ctx = context.WithValue(ctx, testDelayKey, 2*pushTimeout) + // Configure circuit breaker to delay push requests. + i.circuitBreaker.testRequestDelay = pushTimeout count++ ctx, err = i.StartPushRequest(ctx, int64(req.Size())) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 0767ca861db..f44c65fb27d 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -1005,7 +1005,7 @@ func (i *Ingester) FinishPushRequest(ctx context.Context) { i.inflightPushRequestsBytes.Sub(st.requestSize) } if st.acquiredCircuitBreakerPermit { - _ = i.circuitBreaker.finishPushRequest(ctx, st.requestDuration, st.pushErr) + _ = i.circuitBreaker.finishPushRequest(st.requestDuration, st.pushErr) } } From e136df3425fa473809413ae2b4af5a4563b24af1 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Tue, 4 Jun 2024 17:11:13 +0200 Subject: [PATCH 16/16] Fixing review findings Signed-off-by: Yuri Nikolic --- cmd/mimir/config-descriptor.json | 2 +- cmd/mimir/help-all.txt.tmpl | 2 +- .../configuration-parameters/index.md | 6 +- pkg/ingester/circuitbreaker.go | 67 +++++++++------- pkg/ingester/circuitbreaker_test.go | 80 +++++++++++-------- pkg/ingester/ingester.go | 17 ++-- 6 files changed, 95 insertions(+), 79 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index cb6785a4ae4..df83ac4d831 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -3206,7 +3206,7 @@ "kind": "field", "name": "initial_delay", "required": false, - "desc": "How long the circuit breaker should wait to start up after the corresponding ingester started. During that time both failures and successes will not be counted.", + "desc": "How long the circuit breaker should wait between an activation request and becoming effectively active. During that time both failures and successes will not be counted.", "fieldValue": null, "fieldDefaultValue": 0, "fieldFlag": "ingester.circuit-breaker.initial-delay", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 5df86e53bfc..e7ae95878ad 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1316,7 +1316,7 @@ Usage of ./cmd/mimir/mimir: -ingester.circuit-breaker.failure-threshold-percentage uint [experimental] Max percentage of requests that can fail over period before the circuit breaker opens (default 10) -ingester.circuit-breaker.initial-delay duration - [experimental] How long the circuit breaker should wait to start up after the corresponding ingester started. During that time both failures and successes will not be counted. + [experimental] How long the circuit breaker should wait between an activation request and becoming effectively active. During that time both failures and successes will not be counted. -ingester.circuit-breaker.push-timeout duration How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors (default 2s) -ingester.circuit-breaker.thresholding-period duration diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 58eaf184db7..a28ce1394b6 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -1243,9 +1243,9 @@ circuit_breaker: # CLI flag: -ingester.circuit-breaker.cooldown-period [cooldown_period: | default = 10s] - # (experimental) How long the circuit breaker should wait to start up after - # the corresponding ingester started. During that time both failures and - # successes will not be counted. + # (experimental) How long the circuit breaker should wait between an + # activation request and becoming effectively active. During that time both + # failures and successes will not be counted. # CLI flag: -ingester.circuit-breaker.initial-delay [initial_delay: | default = 0s] diff --git a/pkg/ingester/circuitbreaker.go b/pkg/ingester/circuitbreaker.go index b4684b75f7a..897294a4bb6 100644 --- a/pkg/ingester/circuitbreaker.go +++ b/pkg/ingester/circuitbreaker.go @@ -21,10 +21,10 @@ import ( ) const ( - resultSuccess = "success" - resultError = "error" - resultOpen = "circuit_breaker_open" - defaultPushTimeout = 2 * time.Second + circuitBreakerResultSuccess = "success" + circuitBreakerResultError = "error" + circuitBreakerResultOpen = "circuit_breaker_open" + circuitBreakerDefaultPushTimeout = 2 * time.Second ) type circuitBreakerMetrics struct { @@ -60,7 +60,7 @@ func newCircuitBreakerMetrics(r prometheus.Registerer, currentStateFn func() cir // We initialize all possible states for the circuitBreakerTransitions metrics cbMetrics.circuitBreakerTransitions.WithLabelValues(s.String()) } - for _, r := range []string{resultSuccess, resultError, resultOpen} { + for _, r := range []string{circuitBreakerResultSuccess, circuitBreakerResultError, circuitBreakerResultOpen} { // We initialize all possible results for the circuitBreakerResults metrics cbMetrics.circuitBreakerResults.WithLabelValues(r) } @@ -85,10 +85,12 @@ func (cfg *CircuitBreakerConfig) RegisterFlags(f *flag.FlagSet) { f.UintVar(&cfg.FailureExecutionThreshold, prefix+"failure-execution-threshold", 100, "How many requests must have been executed in period for the circuit breaker to be eligible to open for the rate of failures") f.DurationVar(&cfg.ThresholdingPeriod, prefix+"thresholding-period", time.Minute, "Moving window of time that the percentage of failed requests is computed over") f.DurationVar(&cfg.CooldownPeriod, prefix+"cooldown-period", 10*time.Second, "How long the circuit breaker will stay in the open state before allowing some requests") - f.DurationVar(&cfg.InitialDelay, prefix+"initial-delay", 0, "How long the circuit breaker should wait to start up after the corresponding ingester started. During that time both failures and successes will not be counted.") - f.DurationVar(&cfg.PushTimeout, prefix+"push-timeout", defaultPushTimeout, "How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors") + f.DurationVar(&cfg.InitialDelay, prefix+"initial-delay", 0, "How long the circuit breaker should wait between an activation request and becoming effectively active. During that time both failures and successes will not be counted.") + f.DurationVar(&cfg.PushTimeout, prefix+"push-timeout", circuitBreakerDefaultPushTimeout, "How long is execution of ingester's Push supposed to last before it is reported as timeout in a circuit breaker. This configuration is used for circuit breakers only, and timeout expirations are not reported as errors") } +// circuitBreaker abstracts the ingester's server-side circuit breaker functionality. +// A nil *circuitBreaker is a valid noop implementation. type circuitBreaker struct { cfg CircuitBreakerConfig logger log.Logger @@ -100,8 +102,11 @@ type circuitBreaker struct { testRequestDelay time.Duration } -func newCircuitBreaker(cfg CircuitBreakerConfig, isActive bool, logger log.Logger, registerer prometheus.Registerer) *circuitBreaker { - active := atomic.NewBool(isActive) +func newCircuitBreaker(cfg CircuitBreakerConfig, logger log.Logger, registerer prometheus.Registerer) *circuitBreaker { + if !cfg.Enabled { + return nil + } + active := atomic.NewBool(false) cb := circuitBreaker{ cfg: cfg, logger: logger, @@ -142,7 +147,7 @@ func newCircuitBreaker(cfg CircuitBreakerConfig, isActive bool, logger log.Logge return &cb } -func isFailure(err error) bool { +func isCircuitBreakerFailure(err error) bool { if err == nil { return false } @@ -169,17 +174,19 @@ func isFailure(err error) bool { } func (cb *circuitBreaker) isActive() bool { - if cb == nil { - return false - } - return cb.active.Load() + return cb != nil && cb.active.Load() } -func (cb *circuitBreaker) setActive() { +func (cb *circuitBreaker) activate() { if cb == nil { return } - cb.active.Store(true) + if cb.cfg.InitialDelay == 0 { + cb.active.Store(true) + } + time.AfterFunc(cb.cfg.InitialDelay, func() { + cb.active.Store(true) + }) } // tryAcquirePermit tries to acquire a permit to use the circuit breaker and returns whether a permit was acquired. @@ -192,25 +199,12 @@ func (cb *circuitBreaker) tryAcquirePermit() (bool, error) { return false, nil } if !cb.cb.TryAcquirePermit() { - cb.metrics.circuitBreakerResults.WithLabelValues(resultOpen).Inc() + cb.metrics.circuitBreakerResults.WithLabelValues(circuitBreakerResultOpen).Inc() return false, newCircuitBreakerOpenError(cb.cb.RemainingDelay()) } return true, nil } -func (cb *circuitBreaker) recordResult(err error) { - if !cb.isActive() { - return - } - if err != nil && isFailure(err) { - cb.cb.RecordFailure() - cb.metrics.circuitBreakerResults.WithLabelValues(resultError).Inc() - } else { - cb.metrics.circuitBreakerResults.WithLabelValues(resultSuccess).Inc() - cb.cb.RecordSuccess() - } -} - // finishPushRequest should be called to complete the push request executed upon a // successfully acquired circuit breaker permit. // It records the result of the push request with the circuit breaker. Push requests @@ -229,3 +223,16 @@ func (cb *circuitBreaker) finishPushRequest(duration time.Duration, pushErr erro cb.recordResult(pushErr) return pushErr } + +func (cb *circuitBreaker) recordResult(err error) { + if !cb.isActive() { + return + } + if err != nil && isCircuitBreakerFailure(err) { + cb.cb.RecordFailure() + cb.metrics.circuitBreakerResults.WithLabelValues(circuitBreakerResultError).Inc() + } else { + cb.metrics.circuitBreakerResults.WithLabelValues(circuitBreakerResultSuccess).Inc() + cb.cb.RecordSuccess() + } +} diff --git a/pkg/ingester/circuitbreaker_test.go b/pkg/ingester/circuitbreaker_test.go index 4d65214859a..7240b689b82 100644 --- a/pkg/ingester/circuitbreaker_test.go +++ b/pkg/ingester/circuitbreaker_test.go @@ -28,30 +28,30 @@ import ( func TestIsFailure(t *testing.T) { t.Run("no error", func(t *testing.T) { - require.False(t, isFailure(nil)) + require.False(t, isCircuitBreakerFailure(nil)) }) t.Run("context cancelled", func(t *testing.T) { - require.False(t, isFailure(context.Canceled)) - require.False(t, isFailure(fmt.Errorf("%w", context.Canceled))) + require.False(t, isCircuitBreakerFailure(context.Canceled)) + require.False(t, isCircuitBreakerFailure(fmt.Errorf("%w", context.Canceled))) }) t.Run("gRPC context cancelled", func(t *testing.T) { err := status.Error(codes.Canceled, "cancelled!") - require.False(t, isFailure(err)) - require.False(t, isFailure(fmt.Errorf("%w", err))) + require.False(t, isCircuitBreakerFailure(err)) + require.False(t, isCircuitBreakerFailure(fmt.Errorf("%w", err))) }) t.Run("gRPC deadline exceeded", func(t *testing.T) { err := status.Error(codes.DeadlineExceeded, "broken!") - require.True(t, isFailure(err)) - require.True(t, isFailure(fmt.Errorf("%w", err))) + require.True(t, isCircuitBreakerFailure(err)) + require.True(t, isCircuitBreakerFailure(fmt.Errorf("%w", err))) }) t.Run("gRPC unavailable with INSTANCE_LIMIT details", func(t *testing.T) { err := newInstanceLimitReachedError("broken") - require.True(t, isFailure(err)) - require.True(t, isFailure(fmt.Errorf("%w", err))) + require.True(t, isCircuitBreakerFailure(err)) + require.True(t, isCircuitBreakerFailure(fmt.Errorf("%w", err))) }) t.Run("gRPC unavailable with SERVICE_UNAVAILABLE details is not a failure", func(t *testing.T) { @@ -59,14 +59,14 @@ func TestIsFailure(t *testing.T) { stat, err := stat.WithDetails(&mimirpb.ErrorDetails{Cause: mimirpb.SERVICE_UNAVAILABLE}) require.NoError(t, err) err = stat.Err() - require.False(t, isFailure(err)) - require.False(t, isFailure(fmt.Errorf("%w", err))) + require.False(t, isCircuitBreakerFailure(err)) + require.False(t, isCircuitBreakerFailure(fmt.Errorf("%w", err))) }) t.Run("gRPC unavailable without details is not a failure", func(t *testing.T) { err := status.Error(codes.Unavailable, "broken!") - require.False(t, isFailure(err)) - require.False(t, isFailure(fmt.Errorf("%w", err))) + require.False(t, isCircuitBreakerFailure(err)) + require.False(t, isCircuitBreakerFailure(fmt.Errorf("%w", err))) }) } @@ -77,10 +77,9 @@ func TestCircuitBreaker_IsActive(t *testing.T) { registry := prometheus.NewRegistry() cfg := CircuitBreakerConfig{Enabled: true, InitialDelay: 10 * time.Millisecond} - cb = newCircuitBreaker(cfg, false, log.NewNopLogger(), registry) - time.AfterFunc(cfg.InitialDelay, func() { - cb.setActive() - }) + cb = newCircuitBreaker(cfg, log.NewNopLogger(), registry) + cb.activate() + // When InitialDelay is set, circuit breaker is not immediately active. require.False(t, cb.isActive()) @@ -104,18 +103,26 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { expectedMetrics string }{ "if circuit breaker is not active, status false and no error are returned": { - initialDelay: 1 * time.Minute, - circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.Close() }, + initialDelay: 1 * time.Minute, + circuitBreakerSetup: func(cb *circuitBreaker) { + cb.active.Store(false) + }, expectedSuccess: false, expectedCircuitBreakerError: false, }, "if circuit breaker closed, status true and no error are returned": { - circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.Close() }, + circuitBreakerSetup: func(cb *circuitBreaker) { + cb.activate() + cb.cb.Close() + }, expectedSuccess: true, expectedCircuitBreakerError: false, }, "if circuit breaker open, status false and a circuitBreakerErrorOpen are returned": { - circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.Open() }, + circuitBreakerSetup: func(cb *circuitBreaker) { + cb.activate() + cb.cb.Open() + }, expectedSuccess: false, expectedCircuitBreakerError: true, expectedMetrics: ` @@ -137,7 +144,10 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { `, }, "if circuit breaker half-open, status false and a circuitBreakerErrorOpen are returned": { - circuitBreakerSetup: func(cb *circuitBreaker) { cb.cb.HalfOpen() }, + circuitBreakerSetup: func(cb *circuitBreaker) { + cb.activate() + cb.cb.HalfOpen() + }, expectedSuccess: false, expectedCircuitBreakerError: true, expectedMetrics: ` @@ -162,8 +172,8 @@ func TestCircuitBreaker_TryAcquirePermit(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { registry := prometheus.NewRegistry() - cfg := CircuitBreakerConfig{Enabled: true, CooldownPeriod: 10 * time.Second, InitialDelay: testCase.initialDelay} - cb := newCircuitBreaker(cfg, cfg.InitialDelay == 0, log.NewNopLogger(), registry) + cfg := CircuitBreakerConfig{Enabled: true, CooldownPeriod: 10 * time.Second} + cb := newCircuitBreaker(cfg, log.NewNopLogger(), registry) testCase.circuitBreakerSetup(cb) status, err := cb.tryAcquirePermit() require.Equal(t, testCase.expectedSuccess, status) @@ -220,7 +230,8 @@ func TestCircuitBreaker_RecordResult(t *testing.T) { for testName, testCase := range testCases { t.Run(testName, func(t *testing.T) { registry := prometheus.NewRegistry() - cb := newCircuitBreaker(cfg, true, log.NewNopLogger(), registry) + cb := newCircuitBreaker(cfg, log.NewNopLogger(), registry) + cb.activate() cb.recordResult(testCase.err) assert.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(testCase.expectedMetrics), metricNames...)) }) @@ -233,13 +244,14 @@ func TestCircuitBreaker_FinishPushRequest(t *testing.T) { } testCases := map[string]struct { pushRequestDuration time.Duration - initialDelay time.Duration + isActive bool err error expectedErr error expectedMetrics string }{ "with a permit acquired, pushRequestDuration lower than PushTimeout and no input error, finishPushRequest gives success": { pushRequestDuration: 1 * time.Second, + isActive: true, err: nil, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. @@ -251,7 +263,7 @@ func TestCircuitBreaker_FinishPushRequest(t *testing.T) { }, "with circuit breaker not active, pushRequestDuration lower than PushTimeout and no input error, finishPushRequest does nothing": { pushRequestDuration: 1 * time.Second, - initialDelay: 1 * time.Minute, + isActive: false, err: nil, expectedMetrics: ` # HELP cortex_ingester_circuit_breaker_results_total Results of executing requests via the circuit breaker. @@ -263,6 +275,7 @@ func TestCircuitBreaker_FinishPushRequest(t *testing.T) { }, "with circuit breaker active, pushRequestDuration higher than PushTimeout and no input error, finishPushRequest gives context deadline exceeded error": { pushRequestDuration: 3 * time.Second, + isActive: true, err: nil, expectedErr: context.DeadlineExceeded, expectedMetrics: ` @@ -275,7 +288,7 @@ func TestCircuitBreaker_FinishPushRequest(t *testing.T) { }, "with circuit breaker not active, pushRequestDuration higher than PushTimeout and no input error, finishPushRequest does nothing": { pushRequestDuration: 3 * time.Second, - initialDelay: 1 * time.Minute, + isActive: false, err: nil, expectedErr: nil, expectedMetrics: ` @@ -288,6 +301,7 @@ func TestCircuitBreaker_FinishPushRequest(t *testing.T) { }, "with circuit breaker active, pushRequestDuration higher than PushTimeout and an input error different from context deadline exceeded, finishPushRequest gives context deadline exceeded error": { pushRequestDuration: 3 * time.Second, + isActive: true, err: newInstanceLimitReachedError("error"), expectedErr: context.DeadlineExceeded, expectedMetrics: ` @@ -300,7 +314,7 @@ func TestCircuitBreaker_FinishPushRequest(t *testing.T) { }, "with circuit breaker not active, pushRequestDuration higher than PushTimeout and an input error different from context deadline exceeded, finishPushRequest does nothing": { pushRequestDuration: 3 * time.Second, - initialDelay: 1 * time.Minute, + isActive: false, err: newInstanceLimitReachedError("error"), expectedErr: nil, expectedMetrics: ` @@ -316,11 +330,11 @@ func TestCircuitBreaker_FinishPushRequest(t *testing.T) { t.Run(testName, func(t *testing.T) { registry := prometheus.NewRegistry() cfg := CircuitBreakerConfig{ - Enabled: true, - InitialDelay: testCase.initialDelay, - PushTimeout: 2 * time.Second, + Enabled: true, + PushTimeout: 2 * time.Second, } - cb := newCircuitBreaker(cfg, cfg.InitialDelay == 0, log.NewNopLogger(), registry) + cb := newCircuitBreaker(cfg, log.NewNopLogger(), registry) + cb.active.Store(testCase.isActive) err := cb.finishPushRequest(testCase.pushRequestDuration, testCase.err) if testCase.expectedErr == nil { require.NoError(t, err) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index f44c65fb27d..77e84bde583 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -398,10 +398,8 @@ func New(cfg Config, limits *validation.Overrides, ingestersRing ring.ReadRing, i.metrics = newIngesterMetrics(registerer, cfg.ActiveSeriesMetrics.Enabled, i.getInstanceLimits, i.ingestionRate, &i.inflightPushRequests, &i.inflightPushRequestsBytes) i.activeGroups = activeGroupsCleanupService - if cfg.CircuitBreakerConfig.Enabled { - // We create an inactive circuit breaker, which will be activated on a successful completion of starting. - i.circuitBreaker = newCircuitBreaker(cfg.CircuitBreakerConfig, false, logger, registerer) - } + // We create a circuit breaker, which will be activated on a successful completion of starting. + i.circuitBreaker = newCircuitBreaker(cfg.CircuitBreakerConfig, logger, registerer) if registerer != nil { promauto.With(registerer).NewGaugeFunc(prometheus.GaugeOpts{ @@ -644,13 +642,7 @@ func (i *Ingester) starting(ctx context.Context) (err error) { return errors.Wrap(err, "failed to start ingester subservices after ingester ring lifecycler") } - if i.cfg.CircuitBreakerConfig.InitialDelay == 0 { - i.circuitBreaker.setActive() - } else { - time.AfterFunc(i.cfg.CircuitBreakerConfig.InitialDelay, func() { - i.circuitBreaker.setActive() - }) - } + i.circuitBreaker.activate() return nil } @@ -1054,12 +1046,15 @@ func (i *Ingester) startPushRequest(ctx context.Context, reqSize int64) (context instanceLimitsErr := i.checkInstanceLimits(inflight, inflightBytes, rejectEqualInflightBytes) if instanceLimitsErr == nil { + // In this case a pull request has been successfully started, and we return + // the context enriched with the corresponding pushRequestState object. return ctx, true, nil } // In this case a per-instance limit has been hit, and the corresponding error has to be passed // to FinishPushRequest, which finishes the push request, records the error with the circuit breaker, // and gives it a possibly acquired permit back. + st.pushErr = instanceLimitsErr i.FinishPushRequest(ctx) return nil, false, instanceLimitsErr }