grafana · krajorama · Apr 3, 2024 · Apr 2, 2024 · Apr 2, 2024 · Apr 2, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -24,6 +24,7 @@
 * [BUGFIX] Ingester: don't retain blocks if they finish exactly on the boundary of the retention window. #7656
 * [BUGFIX] Bug-fixes and improvements to experimental native histograms. #7744
 * [BUGFIX] Querier: return an error when a query uses `label_join` with an invalid destination label name. #7744
+* [BUGFIX] Ingester: turn native histogram validation errors in TSDB into soft ingester errors that result in returning 4xx to the end-user instead of 5xx. #7736 #7773
 
 ### Mixin
 
@@ -122,7 +123,6 @@
 * [CHANGE] The configuration option `-querier.max-query-into-future` has been deprecated and will be removed in Mimir 2.14. #7496
 * [CHANGE] Distributor: the metric `cortex_distributor_sample_delay_seconds` has been deprecated and will be removed in Mimir 2.14. #7516
 * [CHANGE] Query-frontend: The deprecated YAML setting `frontend.cache_unaligned_requests` has been moved to `limits.cache_unaligned_requests`. #7519
-* [CHANGE] Distributor: validate that in integer native histograms the zero, negative and positive bucket counts add up to the overall count of the histogram. Such errors are now reported as 4xx and not 5xx and show up in the `cortex_discarded_samples_total` with the label `reason="native_histogram_bucket_count_mismatch"`. #7736
 * [FEATURE] Introduce `-server.log-source-ips-full` option to log all IPs from `Forwarded`, `X-Real-IP`, `X-Forwarded-For` headers. #7250
 * [FEATURE] Introduce `-tenant-federation.max-tenants` option to limit the max number of tenants allowed for requests when federation is enabled. #6959
 * [FEATURE] Cardinality API: added a new `count_method` parameter which enables counting active label names. #7085

@@ -1481,9 +1481,48 @@ This non-critical error occurs when Mimir receives a write request that contains
 The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
 {{< /admonition >}}
 
-### err-mimir-native-histogram-bucket-count-mismatch
+### err-mimir-native-histogram-count-mismatch
 
-This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where the zero, positive and negative bucket counts do not add up to the overall count of the native histogram.
+This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram
+where the bucket counts don't add up to the overall count recorded in the native histogram, provided that the overall
+sum is a regular float number.
+
+{{< admonition type="note" >}}
+The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
+{{< /admonition >}}
+
+### err-mimir-native-histogram-count-not-big-enough
+
+This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram
+where the bucket counts add up to a higher number than the overall count recorded in the native histogram, provided
+that the overall sum is not a number (NaN).
+
+{{< admonition type="note" >}}
+The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
+{{< /admonition >}}
+
+### err-mimir-native-histogram-negative-bucket-count
+
+This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram
+where some bucket count is negative.
+
+{{< admonition type="note" >}}
+The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
+{{< /admonition >}}
+
+### err-mimir-native-histogram-span-negative-offset
+
+This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram
+where a bucket span has a negative offset.
+
+{{< admonition type="note" >}}
+The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
+{{< /admonition >}}
+
+### err-mimir-native-histogram-spans-buckets-mismatch
+
+This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram
+where the number of bucket counts does not agree with the number of buckets encoded in the bucket spans.
 
 {{< admonition type="note" >}}
 The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.

@@ -38,7 +38,6 @@ var (
 	reasonLabelValueTooLong            = globalerror.SeriesLabelValueTooLong.LabelValue()
 	reasonMaxNativeHistogramBuckets    = globalerror.MaxNativeHistogramBuckets.LabelValue()
 	reasonInvalidNativeHistogramSchema = globalerror.InvalidSchemaNativeHistogram.LabelValue()
-	reasonBucketCountMismatch          = globalerror.BucketCountMismatch.LabelValue()
 	reasonDuplicateLabelNames          = globalerror.SeriesWithDuplicateLabelNames.LabelValue()
 	reasonTooFarInFuture               = globalerror.SampleTooFarInFuture.LabelValue()
 
@@ -80,7 +79,6 @@ var (
 	maxNativeHistogramBucketsMsgFormat    = globalerror.MaxNativeHistogramBuckets.Message("received a native histogram sample with too many buckets, timestamp: %d series: %s, buckets: %d, limit: %d")
 	notReducibleNativeHistogramMsgFormat  = globalerror.NotReducibleNativeHistogram.Message("received a native histogram sample with too many buckets and cannot reduce, timestamp: %d series: %s, buckets: %d, limit: %d")
 	invalidSchemaNativeHistogramMsgFormat = globalerror.InvalidSchemaNativeHistogram.Message("received a native histogram sample with an invalid schema: %d")
-	bucketCountMismatchMsgFormat          = globalerror.BucketCountMismatch.Message("native histogram bucket count mismatch, timestamp: %d, series: %s, expected %v, got %v")
 	sampleTimestampTooNewMsgFormat        = globalerror.SampleTooFarInFuture.MessageWithPerTenantLimitConfig(
 		"received a sample whose timestamp is too far in the future, timestamp: %d series: '%.200s'",
 		validation.CreationGracePeriodFlag,
@@ -123,7 +121,6 @@ type sampleValidationMetrics struct {
 	labelValueTooLong            *prometheus.CounterVec
 	maxNativeHistogramBuckets    *prometheus.CounterVec
 	invalidNativeHistogramSchema *prometheus.CounterVec
-	bucketCountMismatch          *prometheus.CounterVec
 	duplicateLabelNames          *prometheus.CounterVec
 	tooFarInFuture               *prometheus.CounterVec
 }
@@ -138,7 +135,6 @@ func (m *sampleValidationMetrics) deleteUserMetrics(userID string) {
 	m.labelValueTooLong.DeletePartialMatch(filter)
 	m.maxNativeHistogramBuckets.DeletePartialMatch(filter)
 	m.invalidNativeHistogramSchema.DeletePartialMatch(filter)
-	m.bucketCountMismatch.DeletePartialMatch(filter)
 	m.duplicateLabelNames.DeletePartialMatch(filter)
 	m.tooFarInFuture.DeletePartialMatch(filter)
 }
@@ -152,7 +148,6 @@ func (m *sampleValidationMetrics) deleteUserMetricsForGroup(userID, group string
 	m.labelValueTooLong.DeleteLabelValues(userID, group)
 	m.maxNativeHistogramBuckets.DeleteLabelValues(userID, group)
 	m.invalidNativeHistogramSchema.DeleteLabelValues(userID, group)
-	m.bucketCountMismatch.DeleteLabelValues(userID, group)
 	m.duplicateLabelNames.DeleteLabelValues(userID, group)
 	m.tooFarInFuture.DeleteLabelValues(userID, group)
 }
@@ -167,7 +162,6 @@ func newSampleValidationMetrics(r prometheus.Registerer) *sampleValidationMetric
 		labelValueTooLong:            validation.DiscardedSamplesCounter(r, reasonLabelValueTooLong),
 		maxNativeHistogramBuckets:    validation.DiscardedSamplesCounter(r, reasonMaxNativeHistogramBuckets),
 		invalidNativeHistogramSchema: validation.DiscardedSamplesCounter(r, reasonInvalidNativeHistogramSchema),
-		bucketCountMismatch:          validation.DiscardedSamplesCounter(r, reasonBucketCountMismatch),
 		duplicateLabelNames:          validation.DiscardedSamplesCounter(r, reasonDuplicateLabelNames),
 		tooFarInFuture:               validation.DiscardedSamplesCounter(r, reasonTooFarInFuture),
 	}
@@ -257,26 +251,6 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam
 		}
 	}
 
-	// Check that bucket counts including zero bucket add up to the overall count.
-	if !s.IsFloatHistogram() {
-		// Do nothing for float histograms, due to floating point precision issues, we don't check the bucket count.
-		count := s.GetZeroCountInt()
-		bucketCount := int64(0)
-		for _, c := range s.GetNegativeDeltas() {
-			bucketCount += c
-			count += uint64(bucketCount)
-		}
-		bucketCount = int64(0)
-		for _, c := range s.GetPositiveDeltas() {
-			bucketCount += c
-			count += uint64(bucketCount)
-		}
-		if count != s.GetCountInt() {
-			m.bucketCountMismatch.WithLabelValues(userID, group).Inc()
-			return fmt.Errorf(bucketCountMismatchMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), s.GetCountInt(), count)
-		}
-	}
-
 	return nil
 }
 

@@ -584,100 +584,6 @@ func TestInvalidNativeHistogramSchema(t *testing.T) {
 	`), "cortex_discarded_samples_total"))
 }
 
-func TestInvalidBucketCountHistogram(t *testing.T) {
-	testCases := map[string]struct {
-		h             *mimirpb.Histogram
-		expectedError error
-	}{
-		"a valid zero counts causes no error": {
-			h:             &mimirpb.Histogram{},
-			expectedError: nil,
-		},
-		"a valid integer histogram causes no error": {
-			h: &mimirpb.Histogram{
-				Count:          &mimirpb.Histogram_CountInt{CountInt: 5},
-				Sum:            10,
-				Schema:         1,
-				ZeroThreshold:  0.001,
-				ZeroCount:      &mimirpb.Histogram_ZeroCountInt{ZeroCountInt: 1},
-				NegativeSpans:  []mimirpb.BucketSpan{{Offset: 0, Length: 2}},
-				NegativeDeltas: []int64{1, 1},
-				PositiveSpans:  []mimirpb.BucketSpan{{Offset: 0, Length: 1}},
-				PositiveDeltas: []int64{1},
-				ResetHint:      mimirpb.Histogram_UNKNOWN,
-				Timestamp:      0,
-			},
-			expectedError: nil,
-		},
-		"a valid float histogram causes no error": {
-			h: &mimirpb.Histogram{
-				Count:          &mimirpb.Histogram_CountFloat{CountFloat: 5.5},
-				Sum:            10,
-				Schema:         1,
-				ZeroThreshold:  0.001,
-				ZeroCount:      &mimirpb.Histogram_ZeroCountFloat{ZeroCountFloat: 1.5},
-				NegativeSpans:  []mimirpb.BucketSpan{{Offset: 0, Length: 2}},
-				NegativeCounts: []float64{1.0, 2.0},
-				PositiveSpans:  []mimirpb.BucketSpan{{Offset: 0, Length: 1}},
-				PositiveCounts: []float64{1.0},
-				ResetHint:      mimirpb.Histogram_UNKNOWN,
-				Timestamp:      0,
-			},
-			expectedError: nil,
-		},
-		"an integer histogram with the wrong overall count": {
-			h: &mimirpb.Histogram{
-				Count:          &mimirpb.Histogram_CountInt{CountInt: 4},
-				Sum:            10,
-				Schema:         1,
-				ZeroThreshold:  0.001,
-				ZeroCount:      &mimirpb.Histogram_ZeroCountInt{ZeroCountInt: 1},
-				NegativeSpans:  []mimirpb.BucketSpan{{Offset: 0, Length: 2}},
-				NegativeDeltas: []int64{1, 1},
-				PositiveSpans:  []mimirpb.BucketSpan{{Offset: 0, Length: 1}},
-				PositiveDeltas: []int64{1},
-				ResetHint:      mimirpb.Histogram_UNKNOWN,
-				Timestamp:      0,
-			},
-			expectedError: fmt.Errorf("native histogram bucket count mismatch, timestamp: 0, series: a{a=\"a\"}, expected 4, got 5 (err-mimir-native-histogram-bucket-count-mismatch)"),
-		},
-		"a float histogram with the wrong overall count": {
-			h: &mimirpb.Histogram{
-				Count:          &mimirpb.Histogram_CountFloat{CountFloat: 4.5},
-				Sum:            10,
-				Schema:         1,
-				ZeroThreshold:  0.001,
-				ZeroCount:      &mimirpb.Histogram_ZeroCountFloat{ZeroCountFloat: 1.5},
-				NegativeSpans:  []mimirpb.BucketSpan{{Offset: 0, Length: 2}},
-				NegativeCounts: []float64{1.0, 2.0},
-				PositiveSpans:  []mimirpb.BucketSpan{{Offset: 0, Length: 1}},
-				PositiveCounts: []float64{1.0},
-				ResetHint:      mimirpb.Histogram_UNKNOWN,
-				Timestamp:      0,
-			},
-			// Due to floating point precision issues, this case is not an error at the moment.
-			expectedError: nil,
-		},
-	}
-
-	registry := prometheus.NewRegistry()
-	metrics := newSampleValidationMetrics(registry)
-	cfg := sampleValidationCfg{}
-	labels := []mimirpb.LabelAdapter{{Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}}
-	for testName, testCase := range testCases {
-		t.Run(testName, func(t *testing.T) {
-			err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, testCase.h)
-			require.Equal(t, testCase.expectedError, err)
-		})
-	}
-
-	require.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(`
-			# HELP cortex_discarded_samples_total The total number of samples that were discarded.
-			# TYPE cortex_discarded_samples_total counter
-			cortex_discarded_samples_total{group="group-1",reason="native_histogram_bucket_count_mismatch",user="user-1"} 1
-	`), "cortex_discarded_samples_total"))
-}
-
 func tooManyLabelsArgs(series []mimirpb.LabelAdapter, limit int) []any {
 	metric := mimirpb.FromLabelAdaptersToMetric(series).String()
 	ellipsis := ""

@@ -455,6 +455,43 @@ var _ ingesterError = perMetricMetadataLimitReachedError{}
 // Ensure that perMetricMetadataLimitReachedError is an softError.
 var _ softError = perMetricMetadataLimitReachedError{}
 
+// nativeHistogramValidationError indicates that a native histogram sample failed validation; id identifies the specific failure.
+type nativeHistogramValidationError struct {
+	id           globalerror.ID
+	originalErr  error
+	seriesLabels []mimirpb.LabelAdapter
+	timestamp    model.Time
+}
+
+func newNativeHistogramValidationError(id globalerror.ID, originalErr error, timestamp model.Time, seriesLabels []mimirpb.LabelAdapter) nativeHistogramValidationError {
+	return nativeHistogramValidationError{
+		id:           id,
+		originalErr:  originalErr,
+		seriesLabels: seriesLabels,
+		timestamp:    timestamp,
+	}
+}
+
+func (e nativeHistogramValidationError) Error() string {
+	return e.id.Message(fmt.Sprintf("err: %v. timestamp=%s, series=%s",
+		e.originalErr,
+		e.timestamp.Time().UTC().Format(time.RFC3339Nano),
+		e.seriesLabels,
+	))
+}
+
+func (e nativeHistogramValidationError) errorCause() mimirpb.ErrorCause {
+	return mimirpb.BAD_DATA
+}
+
+func (e nativeHistogramValidationError) soft() {}
+
+// Ensure that nativeHistogramValidationError is an ingesterError.
+var _ ingesterError = nativeHistogramValidationError{}
+
+// Ensure that nativeHistogramValidationError is a softError.
+var _ softError = nativeHistogramValidationError{}
+
 // unavailableError is an ingesterError indicating that the ingester is unavailable.
 type unavailableError struct {
 	state services.State
@@ -550,6 +587,7 @@ type ingesterErrSamplers struct {
 	maxMetadataPerMetricLimitExceeded *log.Sampler
 	maxSeriesPerUserLimitExceeded     *log.Sampler
 	maxMetadataPerUserLimitExceeded   *log.Sampler
+	nativeHistogramValidationError    *log.Sampler
 }
 
 func newIngesterErrSamplers(freq int64) ingesterErrSamplers {
@@ -563,6 +601,7 @@ func newIngesterErrSamplers(freq int64) ingesterErrSamplers {
 		log.NewSampler(freq),
 		log.NewSampler(freq),
 		log.NewSampler(freq),
+		log.NewSampler(freq),
 	}
 }
 

@@ -1290,6 +1290,33 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre
 				return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels)
 			})
 			return true
+
+		// Map TSDB native histogram validation errors to soft errors.
+		case errors.Is(err, histogram.ErrHistogramCountMismatch):
+			updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
+				return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels)
+			})
+			return true
+		case errors.Is(err, histogram.ErrHistogramCountNotBigEnough):
+			updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
+				return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels)
+			})
+			return true
+		case errors.Is(err, histogram.ErrHistogramNegativeBucketCount):
+			updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
+				return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels)
+			})
+			return true
+		case errors.Is(err, histogram.ErrHistogramSpanNegativeOffset):
+			updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
+				return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels)
+			})
+			return true
+		case errors.Is(err, histogram.ErrHistogramSpansBucketsMismatch):
+			updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
+				return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels)
+			})
+			return true
 		}
 		return false
 	}