diff --git a/CHANGELOG.md b/CHANGELOG.md index bd4dfa9e9c6..045e6dd0154 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,6 +26,7 @@ * [BUGFIX] Bug-fixes and improvements to experimental native histograms. #7744 * [BUGFIX] Querier: return an error when a query uses `label_join` with an invalid destination label name. #7744 * [BUGFIX] Compactor: correct outstanding job estimation in metrics and `compaction-planner` tool when block labels differ. #7745 +* [BUGFIX] Ingester: turn native histogram validation errors in TSDB into soft ingester errors that result in returning 4xx to the end-user instead of 5xx. When such an error occurs, the counter `cortex_discarded_samples_total` is increased with the `reason` label set to `"invalid-native-histogram"`. #7736 #7773 ### Mixin @@ -128,7 +129,6 @@ * [CHANGE] The configuration option `-querier.max-query-into-future` has been deprecated and will be removed in Mimir 2.14. #7496 * [CHANGE] Distributor: the metric `cortex_distributor_sample_delay_seconds` has been deprecated and will be removed in Mimir 2.14. #7516 * [CHANGE] Query-frontend: The deprecated YAML setting `frontend.cache_unaligned_requests` has been moved to `limits.cache_unaligned_requests`. #7519 -* [CHANGE] Distributor: validate that in integer native histograms the zero, negative and positive bucket counts add up to the overall count of the histogram. Such errors are now reported as 4xx and not 5xx and show up in the `cortex_discarded_samples_total` with the label `reason="native_histogram_bucket_count_mismatch"`. #7736 * [FEATURE] Introduce `-server.log-source-ips-full` option to log all IPs from `Forwarded`, `X-Real-IP`, `X-Forwarded-For` headers. #7250 * [FEATURE] Introduce `-tenant-federation.max-tenants` option to limit the max number of tenants allowed for requests when federation is enabled. #6959 * [FEATURE] Cardinality API: added a new `count_method` parameter which enables counting active label names. #7085 diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index 13975693671..4d480867c53 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1477,9 +1477,48 @@ This non-critical error occurs when Mimir receives a write request that contains The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. {{< /admonition >}} -### err-mimir-native-histogram-bucket-count-mismatch +### err-mimir-native-histogram-count-mismatch -This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram where the zero, positive and negative bucket counts do not add up to the overall count of the native histogram. +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where the bucket counts don't add up to the overall count recorded in the native histogram, provided that the overall +sum is a regular float number. + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. +{{< /admonition >}}
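To make the bucket-count check described above concrete, here is a minimal Go sketch of how the observation count implied by an integer native histogram's delta-encoded buckets compares to its `Count` field. The authoritative checks are the Prometheus TSDB validation functions that the ingester now relies on, so treat this only as an illustration.

```go
package main

import "fmt"

// bucketTotal reconstructs the observation count implied by an integer native
// histogram's buckets. Bucket counts are delta-encoded: each entry is the
// difference from the previous bucket's absolute count.
func bucketTotal(zeroCount uint64, negativeDeltas, positiveDeltas []int64) uint64 {
	total := zeroCount
	for _, deltas := range [][]int64{negativeDeltas, positiveDeltas} {
		var abs int64
		for _, d := range deltas {
			abs += d
			total += uint64(abs)
		}
	}
	return total
}

func main() {
	// Zero bucket: 1 observation. Negative deltas {1, 1} decode to buckets {1, 2},
	// positive deltas {1} decode to {1}, so the buckets imply 1+1+2+1 = 5 observations.
	implied := bucketTotal(1, []int64{1, 1}, []int64{1})
	fmt.Println(implied) // 5

	// err-mimir-native-histogram-count-mismatch: Count != implied while Sum is a regular float.
	// err-mimir-native-histogram-count-not-big-enough: Count < implied while Sum is NaN,
	// because NaN observations increment Count but fall into no bucket.
}
```

This is the same decoding the removed distributor check in `pkg/distributor/validate.go` below used to perform before the validation moved behind the TSDB appender.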
+ +### err-mimir-native-histogram-count-not-big-enough + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where the bucket counts add up to a higher number than the overall count recorded in the native histogram, provided +that the overall sum is NaN (not a number). + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. +{{< /admonition >}} + +### err-mimir-native-histogram-negative-bucket-count + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where a bucket count is negative. + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. +{{< /admonition >}} + +### err-mimir-native-histogram-span-negative-offset + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where a bucket span has a negative offset. + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. +{{< /admonition >}} + +### err-mimir-native-histogram-spans-buckets-mismatch + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where the number of bucket counts does not match the number of buckets encoded in the bucket spans. {{< admonition type="note" >}} The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. diff --git a/pkg/distributor/validate.go b/pkg/distributor/validate.go index cb1e5757380..1b06e683859 100644 --- a/pkg/distributor/validate.go +++ b/pkg/distributor/validate.go @@ -38,7 +38,6 @@ var ( reasonLabelValueTooLong = globalerror.SeriesLabelValueTooLong.LabelValue() reasonMaxNativeHistogramBuckets = globalerror.MaxNativeHistogramBuckets.LabelValue() reasonInvalidNativeHistogramSchema = globalerror.InvalidSchemaNativeHistogram.LabelValue() - reasonBucketCountMismatch = globalerror.BucketCountMismatch.LabelValue() reasonDuplicateLabelNames = globalerror.SeriesWithDuplicateLabelNames.LabelValue() reasonTooFarInFuture = globalerror.SampleTooFarInFuture.LabelValue() @@ -80,7 +79,6 @@ var ( maxNativeHistogramBucketsMsgFormat = globalerror.MaxNativeHistogramBuckets.Message("received a native histogram sample with too many buckets, timestamp: %d series: %s, buckets: %d, limit: %d") notReducibleNativeHistogramMsgFormat = globalerror.NotReducibleNativeHistogram.Message("received a native histogram sample with too many buckets and cannot reduce, timestamp: %d series: %s, buckets: %d, limit: %d") invalidSchemaNativeHistogramMsgFormat = globalerror.InvalidSchemaNativeHistogram.Message("received a native histogram sample with an invalid schema: %d") - bucketCountMismatchMsgFormat = globalerror.BucketCountMismatch.Message("native histogram bucket count mismatch, timestamp: %d, series: %s, expected %v, got %v") sampleTimestampTooNewMsgFormat = globalerror.SampleTooFarInFuture.MessageWithPerTenantLimitConfig( "received a sample whose timestamp is too far in the future, timestamp: %d series: '%.200s'", validation.CreationGracePeriodFlag, @@ -123,7 +121,6 @@ type sampleValidationMetrics struct { labelValueTooLong *prometheus.CounterVec maxNativeHistogramBuckets
*prometheus.CounterVec invalidNativeHistogramSchema *prometheus.CounterVec - bucketCountMismatch *prometheus.CounterVec duplicateLabelNames *prometheus.CounterVec tooFarInFuture *prometheus.CounterVec } @@ -138,7 +135,6 @@ func (m *sampleValidationMetrics) deleteUserMetrics(userID string) { m.labelValueTooLong.DeletePartialMatch(filter) m.maxNativeHistogramBuckets.DeletePartialMatch(filter) m.invalidNativeHistogramSchema.DeletePartialMatch(filter) - m.bucketCountMismatch.DeletePartialMatch(filter) m.duplicateLabelNames.DeletePartialMatch(filter) m.tooFarInFuture.DeletePartialMatch(filter) } @@ -152,7 +148,6 @@ func (m *sampleValidationMetrics) deleteUserMetricsForGroup(userID, group string m.labelValueTooLong.DeleteLabelValues(userID, group) m.maxNativeHistogramBuckets.DeleteLabelValues(userID, group) m.invalidNativeHistogramSchema.DeleteLabelValues(userID, group) - m.bucketCountMismatch.DeleteLabelValues(userID, group) m.duplicateLabelNames.DeleteLabelValues(userID, group) m.tooFarInFuture.DeleteLabelValues(userID, group) } @@ -167,7 +162,6 @@ func newSampleValidationMetrics(r prometheus.Registerer) *sampleValidationMetric labelValueTooLong: validation.DiscardedSamplesCounter(r, reasonLabelValueTooLong), maxNativeHistogramBuckets: validation.DiscardedSamplesCounter(r, reasonMaxNativeHistogramBuckets), invalidNativeHistogramSchema: validation.DiscardedSamplesCounter(r, reasonInvalidNativeHistogramSchema), - bucketCountMismatch: validation.DiscardedSamplesCounter(r, reasonBucketCountMismatch), duplicateLabelNames: validation.DiscardedSamplesCounter(r, reasonDuplicateLabelNames), tooFarInFuture: validation.DiscardedSamplesCounter(r, reasonTooFarInFuture), } @@ -257,26 +251,6 @@ func validateSampleHistogram(m *sampleValidationMetrics, now model.Time, cfg sam } } - // Check that bucket counts including zero bucket add up to the overall count. - if !s.IsFloatHistogram() { - // Do nothing for float histograms, due to floating point precision issues, we don't check the bucket count. 
- count := s.GetZeroCountInt() - bucketCount := int64(0) - for _, c := range s.GetNegativeDeltas() { - bucketCount += c - count += uint64(bucketCount) - } - bucketCount = int64(0) - for _, c := range s.GetPositiveDeltas() { - bucketCount += c - count += uint64(bucketCount) - } - if count != s.GetCountInt() { - m.bucketCountMismatch.WithLabelValues(userID, group).Inc() - return fmt.Errorf(bucketCountMismatchMsgFormat, s.Timestamp, mimirpb.FromLabelAdaptersToString(ls), s.GetCountInt(), count) - } - } - return nil } diff --git a/pkg/distributor/validate_test.go b/pkg/distributor/validate_test.go index 8886d73bae5..485e0c5f9bf 100644 --- a/pkg/distributor/validate_test.go +++ b/pkg/distributor/validate_test.go @@ -584,100 +584,6 @@ func TestInvalidNativeHistogramSchema(t *testing.T) { `), "cortex_discarded_samples_total")) } -func TestInvalidBucketCountHistogram(t *testing.T) { - testCases := map[string]struct { - h *mimirpb.Histogram - expectedError error - }{ - "a valid zero counts causes no error": { - h: &mimirpb.Histogram{}, - expectedError: nil, - }, - "a valid integer histogram causes no error": { - h: &mimirpb.Histogram{ - Count: &mimirpb.Histogram_CountInt{CountInt: 5}, - Sum: 10, - Schema: 1, - ZeroThreshold: 0.001, - ZeroCount: &mimirpb.Histogram_ZeroCountInt{ZeroCountInt: 1}, - NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}}, - NegativeDeltas: []int64{1, 1}, - PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}}, - PositiveDeltas: []int64{1}, - ResetHint: mimirpb.Histogram_UNKNOWN, - Timestamp: 0, - }, - expectedError: nil, - }, - "a valid float histogram causes no error": { - h: &mimirpb.Histogram{ - Count: &mimirpb.Histogram_CountFloat{CountFloat: 5.5}, - Sum: 10, - Schema: 1, - ZeroThreshold: 0.001, - ZeroCount: &mimirpb.Histogram_ZeroCountFloat{ZeroCountFloat: 1.5}, - NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}}, - NegativeCounts: []float64{1.0, 2.0}, - PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}}, - PositiveCounts: []float64{1.0}, - ResetHint: mimirpb.Histogram_UNKNOWN, - Timestamp: 0, - }, - expectedError: nil, - }, - "an integer histogram with the wrong overall count": { - h: &mimirpb.Histogram{ - Count: &mimirpb.Histogram_CountInt{CountInt: 4}, - Sum: 10, - Schema: 1, - ZeroThreshold: 0.001, - ZeroCount: &mimirpb.Histogram_ZeroCountInt{ZeroCountInt: 1}, - NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}}, - NegativeDeltas: []int64{1, 1}, - PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}}, - PositiveDeltas: []int64{1}, - ResetHint: mimirpb.Histogram_UNKNOWN, - Timestamp: 0, - }, - expectedError: fmt.Errorf("native histogram bucket count mismatch, timestamp: 0, series: a{a=\"a\"}, expected 4, got 5 (err-mimir-native-histogram-bucket-count-mismatch)"), - }, - "a float histogram with the wrong overall count": { - h: &mimirpb.Histogram{ - Count: &mimirpb.Histogram_CountFloat{CountFloat: 4.5}, - Sum: 10, - Schema: 1, - ZeroThreshold: 0.001, - ZeroCount: &mimirpb.Histogram_ZeroCountFloat{ZeroCountFloat: 1.5}, - NegativeSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 2}}, - NegativeCounts: []float64{1.0, 2.0}, - PositiveSpans: []mimirpb.BucketSpan{{Offset: 0, Length: 1}}, - PositiveCounts: []float64{1.0}, - ResetHint: mimirpb.Histogram_UNKNOWN, - Timestamp: 0, - }, - // Due to floating point precision issues, this case is not an error at the moment. 
- expectedError: nil, - }, - } - - registry := prometheus.NewRegistry() - metrics := newSampleValidationMetrics(registry) - cfg := sampleValidationCfg{} - labels := []mimirpb.LabelAdapter{{Name: model.MetricNameLabel, Value: "a"}, {Name: "a", Value: "a"}} - for testName, testCase := range testCases { - t.Run(testName, func(t *testing.T) { - err := validateSampleHistogram(metrics, model.Now(), cfg, "user-1", "group-1", labels, testCase.h) - require.Equal(t, testCase.expectedError, err) - }) - } - - require.NoError(t, testutil.GatherAndCompare(registry, strings.NewReader(` - # HELP cortex_discarded_samples_total The total number of samples that were discarded. - # TYPE cortex_discarded_samples_total counter - cortex_discarded_samples_total{group="group-1",reason="native_histogram_bucket_count_mismatch",user="user-1"} 1 - `), "cortex_discarded_samples_total")) -} - func tooManyLabelsArgs(series []mimirpb.LabelAdapter, limit int) []any { metric := mimirpb.FromLabelAdaptersToMetric(series).String() ellipsis := "" diff --git a/pkg/ingester/errors.go b/pkg/ingester/errors.go index 3e4534da1a3..be209398e4f 100644 --- a/pkg/ingester/errors.go +++ b/pkg/ingester/errors.go @@ -455,6 +455,43 @@ var _ ingesterError = perMetricMetadataLimitReachedError{} // Ensure that perMetricMetadataLimitReachedError is an softError. var _ softError = perMetricMetadataLimitReachedError{} +// nativeHistogramValidationError indicates that a native histogram sample failed TSDB validation, for example because its bucket counts did not add up to the overall count. +type nativeHistogramValidationError struct { + id globalerror.ID + originalErr error + seriesLabels []mimirpb.LabelAdapter + timestamp model.Time +} + +func newNativeHistogramValidationError(id globalerror.ID, originalErr error, timestamp model.Time, seriesLabels []mimirpb.LabelAdapter) nativeHistogramValidationError { + return nativeHistogramValidationError{ + id: id, + originalErr: originalErr, + seriesLabels: seriesLabels, + timestamp: timestamp, + } +} + +func (e nativeHistogramValidationError) Error() string { + return e.id.Message(fmt.Sprintf("err: %v. timestamp=%s, series=%s", + e.originalErr, + e.timestamp.Time().UTC().Format(time.RFC3339Nano), + e.seriesLabels, + )) +} + +func (e nativeHistogramValidationError) errorCause() mimirpb.ErrorCause { + return mimirpb.BAD_DATA +} + +func (e nativeHistogramValidationError) soft() {} + +// Ensure that nativeHistogramValidationError is an ingesterError. +var _ ingesterError = nativeHistogramValidationError{} + +// Ensure that nativeHistogramValidationError is a softError. +var _ softError = nativeHistogramValidationError{} + +// unavailableError is an ingesterError indicating that the ingester is unavailable.
type unavailableError struct { state services.State @@ -550,6 +587,7 @@ type ingesterErrSamplers struct { maxMetadataPerMetricLimitExceeded *log.Sampler maxSeriesPerUserLimitExceeded *log.Sampler maxMetadataPerUserLimitExceeded *log.Sampler + nativeHistogramValidationError *log.Sampler } func newIngesterErrSamplers(freq int64) ingesterErrSamplers { @@ -563,6 +601,7 @@ func newIngesterErrSamplers(freq int64) ingesterErrSamplers { log.NewSampler(freq), log.NewSampler(freq), log.NewSampler(freq), + log.NewSampler(freq), } } diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 9067bbc01d1..cd5938cfc6f 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -98,13 +98,14 @@ const ( instanceIngestionRateTickInterval = time.Second // Reasons for discarding samples - reasonSampleOutOfOrder = "sample-out-of-order" - reasonSampleTooOld = "sample-too-old" - reasonSampleTooFarInFuture = "sample-too-far-in-future" - reasonNewValueForTimestamp = "new-value-for-timestamp" - reasonSampleOutOfBounds = "sample-out-of-bounds" - reasonPerUserSeriesLimit = "per_user_series_limit" - reasonPerMetricSeriesLimit = "per_metric_series_limit" + reasonSampleOutOfOrder = "sample-out-of-order" + reasonSampleTooOld = "sample-too-old" + reasonSampleTooFarInFuture = "sample-too-far-in-future" + reasonNewValueForTimestamp = "new-value-for-timestamp" + reasonSampleOutOfBounds = "sample-out-of-bounds" + reasonPerUserSeriesLimit = "per_user_series_limit" + reasonPerMetricSeriesLimit = "per_metric_series_limit" + reasonInvalidNativeHistogram = "invalid-native-histogram" replicationFactorStatsName = "ingester_replication_factor" ringStoreStatsName = "ingester_ring_store" @@ -945,17 +946,18 @@ type extendedAppender interface { } type pushStats struct { - succeededSamplesCount int - failedSamplesCount int - succeededExemplarsCount int - failedExemplarsCount int - sampleOutOfBoundsCount int - sampleOutOfOrderCount int - sampleTooOldCount int - sampleTooFarInFutureCount int - newValueForTimestampCount int - perUserSeriesLimitCount int - perMetricSeriesLimitCount int + succeededSamplesCount int + failedSamplesCount int + succeededExemplarsCount int + failedExemplarsCount int + sampleOutOfBoundsCount int + sampleOutOfOrderCount int + sampleTooOldCount int + sampleTooFarInFutureCount int + newValueForTimestampCount int + perUserSeriesLimitCount int + perMetricSeriesLimitCount int + invalidNativeHistogramCount int } type ctxKey int @@ -1215,6 +1217,9 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats if stats.perMetricSeriesLimitCount > 0 { discarded.perMetricSeriesLimit.WithLabelValues(userID, group).Add(float64(stats.perMetricSeriesLimitCount)) } + if stats.invalidNativeHistogramCount > 0 { + discarded.invalidNativeHistogram.WithLabelValues(userID, group).Add(float64(stats.invalidNativeHistogramCount)) + } if stats.succeededSamplesCount > 0 { i.ingestionRate.Add(int64(stats.succeededSamplesCount)) @@ -1290,6 +1295,38 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) }) return true + + // Map TSDB native histogram validation errors to soft errors. 
+ case errors.Is(err, histogram.ErrHistogramCountMismatch): + stats.invalidNativeHistogramCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) + }) + return true + case errors.Is(err, histogram.ErrHistogramCountNotBigEnough): + stats.invalidNativeHistogramCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) + }) + return true + case errors.Is(err, histogram.ErrHistogramNegativeBucketCount): + stats.invalidNativeHistogramCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) + }) + return true + case errors.Is(err, histogram.ErrHistogramSpanNegativeOffset): + stats.invalidNativeHistogramCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) + }) + return true + case errors.Is(err, histogram.ErrHistogramSpansBucketsMismatch): + stats.invalidNativeHistogramCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) + }) + return true } return false } diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index eb8a8fc4425..44fd6e78dac 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -68,6 +68,7 @@ import ( "github.com/grafana/mimir/pkg/storage/tsdb/block" "github.com/grafana/mimir/pkg/usagestats" "github.com/grafana/mimir/pkg/util" + "github.com/grafana/mimir/pkg/util/globalerror" util_math "github.com/grafana/mimir/pkg/util/math" util_test "github.com/grafana/mimir/pkg/util/test" "github.com/grafana/mimir/pkg/util/validation" @@ -99,6 +100,22 @@ func TestIngester_Push(t *testing.T) { userID := "test" now := time.Now() + histogramWithBucketCountMismatch := util_test.GenerateTestHistogram(1) + histogramWithBucketCountMismatch.Count++ + + histogramWithCountNotBigEnough := util_test.GenerateTestHistogram(1) + histogramWithCountNotBigEnough.Sum = math.NaN() + histogramWithCountNotBigEnough.Count-- + + histogramWithNegativeBucketCount := util_test.GenerateTestHistogram(1) + histogramWithNegativeBucketCount.NegativeBuckets[1] = -100 + + histogramWithSpanNegativeOffset := util_test.GenerateTestHistogram(1) + histogramWithSpanNegativeOffset.PositiveSpans[1].Offset = -2 // The first span can start at negative offset, hence the 1. 
+ + histogramWithSpansBucketsMismatch := util_test.GenerateTestHistogram(1) + histogramWithSpansBucketsMismatch.PositiveSpans[1].Length++ + tests := map[string]struct { reqs []*mimirpb.WriteRequest expectedErr error @@ -1155,6 +1172,211 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 `, }, + "should soft fail if histogram has a bucket count vs overall count mismatch": { + nativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, histogramWithBucketCountMismatch)}, + nil, + ), + }, + // Expect the error string instead of constructing the error to catch if Prometheus changes the error message. + expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, fmt.Errorf("21 observations found in buckets, but the Count field is 22: histogram's observation count should equal the number of observations found in the buckets (in absence of NaN)"), model.Time(10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.FailedPrecondition), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. + # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 0 + # HELP cortex_discarded_samples_total The total number of samples that were discarded. + # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="invalid-native-histogram",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. + # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 0 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 0 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. + # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. 
+ # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + `, + }, + "should soft fail if histogram has a bucket count higher than overall count and sum NaN": { + nativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, histogramWithCountNotBigEnough)}, + nil, + ), + }, + // Expect the error string instead of constructing the error to catch if Prometheus changes the error message. + expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, fmt.Errorf("21 observations found in buckets, but the Count field is 20: histogram's observation count should be at least the number of observations found in the buckets"), model.Time(10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.FailedPrecondition), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. + # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 0 + # HELP cortex_discarded_samples_total The total number of samples that were discarded. + # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="invalid-native-histogram",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. + # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 0 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 0 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. + # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + `, + }, + "should soft fail if histogram has a negative span offset": { + nativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, histogramWithSpanNegativeOffset)}, + nil, + ), + }, + // Expect the error string instead of constructing the error to catch if Prometheus changes the error message. 
+ expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, fmt.Errorf("positive side: span number 2 with offset -2: histogram has a span whose offset is negative"), model.Time(10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.FailedPrecondition), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. + # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 0 + # HELP cortex_discarded_samples_total The total number of samples that were discarded. + # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="invalid-native-histogram",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. + # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 0 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 0 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. + # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + `, + }, + "should soft fail if histogram has different number of buckets than encoded in spans": { + nativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, histogramWithSpansBucketsMismatch)}, + nil, + ), + }, + // Expect the error string instead of constructing the error to catch if Prometheus changes the error message. + expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, fmt.Errorf("positive side: spans need 5 buckets, have 4 buckets: histogram spans specify different number of buckets than provided"), model.Time(10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.FailedPrecondition), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. + # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 0 + # HELP cortex_discarded_samples_total The total number of samples that were discarded.
+ # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="invalid-native-histogram",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. + # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 0 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 0 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. + # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + `, + }, + "should soft fail if histogram has a negative bucket count": { + nativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, histogramWithNegativeBucketCount)}, + nil, + ), + }, + // Expect the error string instead of constructing the error to catch if Prometheus changes the error message. + expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, fmt.Errorf("negative side: bucket number 2 has observation count of -98: histogram has a bucket whose observation count is negative"), model.Time(10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.FailedPrecondition), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. + # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 0 + # HELP cortex_discarded_samples_total The total number of samples that were discarded. + # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="invalid-native-histogram",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. 
+ # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 0 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 0 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. + # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + `, + }, "should soft fail on existing histogram series if all exemplars are out of order": { maxExemplars: 2, nativeHistograms: true, diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 2c422c24fab..833a802fb04 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -420,24 +420,26 @@ func (m *ingesterMetrics) deletePerUserCustomTrackerMetrics(userID string, custo } type discardedMetrics struct { - sampleOutOfBounds *prometheus.CounterVec - sampleOutOfOrder *prometheus.CounterVec - sampleTooOld *prometheus.CounterVec - sampleTooFarInFuture *prometheus.CounterVec - newValueForTimestamp *prometheus.CounterVec - perUserSeriesLimit *prometheus.CounterVec - perMetricSeriesLimit *prometheus.CounterVec + sampleOutOfBounds *prometheus.CounterVec + sampleOutOfOrder *prometheus.CounterVec + sampleTooOld *prometheus.CounterVec + sampleTooFarInFuture *prometheus.CounterVec + newValueForTimestamp *prometheus.CounterVec + perUserSeriesLimit *prometheus.CounterVec + perMetricSeriesLimit *prometheus.CounterVec + invalidNativeHistogram *prometheus.CounterVec } func newDiscardedMetrics(r prometheus.Registerer) *discardedMetrics { return &discardedMetrics{ - sampleOutOfBounds: validation.DiscardedSamplesCounter(r, reasonSampleOutOfBounds), - sampleOutOfOrder: validation.DiscardedSamplesCounter(r, reasonSampleOutOfOrder), - sampleTooOld: validation.DiscardedSamplesCounter(r, reasonSampleTooOld), - sampleTooFarInFuture: validation.DiscardedSamplesCounter(r, reasonSampleTooFarInFuture), - newValueForTimestamp: validation.DiscardedSamplesCounter(r, reasonNewValueForTimestamp), - perUserSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerUserSeriesLimit), - perMetricSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerMetricSeriesLimit), + sampleOutOfBounds: validation.DiscardedSamplesCounter(r, reasonSampleOutOfBounds), + sampleOutOfOrder: validation.DiscardedSamplesCounter(r, reasonSampleOutOfOrder), + sampleTooOld: validation.DiscardedSamplesCounter(r, reasonSampleTooOld), + sampleTooFarInFuture: validation.DiscardedSamplesCounter(r, reasonSampleTooFarInFuture), + newValueForTimestamp: validation.DiscardedSamplesCounter(r, reasonNewValueForTimestamp), + perUserSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerUserSeriesLimit), + perMetricSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerMetricSeriesLimit), + invalidNativeHistogram: validation.DiscardedSamplesCounter(r, reasonInvalidNativeHistogram), } } @@ -449,6 +451,7 @@ func (m *discardedMetrics) 
DeletePartialMatch(filter prometheus.Labels) { m.newValueForTimestamp.DeletePartialMatch(filter) m.perUserSeriesLimit.DeletePartialMatch(filter) m.perMetricSeriesLimit.DeletePartialMatch(filter) + m.invalidNativeHistogram.DeletePartialMatch(filter) } func (m *discardedMetrics) DeleteLabelValues(userID string, group string) { @@ -459,6 +462,7 @@ func (m *discardedMetrics) DeleteLabelValues(userID string, group string) { m.newValueForTimestamp.DeleteLabelValues(userID, group) m.perUserSeriesLimit.DeleteLabelValues(userID, group) m.perMetricSeriesLimit.DeleteLabelValues(userID, group) + m.invalidNativeHistogram.DeleteLabelValues(userID, group) } // TSDB metrics collector. Each tenant has its own registry, that TSDB code uses. diff --git a/pkg/util/globalerror/errors.go b/pkg/util/globalerror/errors.go index 7c7e4c64a28..1a35681272f 100644 --- a/pkg/util/globalerror/errors.go +++ b/pkg/util/globalerror/errors.go @@ -20,7 +20,6 @@ const ( MaxNativeHistogramBuckets ID = "max-native-histogram-buckets" NotReducibleNativeHistogram ID = "not-reducible-native-histogram" InvalidSchemaNativeHistogram ID = "invalid-native-histogram-schema" - BucketCountMismatch ID = "native-histogram-bucket-count-mismatch" SeriesInvalidLabel ID = "label-invalid" SeriesLabelNameTooLong ID = "label-name-too-long" SeriesLabelValueTooLong ID = "label-value-too-long" @@ -73,6 +72,14 @@ const ( BucketIndexTooOld ID = "bucket-index-too-old" DistributorMaxWriteMessageSize ID = "distributor-max-write-message-size" + + // Map Prometheus TSDB native histogram validation errors to Mimir errors. + // E.g. histogram.ErrHistogramCountNotBigEnough -> NativeHistogramCountNotBigEnough + NativeHistogramCountMismatch ID = "native-histogram-count-mismatch" + NativeHistogramCountNotBigEnough ID = "native-histogram-count-not-big-enough" + NativeHistogramNegativeBucketCount ID = "native-histogram-negative-bucket-count" + NativeHistogramSpanNegativeOffset ID = "native-histogram-span-negative-offset" + NativeHistogramSpansBucketsMismatch ID = "native-histogram-spans-buckets-mismatch" ) // Message returns the provided msg, appending the error id.
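For reference, a minimal sketch of how one of the new IDs ends up in the message returned to the client, using a simplified stand-in for `globalerror.ID` and assuming the `err-mimir-` prefix that the runbook anchors above use:

```go
package main

import "fmt"

// ID is a simplified stand-in for globalerror.ID, used only for illustration.
type ID string

// errPrefix is assumed from the err-mimir-* runbook anchors.
const errPrefix = "err-mimir-"

// Message appends the error ID to the provided message, mirroring the
// "returns the provided msg, appending the error id" behavior described above.
func (id ID) Message(msg string) string {
	return fmt.Sprintf("%s (%s%s)", msg, errPrefix, id)
}

func main() {
	const NativeHistogramCountMismatch ID = "native-histogram-count-mismatch"
	// The ingester's nativeHistogramValidationError.Error() wraps the original
	// TSDB error text (abridged here) before appending the ID.
	fmt.Println(NativeHistogramCountMismatch.Message("err: histogram's observation count should equal the number of observations found in the buckets (in absence of NaN)"))
	// The printed message ends with: (err-mimir-native-histogram-count-mismatch)
}
```

This is also why the new test cases assert on the full Prometheus error strings: a change in the upstream wording would show up directly in the message clients receive.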