From 4ca0d07dab8e8e35dd0e47805894a7d823ff585a Mon Sep 17 00:00:00 2001 From: George Krajcsovits Date: Wed, 3 Apr 2024 14:47:39 +0200 Subject: [PATCH] Handle TSDB native histogram validation errors as soft errors (#7773) (#7787) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Revert "Distributor: add bucket count validation to native histograms (#7736)" This reverts commit fb7dbaaf6eff31e1535809b17fb6573b21ad9aad. * Handle TSDB native histogram validation errors are soft errors * add all testcases Signed-off-by: György Krajcsovits * Count towards discarded samples metrics with new reason Signed-off-by: György Krajcsovits --------- Signed-off-by: György Krajcsovits (cherry picked from commit 51c4088cc3b5087aabf72478dcc909d011ff64c0) --- CHANGELOG.md | 1 + .../mimir/manage/mimir-runbooks/_index.md | 47 ++++ pkg/ingester/errors.go | 39 +++ pkg/ingester/ingester.go | 73 ++++-- pkg/ingester/ingester_test.go | 222 ++++++++++++++++++ pkg/ingester/metrics.go | 32 +-- pkg/util/globalerror/errors.go | 8 + 7 files changed, 390 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e195d31b5d..77c5cb1b53d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ * [BUGFIX] querier: Don't cache context.Canceled errors for bucket index. #7620 * [BUGFIX] Store-gateway: account for `"other"` time in LabelValues and LabelNames requests. #7622 * [BUGFIX] Query-frontend: Fix memory leak on every request. #7654 +* [BUGFIX] Ingester: turn native histogram validation errors in TSDB into soft ingester errors that result in returning 4xx to the end-user instead of 5xx. In the case of TSDB validation errors, the counter `cortex_discarded_samples_total` will be increased with the `reason` label set to `"invalid-native-histogram"`. 
#7736 #7773 ### Mixin diff --git a/docs/sources/mimir/manage/mimir-runbooks/_index.md b/docs/sources/mimir/manage/mimir-runbooks/_index.md index e016c42b549..87b6611a480 100644 --- a/docs/sources/mimir/manage/mimir-runbooks/_index.md +++ b/docs/sources/mimir/manage/mimir-runbooks/_index.md @@ -1365,6 +1365,53 @@ This non-critical error occurs when Mimir receives a write request that contains The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. {{< /admonition >}} +### err-mimir-native-histogram-count-mismatch + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where the bucket counts don't add up to the overall count recorded in the native histogram, provided that the overall +sum is a regular float number. + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. +{{< /admonition >}} + +### err-mimir-native-histogram-count-not-big-enough + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where the bucket counts add up to a higher number than the overall count recorded in the native histogram, provided +that the overall sum is not a regular float number (NaN). + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. +{{< /admonition >}} + +### err-mimir-native-histogram-negative-bucket-count + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where some bucket count is negative. + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. 
+{{< /admonition >}} + +### err-mimir-native-histogram-span-negative-offset + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where a bucket span has a negative offset. + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. +{{< /admonition >}} + +### err-mimir-native-histogram-spans-buckets-mismatch + +This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram +where the number of bucket counts does not agree with the number of buckets encoded in the bucket spans. + +{{< admonition type="note" >}} +The series containing such samples are skipped during ingestion, and valid series within the same request are ingested. +{{< /admonition >}} + ### err-mimir-label-invalid This non-critical error occurs when Mimir receives a write request that contains a series with an invalid label name. diff --git a/pkg/ingester/errors.go b/pkg/ingester/errors.go index a9deccf785d..13fb5f8eb78 100644 --- a/pkg/ingester/errors.go +++ b/pkg/ingester/errors.go @@ -455,6 +455,43 @@ var _ ingesterError = perMetricMetadataLimitReachedError{} // Ensure that perMetricMetadataLimitReachedError is an softError. var _ softError = perMetricMetadataLimitReachedError{} +// nativeHistogramValidationError indicates that a native histogram sample failed TSDB validation; id identifies which validation check failed. 
+type nativeHistogramValidationError struct { + id globalerror.ID + originalErr error + seriesLabels []mimirpb.LabelAdapter + timestamp model.Time +} + +func newNativeHistogramValidationError(id globalerror.ID, originalErr error, timestamp model.Time, seriesLabels []mimirpb.LabelAdapter) nativeHistogramValidationError { + return nativeHistogramValidationError{ + id: id, + originalErr: originalErr, + seriesLabels: seriesLabels, + timestamp: timestamp, + } +} + +func (e nativeHistogramValidationError) Error() string { + return e.id.Message(fmt.Sprintf("err: %v. timestamp=%s, series=%s", + e.originalErr, + e.timestamp.Time().UTC().Format(time.RFC3339Nano), + e.seriesLabels, + )) +} + +func (e nativeHistogramValidationError) errorCause() mimirpb.ErrorCause { + return mimirpb.BAD_DATA +} + +func (e nativeHistogramValidationError) soft() {} + +// Ensure that nativeHistogramValidationError is an ingesterError. +var _ ingesterError = nativeHistogramValidationError{} + +// Ensure that nativeHistogramValidationError is a softError. +var _ softError = nativeHistogramValidationError{} + // unavailableError is an ingesterError indicating that the ingester is unavailable. 
type unavailableError struct { state services.State @@ -550,6 +587,7 @@ type ingesterErrSamplers struct { maxMetadataPerMetricLimitExceeded *log.Sampler maxSeriesPerUserLimitExceeded *log.Sampler maxMetadataPerUserLimitExceeded *log.Sampler + nativeHistogramValidationError *log.Sampler } func newIngesterErrSamplers(freq int64) ingesterErrSamplers { @@ -563,6 +601,7 @@ func newIngesterErrSamplers(freq int64) ingesterErrSamplers { log.NewSampler(freq), log.NewSampler(freq), log.NewSampler(freq), + log.NewSampler(freq), } } diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index 51fea5b2b4b..7cab1c5f97a 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -97,13 +97,14 @@ const ( instanceIngestionRateTickInterval = time.Second // Reasons for discarding samples - reasonSampleOutOfOrder = "sample-out-of-order" - reasonSampleTooOld = "sample-too-old" - reasonSampleTooFarInFuture = "sample-too-far-in-future" - reasonNewValueForTimestamp = "new-value-for-timestamp" - reasonSampleOutOfBounds = "sample-out-of-bounds" - reasonPerUserSeriesLimit = "per_user_series_limit" - reasonPerMetricSeriesLimit = "per_metric_series_limit" + reasonSampleOutOfOrder = "sample-out-of-order" + reasonSampleTooOld = "sample-too-old" + reasonSampleTooFarInFuture = "sample-too-far-in-future" + reasonNewValueForTimestamp = "new-value-for-timestamp" + reasonSampleOutOfBounds = "sample-out-of-bounds" + reasonPerUserSeriesLimit = "per_user_series_limit" + reasonPerMetricSeriesLimit = "per_metric_series_limit" + reasonInvalidNativeHistogram = "invalid-native-histogram" replicationFactorStatsName = "ingester_replication_factor" ringStoreStatsName = "ingester_ring_store" @@ -878,17 +879,18 @@ type extendedAppender interface { } type pushStats struct { - succeededSamplesCount int - failedSamplesCount int - succeededExemplarsCount int - failedExemplarsCount int - sampleOutOfBoundsCount int - sampleOutOfOrderCount int - sampleTooOldCount int - sampleTooFarInFutureCount int 
- newValueForTimestampCount int - perUserSeriesLimitCount int - perMetricSeriesLimitCount int + succeededSamplesCount int + failedSamplesCount int + succeededExemplarsCount int + failedExemplarsCount int + sampleOutOfBoundsCount int + sampleOutOfOrderCount int + sampleTooOldCount int + sampleTooFarInFutureCount int + newValueForTimestampCount int + perUserSeriesLimitCount int + perMetricSeriesLimitCount int + invalidNativeHistogramCount int } type ctxKey int @@ -1148,6 +1150,9 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats if stats.perMetricSeriesLimitCount > 0 { discarded.perMetricSeriesLimit.WithLabelValues(userID, group).Add(float64(stats.perMetricSeriesLimitCount)) } + if stats.invalidNativeHistogramCount > 0 { + discarded.invalidNativeHistogram.WithLabelValues(userID, group).Add(float64(stats.invalidNativeHistogramCount)) + } if stats.succeededSamplesCount > 0 { i.ingestionRate.Add(int64(stats.succeededSamplesCount)) @@ -1223,6 +1228,38 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels) }) return true + + // Map TSDB native histogram validation errors to soft errors. 
+ case errors.Is(err, histogram.ErrHistogramCountMismatch): + stats.invalidNativeHistogramCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels) + }) + return true + case errors.Is(err, histogram.ErrHistogramCountNotBigEnough): + stats.invalidNativeHistogramCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels) + }) + return true + case errors.Is(err, histogram.ErrHistogramNegativeBucketCount): + stats.invalidNativeHistogramCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels) + }) + return true + case errors.Is(err, histogram.ErrHistogramSpanNegativeOffset): + stats.invalidNativeHistogramCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels) + }) + return true + case errors.Is(err, histogram.ErrHistogramSpansBucketsMismatch): + stats.invalidNativeHistogramCount++ + updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError { + return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels) + }) + return true } return false } diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index ef022e1da0c..b66576e938c 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -67,6 +67,7 @@ import ( "github.com/grafana/mimir/pkg/storage/tsdb/block" "github.com/grafana/mimir/pkg/usagestats" 
"github.com/grafana/mimir/pkg/util" + "github.com/grafana/mimir/pkg/util/globalerror" util_math "github.com/grafana/mimir/pkg/util/math" util_test "github.com/grafana/mimir/pkg/util/test" "github.com/grafana/mimir/pkg/util/validation" @@ -98,6 +99,22 @@ func TestIngester_Push(t *testing.T) { userID := "test" now := time.Now() + histogramWithBucketCountMismatch := util_test.GenerateTestHistogram(1) + histogramWithBucketCountMismatch.Count++ + + histogramWithCountNotBigEnough := util_test.GenerateTestHistogram(1) + histogramWithCountNotBigEnough.Sum = math.NaN() + histogramWithCountNotBigEnough.Count-- + + histogramWithNegativeBucketCount := util_test.GenerateTestHistogram(1) + histogramWithNegativeBucketCount.NegativeBuckets[1] = -100 + + histogramWithSpanNegativeOffset := util_test.GenerateTestHistogram(1) + histogramWithSpanNegativeOffset.PositiveSpans[1].Offset = -2 // The first span can start at negative offset, hence the 1. + + histogramWithSpansBucketsMismatch := util_test.GenerateTestHistogram(1) + histogramWithSpansBucketsMismatch.PositiveSpans[1].Length++ + tests := map[string]struct { reqs []*mimirpb.WriteRequest expectedErr error @@ -876,6 +893,211 @@ func TestIngester_Push(t *testing.T) { cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 `, }, + "should soft fail if histogram has a bucket count vs overall count mismatch": { + nativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, histogramWithBucketCountMismatch)}, + nil, + ), + }, + // Expect the error string instead of constructing the error to catch if Prometheus changes the error message. 
+ expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, fmt.Errorf("21 observations found in buckets, but the Count field is 22: histogram's observation count should equal the number of observations found in the buckets (in absence of NaN)"), model.Time(10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.FailedPrecondition), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. + # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 0 + # HELP cortex_discarded_samples_total The total number of samples that were discarded. + # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="invalid-native-histogram",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. + # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 0 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 0 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. 
+ # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + `, + }, + "should soft fail if histogram has a bucket count higher than overall count and sum NaN": { + nativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, histogramWithCountNotBigEnough)}, + nil, + ), + }, + // Expect the error string instead of constructing the error to catch if Prometheus changes the error message. + expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, fmt.Errorf("21 observations found in buckets, but the Count field is 20: histogram's observation count should be at least the number of observations found in the buckets"), model.Time(10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.FailedPrecondition), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. + # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 0 + # HELP cortex_discarded_samples_total The total number of samples that were discarded. 
+ # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="invalid-native-histogram",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. + # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 0 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 0 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. + # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. 
+ # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + `, + }, + "should soft fail if histogram has a negative span offset": { + nativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, histogramWithSpanNegativeOffset)}, + nil, + ), + }, + // Expect the error string instead of constructing the error to catch if Prometheus changes the error message. + expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, fmt.Errorf("positive side: span number 2 with offset -2: histogram has a span whose offset is negative"), model.Time(10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.FailedPrecondition), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. + # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 0 + # HELP cortex_discarded_samples_total The total number of samples that were discarded. + # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="invalid-native-histogram",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. 
+ # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 0 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 0 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. + # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + `, + }, + "should soft fail if histogram has different number of buckets then encoded in spans": { + nativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, histogramWithSpansBucketsMismatch)}, + nil, + ), + }, + // Expect the error string instead of constructing the error to catch if Prometheus changes the error message. + expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, fmt.Errorf("positive side: spans need 5 buckets, have 4 buckets: histogram spans specify different number of buckets than provided"), model.Time(10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.FailedPrecondition), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. 
+ # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 0 + # HELP cortex_discarded_samples_total The total number of samples that were discarded. + # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="invalid-native-histogram",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. + # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 0 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 0 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. + # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. 
+ # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + `, + }, + "should soft fail if histogram has a negative bucket count": { + nativeHistograms: true, + reqs: []*mimirpb.WriteRequest{ + mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries( + [][]mimirpb.LabelAdapter{metricLabelAdapters}, + []mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, histogramWithNegativeBucketCount)}, + nil, + ), + }, + // Expect the error string instead of constructing the error to catch if Prometheus changes the error message. + expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, fmt.Errorf("negative side: bucket number 2 has observation count of -98: histogram has a bucket whose observation count is negative"), model.Time(10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.FailedPrecondition), + expectedMetrics: ` + # HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user. + # TYPE cortex_ingester_ingested_samples_total counter + cortex_ingester_ingested_samples_total{user="test"} 0 + # HELP cortex_discarded_samples_total The total number of samples that were discarded. + # TYPE cortex_discarded_samples_total counter + cortex_discarded_samples_total{group="",reason="invalid-native-histogram",user="test"} 1 + # HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user. + # TYPE cortex_ingester_ingested_samples_failures_total counter + cortex_ingester_ingested_samples_failures_total{user="test"} 1 + # HELP cortex_ingester_memory_users The current number of users in memory. + # TYPE cortex_ingester_memory_users gauge + cortex_ingester_memory_users 1 + # HELP cortex_ingester_memory_series The current number of series in memory. 
+ # TYPE cortex_ingester_memory_series gauge + cortex_ingester_memory_series 0 + # HELP cortex_ingester_memory_series_created_total The total number of series that were created per user. + # TYPE cortex_ingester_memory_series_created_total counter + cortex_ingester_memory_series_created_total{user="test"} 0 + # HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user. + # TYPE cortex_ingester_memory_series_removed_total counter + cortex_ingester_memory_series_removed_total{user="test"} 0 + # HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge + cortex_ingester_tsdb_head_min_timestamp_seconds 0.01 + # HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants. + # TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge + cortex_ingester_tsdb_head_max_timestamp_seconds 0.01 + `, + }, "successful push, active series disabled": { disableActiveSeries: true, reqs: []*mimirpb.WriteRequest{ diff --git a/pkg/ingester/metrics.go b/pkg/ingester/metrics.go index 2c422c24fab..833a802fb04 100644 --- a/pkg/ingester/metrics.go +++ b/pkg/ingester/metrics.go @@ -420,24 +420,26 @@ func (m *ingesterMetrics) deletePerUserCustomTrackerMetrics(userID string, custo } type discardedMetrics struct { - sampleOutOfBounds *prometheus.CounterVec - sampleOutOfOrder *prometheus.CounterVec - sampleTooOld *prometheus.CounterVec - sampleTooFarInFuture *prometheus.CounterVec - newValueForTimestamp *prometheus.CounterVec - perUserSeriesLimit *prometheus.CounterVec - perMetricSeriesLimit *prometheus.CounterVec + sampleOutOfBounds *prometheus.CounterVec + sampleOutOfOrder *prometheus.CounterVec + sampleTooOld *prometheus.CounterVec + sampleTooFarInFuture *prometheus.CounterVec + newValueForTimestamp *prometheus.CounterVec + perUserSeriesLimit *prometheus.CounterVec + perMetricSeriesLimit 
*prometheus.CounterVec + invalidNativeHistogram *prometheus.CounterVec } func newDiscardedMetrics(r prometheus.Registerer) *discardedMetrics { return &discardedMetrics{ - sampleOutOfBounds: validation.DiscardedSamplesCounter(r, reasonSampleOutOfBounds), - sampleOutOfOrder: validation.DiscardedSamplesCounter(r, reasonSampleOutOfOrder), - sampleTooOld: validation.DiscardedSamplesCounter(r, reasonSampleTooOld), - sampleTooFarInFuture: validation.DiscardedSamplesCounter(r, reasonSampleTooFarInFuture), - newValueForTimestamp: validation.DiscardedSamplesCounter(r, reasonNewValueForTimestamp), - perUserSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerUserSeriesLimit), - perMetricSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerMetricSeriesLimit), + sampleOutOfBounds: validation.DiscardedSamplesCounter(r, reasonSampleOutOfBounds), + sampleOutOfOrder: validation.DiscardedSamplesCounter(r, reasonSampleOutOfOrder), + sampleTooOld: validation.DiscardedSamplesCounter(r, reasonSampleTooOld), + sampleTooFarInFuture: validation.DiscardedSamplesCounter(r, reasonSampleTooFarInFuture), + newValueForTimestamp: validation.DiscardedSamplesCounter(r, reasonNewValueForTimestamp), + perUserSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerUserSeriesLimit), + perMetricSeriesLimit: validation.DiscardedSamplesCounter(r, reasonPerMetricSeriesLimit), + invalidNativeHistogram: validation.DiscardedSamplesCounter(r, reasonInvalidNativeHistogram), } } @@ -449,6 +451,7 @@ func (m *discardedMetrics) DeletePartialMatch(filter prometheus.Labels) { m.newValueForTimestamp.DeletePartialMatch(filter) m.perUserSeriesLimit.DeletePartialMatch(filter) m.perMetricSeriesLimit.DeletePartialMatch(filter) + m.invalidNativeHistogram.DeletePartialMatch(filter) } func (m *discardedMetrics) DeleteLabelValues(userID string, group string) { @@ -459,6 +462,7 @@ func (m *discardedMetrics) DeleteLabelValues(userID string, group string) { m.newValueForTimestamp.DeleteLabelValues(userID, group) 
m.perUserSeriesLimit.DeleteLabelValues(userID, group) m.perMetricSeriesLimit.DeleteLabelValues(userID, group) + m.invalidNativeHistogram.DeleteLabelValues(userID, group) } // TSDB metrics collector. Each tenant has its own registry, that TSDB code uses. diff --git a/pkg/util/globalerror/errors.go b/pkg/util/globalerror/errors.go index 2b26569324c..1a35681272f 100644 --- a/pkg/util/globalerror/errors.go +++ b/pkg/util/globalerror/errors.go @@ -72,6 +72,14 @@ const ( BucketIndexTooOld ID = "bucket-index-too-old" DistributorMaxWriteMessageSize ID = "distributor-max-write-message-size" + + // Map Prometheus TSDB native histogram validation errors to Mimir errors. + // E.g. histogram.ErrHistogramCountNotBigEnough -> NativeHistogramCountNotBigEnough + NativeHistogramCountMismatch ID = "native-histogram-count-mismatch" + NativeHistogramCountNotBigEnough ID = "native-histogram-count-not-big-enough" + NativeHistogramNegativeBucketCount ID = "native-histogram-negative-bucket-count" + NativeHistogramSpanNegativeOffset ID = "native-histogram-span-negative-offset" + NativeHistogramSpansBucketsMismatch ID = "native-histogram-spans-buckets-mismatch" ) // Message returns the provided msg, appending the error id.