Skip to content

Commit

Permalink
Handle TSDB native histogram validation errors as soft errors (#7773) (
Browse files Browse the repository at this point in the history
…#7787)

* Revert "Distributor: add bucket count validation to native histograms (#7736)"

This reverts commit fb7dbaa.

* Handle TSDB native histogram validation errors as soft errors

* add all testcases

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>

* Count towards discarded samples metrics with new reason

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>

---------

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>
(cherry picked from commit 51c4088)
  • Loading branch information
krajorama authored Apr 3, 2024
1 parent 2a22ed4 commit 4ca0d07
Show file tree
Hide file tree
Showing 7 changed files with 390 additions and 32 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
* [BUGFIX] querier: Don't cache context.Canceled errors for bucket index. #7620
* [BUGFIX] Store-gateway: account for `"other"` time in LabelValues and LabelNames requests. #7622
* [BUGFIX] Query-frontend: Fix memory leak on every request. #7654
* [BUGFIX] Ingester: turn native histogram validation errors in TSDB into soft ingester errors that result in returning 4xx to the end-user instead of 5xx. In the case of TSDB validation errors, the counter `cortex_discarded_samples_total` will be increased with the `reason` label set to `"invalid-native-histogram"`. #7736 #7773

### Mixin

Expand Down
47 changes: 47 additions & 0 deletions docs/sources/mimir/manage/mimir-runbooks/_index.md
Original file line number Diff line number Diff line change
Expand Up @@ -1365,6 +1365,53 @@ This non-critical error occurs when Mimir receives a write request that contains
The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
{{< /admonition >}}

### err-mimir-native-histogram-count-mismatch

This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram
where the bucket counts don't add up to the overall count recorded in the native histogram, provided that the overall
sum is a regular float number.

{{< admonition type="note" >}}
The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
{{< /admonition >}}

### err-mimir-native-histogram-count-not-big-enough

This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram
where the bucket counts add up to a higher number than the overall count recorded in the native histogram, provided
that the overall sum is not a regular float number (that is, it is NaN).

{{< admonition type="note" >}}
The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
{{< /admonition >}}

### err-mimir-native-histogram-negative-bucket-count

This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram
where some bucket count is negative.

{{< admonition type="note" >}}
The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
{{< /admonition >}}

### err-mimir-native-histogram-span-negative-offset

This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram
where a bucket span has a negative offset.

{{< admonition type="note" >}}
The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
{{< /admonition >}}

### err-mimir-native-histogram-spans-buckets-mismatch

This non-critical error occurs when Mimir receives a write request that contains a sample that is a native histogram
where the number of bucket counts does not agree with the number of buckets encoded in the bucket spans.

{{< admonition type="note" >}}
The series containing such samples are skipped during ingestion, and valid series within the same request are ingested.
{{< /admonition >}}

### err-mimir-label-invalid

This non-critical error occurs when Mimir receives a write request that contains a series with an invalid label name.
Expand Down
39 changes: 39 additions & 0 deletions pkg/ingester/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,43 @@ var _ ingesterError = perMetricMetadataLimitReachedError{}
// Ensure that perMetricMetadataLimitReachedError is a softError.
var _ softError = perMetricMetadataLimitReachedError{}

// nativeHistogramValidationError indicates that a native histogram sample failed validation.
// It is not specific to one kind of failure: the id field selects which validation error
// (e.g. count mismatch, negative bucket count, span/bucket mismatch) is being reported.
type nativeHistogramValidationError struct {
id globalerror.ID // error ID used to format the final message
originalErr error // underlying validation error, included verbatim in the message
seriesLabels []mimirpb.LabelAdapter // labels of the series the rejected sample belongs to
timestamp model.Time // timestamp of the rejected sample
}

// newNativeHistogramValidationError builds a nativeHistogramValidationError for the
// given error ID, wrapping originalErr and recording the timestamp and labels of the
// series whose sample was rejected.
func newNativeHistogramValidationError(id globalerror.ID, originalErr error, timestamp model.Time, seriesLabels []mimirpb.LabelAdapter) nativeHistogramValidationError {
	var validationErr nativeHistogramValidationError
	validationErr.id = id
	validationErr.originalErr = originalErr
	validationErr.timestamp = timestamp
	validationErr.seriesLabels = seriesLabels
	return validationErr
}

// Error implements the error interface. The message contains the original validation
// error, the sample timestamp formatted as RFC 3339 (nanosecond precision, UTC) and the
// series labels, all passed through the error ID's Message formatter.
func (e nativeHistogramValidationError) Error() string {
	formattedTimestamp := e.timestamp.Time().UTC().Format(time.RFC3339Nano)
	details := fmt.Sprintf("err: %v. timestamp=%s, series=%s", e.originalErr, formattedTimestamp, e.seriesLabels)
	return e.id.Message(details)
}

// errorCause reports mimirpb.BAD_DATA as the cause of this error.
func (e nativeHistogramValidationError) errorCause() mimirpb.ErrorCause {
return mimirpb.BAD_DATA
}

// soft is an intentionally empty method.
// NOTE(review): presumably this is the marker method required by the softError interface — confirm against its definition.
func (e nativeHistogramValidationError) soft() {}

// Ensure that nativeHistogramValidationError is an ingesterError.
var _ ingesterError = nativeHistogramValidationError{}

// Ensure that nativeHistogramValidationError is a softError.
var _ softError = nativeHistogramValidationError{}

// unavailableError is an ingesterError indicating that the ingester is unavailable.
type unavailableError struct {
state services.State
Expand Down Expand Up @@ -550,6 +587,7 @@ type ingesterErrSamplers struct {
maxMetadataPerMetricLimitExceeded *log.Sampler
maxSeriesPerUserLimitExceeded *log.Sampler
maxMetadataPerUserLimitExceeded *log.Sampler
nativeHistogramValidationError *log.Sampler
}

func newIngesterErrSamplers(freq int64) ingesterErrSamplers {
Expand All @@ -563,6 +601,7 @@ func newIngesterErrSamplers(freq int64) ingesterErrSamplers {
log.NewSampler(freq),
log.NewSampler(freq),
log.NewSampler(freq),
log.NewSampler(freq),
}
}

Expand Down
73 changes: 55 additions & 18 deletions pkg/ingester/ingester.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,13 +97,14 @@ const (
instanceIngestionRateTickInterval = time.Second

// Reasons for discarding samples
reasonSampleOutOfOrder = "sample-out-of-order"
reasonSampleTooOld = "sample-too-old"
reasonSampleTooFarInFuture = "sample-too-far-in-future"
reasonNewValueForTimestamp = "new-value-for-timestamp"
reasonSampleOutOfBounds = "sample-out-of-bounds"
reasonPerUserSeriesLimit = "per_user_series_limit"
reasonPerMetricSeriesLimit = "per_metric_series_limit"
reasonSampleOutOfOrder = "sample-out-of-order"
reasonSampleTooOld = "sample-too-old"
reasonSampleTooFarInFuture = "sample-too-far-in-future"
reasonNewValueForTimestamp = "new-value-for-timestamp"
reasonSampleOutOfBounds = "sample-out-of-bounds"
reasonPerUserSeriesLimit = "per_user_series_limit"
reasonPerMetricSeriesLimit = "per_metric_series_limit"
reasonInvalidNativeHistogram = "invalid-native-histogram"

replicationFactorStatsName = "ingester_replication_factor"
ringStoreStatsName = "ingester_ring_store"
Expand Down Expand Up @@ -878,17 +879,18 @@ type extendedAppender interface {
}

type pushStats struct {
succeededSamplesCount int
failedSamplesCount int
succeededExemplarsCount int
failedExemplarsCount int
sampleOutOfBoundsCount int
sampleOutOfOrderCount int
sampleTooOldCount int
sampleTooFarInFutureCount int
newValueForTimestampCount int
perUserSeriesLimitCount int
perMetricSeriesLimitCount int
succeededSamplesCount int
failedSamplesCount int
succeededExemplarsCount int
failedExemplarsCount int
sampleOutOfBoundsCount int
sampleOutOfOrderCount int
sampleTooOldCount int
sampleTooFarInFutureCount int
newValueForTimestampCount int
perUserSeriesLimitCount int
perMetricSeriesLimitCount int
invalidNativeHistogramCount int
}

type ctxKey int
Expand Down Expand Up @@ -1148,6 +1150,9 @@ func (i *Ingester) updateMetricsFromPushStats(userID string, group string, stats
if stats.perMetricSeriesLimitCount > 0 {
discarded.perMetricSeriesLimit.WithLabelValues(userID, group).Add(float64(stats.perMetricSeriesLimitCount))
}
if stats.invalidNativeHistogramCount > 0 {
discarded.invalidNativeHistogram.WithLabelValues(userID, group).Add(float64(stats.invalidNativeHistogramCount))
}
if stats.succeededSamplesCount > 0 {
i.ingestionRate.Add(int64(stats.succeededSamplesCount))

Expand Down Expand Up @@ -1223,6 +1228,38 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre
return newPerMetricSeriesLimitReachedError(i.limiter.limits.MaxGlobalSeriesPerMetric(userID), labels)
})
return true

// Map TSDB native histogram validation errors to soft errors.
case errors.Is(err, histogram.ErrHistogramCountMismatch):
stats.invalidNativeHistogramCount++
updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
return newNativeHistogramValidationError(globalerror.NativeHistogramCountMismatch, err, model.Time(timestamp), labels)
})
return true
case errors.Is(err, histogram.ErrHistogramCountNotBigEnough):
stats.invalidNativeHistogramCount++
updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
return newNativeHistogramValidationError(globalerror.NativeHistogramCountNotBigEnough, err, model.Time(timestamp), labels)
})
return true
case errors.Is(err, histogram.ErrHistogramNegativeBucketCount):
stats.invalidNativeHistogramCount++
updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
return newNativeHistogramValidationError(globalerror.NativeHistogramNegativeBucketCount, err, model.Time(timestamp), labels)
})
return true
case errors.Is(err, histogram.ErrHistogramSpanNegativeOffset):
stats.invalidNativeHistogramCount++
updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
return newNativeHistogramValidationError(globalerror.NativeHistogramSpanNegativeOffset, err, model.Time(timestamp), labels)
})
return true
case errors.Is(err, histogram.ErrHistogramSpansBucketsMismatch):
stats.invalidNativeHistogramCount++
updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
return newNativeHistogramValidationError(globalerror.NativeHistogramSpansBucketsMismatch, err, model.Time(timestamp), labels)
})
return true
}
return false
}
Expand Down
Loading

0 comments on commit 4ca0d07

Please sign in to comment.