diff --git a/CHANGELOG.md b/CHANGELOG.md index 2cda8eebb74..cf0ffadc26f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## Grafana Mimir - main / unreleased * [CHANGE] Compactor: No longer upload debug meta files to object storage. #1257 +* [FEATURE] Ruler: Allow setting `evaluation_delay` for each rule group via rules group configuration file. #1474 * [FEATURE] Distributor: Added the ability to forward specifics metrics to alternative remote_write API endpoints. #1052 * [ENHANCEMENT] Ruler: Add more detailed query information to ruler query stats logging. #1411 diff --git a/go.mod b/go.mod index 7a673bbbdac..2dd31f9e592 100644 --- a/go.mod +++ b/go.mod @@ -214,7 +214,7 @@ replace git.apache.org/thrift.git => github.com/apache/thrift v0.0.0-20180902110 replace github.com/bradfitz/gomemcache => github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab // Using a fork of Prometheus while we work on querysharding to avoid a dependency on the upstream. -replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20220210151959-f8e3195f7500 +replace github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20220314132007-23ce9ad9f0ff // Pin hashicorp depencencies since the Prometheus fork, go mod tries to update them. replace github.com/hashicorp/go-immutable-radix => github.com/hashicorp/go-immutable-radix v1.2.0 diff --git a/go.sum b/go.sum index bb6b892d16e..78b82b7b5bb 100644 --- a/go.sum +++ b/go.sum @@ -1000,8 +1000,8 @@ github.com/grafana/dskit v0.0.0-20220211095946-19921f863583/go.mod h1:q51XdMLLHN github.com/grafana/e2e v0.1.0 h1:nThd0U0TjUqyOOupSb+qDd4BOdhqwhR/oYbjoqiMlZk= github.com/grafana/e2e v0.1.0/go.mod h1:+26VJWpczg2OU3D0537acnHSHzhJORpxOs6F+M27tZo= github.com/grafana/memberlist v0.2.5-0.20211201083710-c7bc8e9df94b/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOnAH9VT3Sh9MUE= -github.com/grafana/mimir-prometheus v0.0.0-20220210151959-f8e3195f7500 h1:1l+T/VWSSC3Uz+9Pkgxv7pUngRLpoeRWagX3DPsTn6I= -github.com/grafana/mimir-prometheus v0.0.0-20220210151959-f8e3195f7500/go.mod h1:6K+MGuCdYASOcOEKusiGUeYeRoobrW/26smN9OCXb0M= +github.com/grafana/mimir-prometheus v0.0.0-20220314132007-23ce9ad9f0ff h1:8cfiJnBz2zYCBvMNK1zAmVc7lu27mQBtuHxoCdxpqPw= +github.com/grafana/mimir-prometheus v0.0.0-20220314132007-23ce9ad9f0ff/go.mod h1:6K+MGuCdYASOcOEKusiGUeYeRoobrW/26smN9OCXb0M= github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= github.com/grpc-ecosystem/go-grpc-middleware v1.0.1-0.20190118093823-f849b5445de4/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= diff --git a/pkg/ruler/compat.go b/pkg/ruler/compat.go index 06e614fb70c..4aaf095bd55 100644 --- a/pkg/ruler/compat.go +++ b/pkg/ruler/compat.go @@ -16,7 +16,6 @@ import ( "github.com/prometheus/client_golang/prometheus/promauto" "github.com/prometheus/prometheus/model/exemplar" "github.com/prometheus/prometheus/model/labels" - "github.com/prometheus/prometheus/model/value" "github.com/prometheus/prometheus/notifier" "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/rules" @@ -39,29 +38,15 @@ type PusherAppender struct { failedWrites prometheus.Counter totalWrites prometheus.Counter - ctx context.Context - pusher Pusher - labels []labels.Labels - samples []mimirpb.Sample - userID string - evaluationDelay time.Duration + ctx context.Context + pusher Pusher + labels []labels.Labels + samples []mimirpb.Sample + userID string } func (a *PusherAppender) Append(_ storage.SeriesRef, l labels.Labels, t int64, v float64) (storage.SeriesRef, error) { a.labels = append(a.labels, l) - - // Adapt staleness markers for ruler evaluation delay. As the upstream code - // is using the actual time, when there is a no longer available series. - // This then causes 'out of order' append failures once the series is - // becoming available again. - // see https://github.com/prometheus/prometheus/blob/6c56a1faaaad07317ff585bda75b99bdba0517ad/rules/manager.go#L647-L660 - // Similar to staleness markers, the rule manager also appends actual time to the ALERTS and ALERTS_FOR_STATE series. - // See: https://github.com/prometheus/prometheus/blob/ae086c73cb4d6db9e8b67d5038d3704fea6aec4a/rules/alerting.go#L414-L417 - metricName := l.Get(labels.MetricName) - if a.evaluationDelay > 0 && (value.IsStaleNaN(v) || metricName == "ALERTS" || metricName == "ALERTS_FOR_STATE") { - t -= a.evaluationDelay.Milliseconds() - } - a.samples = append(a.samples, mimirpb.Sample{ TimestampMs: t, Value: v, @@ -100,9 +85,8 @@ func (a *PusherAppender) Rollback() error { // PusherAppendable fulfills the storage.Appendable interface for prometheus manager type PusherAppendable struct { - pusher Pusher - userID string - rulesLimits RulesLimits + pusher Pusher + userID string totalWrites prometheus.Counter failedWrites prometheus.Counter @@ -112,7 +96,6 @@ func NewPusherAppendable(pusher Pusher, userID string, limits RulesLimits, total return &PusherAppendable{ pusher: pusher, userID: userID, - rulesLimits: limits, totalWrites: totalWrites, failedWrites: failedWrites, } @@ -124,10 +107,9 @@ func (t *PusherAppendable) Appender(ctx context.Context) storage.Appender { failedWrites: t.failedWrites, totalWrites: t.totalWrites, - ctx: ctx, - pusher: t.pusher, - userID: t.userID, - evaluationDelay: t.rulesLimits.EvaluationDelay(t.userID), + ctx: ctx, + pusher: t.pusher, + userID: t.userID, } } @@ -139,18 +121,6 @@ type RulesLimits interface { RulerMaxRulesPerRuleGroup(userID string) int } -// EngineQueryFunc returns a new query function using the rules.EngineQueryFunc function -// and passing an altered timestamp. -func EngineQueryFunc(engine *promql.Engine, q storage.Queryable, overrides RulesLimits, userID string) rules.QueryFunc { - return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { - orig := rules.EngineQueryFunc(engine, q) - // Delay the evaluation of all rules by a set interval to give a buffer - // to metric that haven't been forwarded to cortex yet. - evaluationDelay := overrides.EvaluationDelay(userID) - return orig(ctx, qs, t.Add(-evaluationDelay)) - } -} - func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Counter) rules.QueryFunc { return func(ctx context.Context, qs string, t time.Time) (promql.Vector, error) { queries.Inc() @@ -282,7 +252,7 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, queryable, federatedQuery // Errors from PromQL are always "user" errors. q = querier.NewErrorTranslateQueryableWithFn(q, WrapQueryableErrors) - queryFunc = EngineQueryFunc(engine, q, overrides, userID) + queryFunc = rules.EngineQueryFunc(engine, q) queryFunc = MetricsQueryFunc(queryFunc, totalQueries, failedQueries) queryFunc = RecordAndReportRuleQueryMetrics(queryFunc, queryTime, logger) return queryFunc @@ -304,6 +274,11 @@ func DefaultTenantManagerFactory(cfg Config, p Pusher, queryable, federatedQuery OutageTolerance: cfg.OutageTolerance, ForGracePeriod: cfg.ForGracePeriod, ResendDelay: cfg.ResendDelay, + DefaultEvaluationDelay: func() time.Duration { + // Delay the evaluation of all rules by a set interval to give a buffer + // to metric that haven't been forwarded to Mimir yet. + return overrides.EvaluationDelay(userID) + }, }) } } diff --git a/pkg/ruler/compat_test.go b/pkg/ruler/compat_test.go index 1eb1d0408a2..b8a10f92754 100644 --- a/pkg/ruler/compat_test.go +++ b/pkg/ruler/compat_test.go @@ -48,7 +48,6 @@ func TestPusherAppendable(t *testing.T) { for _, tc := range []struct { name string series string - evalDelay time.Duration value float64 expectedTS int64 }{ @@ -65,50 +64,20 @@ func TestPusherAppendable(t *testing.T) { expectedTS: 120_000, }, { - name: "tenant with delay, normal value", - series: "foo_bar", - value: 1.234, - expectedTS: 120_000, - evalDelay: time.Minute, - }, - { - name: "tenant with delay, stale nan value", - value: math.Float64frombits(value.StaleNaN), - expectedTS: 60_000, - evalDelay: time.Minute, - }, - { - name: "ALERTS without delay, normal value", + name: "ALERTS, normal value", series: `ALERTS{alertname="boop"}`, value: 1.234, expectedTS: 120_000, }, { - name: "ALERTS without delay, stale nan value", + name: "ALERTS, stale nan value", series: `ALERTS{alertname="boop"}`, value: math.Float64frombits(value.StaleNaN), expectedTS: 120_000, }, - { - name: "ALERTS with delay, normal value", - series: `ALERTS{alertname="boop"}`, - value: 1.234, - expectedTS: 60_000, - evalDelay: time.Minute, - }, - { - name: "ALERTS with delay, stale nan value", - series: `ALERTS_FOR_STATE{alertname="boop"}`, - value: math.Float64frombits(value.StaleNaN), - expectedTS: 60_000, - evalDelay: time.Minute, - }, } { t.Run(tc.name, func(t *testing.T) { ctx := context.Background() - pa.rulesLimits = &ruleLimits{ - evalDelay: tc.evalDelay, - } lbls, err := parser.ParseMetric(tc.series) require.NoError(t, err) diff --git a/vendor/github.com/prometheus/prometheus/model/rulefmt/rulefmt.go b/vendor/github.com/prometheus/prometheus/model/rulefmt/rulefmt.go index a550af72403..898a2e39de6 100644 --- a/vendor/github.com/prometheus/prometheus/model/rulefmt/rulefmt.go +++ b/vendor/github.com/prometheus/prometheus/model/rulefmt/rulefmt.go @@ -116,11 +116,12 @@ func (g *RuleGroups) Validate(node ruleGroups) (errs []error) { // RuleGroup is a list of sequentially evaluated recording and alerting rules. type RuleGroup struct { - Name string `yaml:"name"` - Interval model.Duration `yaml:"interval,omitempty"` - Limit int `yaml:"limit,omitempty"` - Rules []RuleNode `yaml:"rules"` - SourceTenants []string `yaml:"source_tenants,omitempty"` + Name string `yaml:"name"` + Interval model.Duration `yaml:"interval,omitempty"` + EvaluationDelay *model.Duration `yaml:"evaluation_delay,omitempty"` + Limit int `yaml:"limit,omitempty"` + Rules []RuleNode `yaml:"rules"` + SourceTenants []string `yaml:"source_tenants,omitempty"` } // Rule describes an alerting or recording rule. diff --git a/vendor/github.com/prometheus/prometheus/rules/alerting.go b/vendor/github.com/prometheus/prometheus/rules/alerting.go index 929f7586df4..0a3c542e28f 100644 --- a/vendor/github.com/prometheus/prometheus/rules/alerting.go +++ b/vendor/github.com/prometheus/prometheus/rules/alerting.go @@ -304,8 +304,8 @@ const resolvedRetention = 15 * time.Minute // Eval evaluates the rule expression and then creates pending alerts and fires // or removes previously pending alerts accordingly. -func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, externalURL *url.URL, limit int) (promql.Vector, error) { - res, err := query(ctx, r.vector.String(), ts) +func (r *AlertingRule) Eval(ctx context.Context, evalDelay time.Duration, ts time.Time, query QueryFunc, externalURL *url.URL, limit int) (promql.Vector, error) { + res, err := query(ctx, r.vector.String(), ts.Add(-evalDelay)) if err != nil { return nil, err } @@ -419,8 +419,8 @@ func (r *AlertingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, } if r.restored { - vec = append(vec, r.sample(a, ts)) - vec = append(vec, r.forStateSample(a, ts, float64(a.ActiveAt.Unix()))) + vec = append(vec, r.sample(a, ts.Add(-evalDelay))) + vec = append(vec, r.forStateSample(a, ts.Add(-evalDelay), float64(a.ActiveAt.Unix()))) } } diff --git a/vendor/github.com/prometheus/prometheus/rules/manager.go b/vendor/github.com/prometheus/prometheus/rules/manager.go index 15d26b14e45..9c02275fb73 100644 --- a/vendor/github.com/prometheus/prometheus/rules/manager.go +++ b/vendor/github.com/prometheus/prometheus/rules/manager.go @@ -212,8 +212,9 @@ type Rule interface { Name() string // Labels of the rule. Labels() labels.Labels - // eval evaluates the rule, including any associated recording or alerting actions. - Eval(context.Context, time.Time, QueryFunc, *url.URL, int) (promql.Vector, error) + // Eval evaluates the rule, including any associated recording or alerting actions. + // The duration passed is the evaluation delay. + Eval(context.Context, time.Duration, time.Time, QueryFunc, *url.URL, int) (promql.Vector, error) // String returns a human-readable string representation of the rule. String() string // Query returns the rule query expression. @@ -244,6 +245,7 @@ type Group struct { name string file string interval time.Duration + evaluationDelay *time.Duration limit int rules []Rule sourceTenants []string @@ -267,14 +269,15 @@ type Group struct { } type GroupOptions struct { - Name, File string - Interval time.Duration - Limit int - Rules []Rule - SourceTenants []string - ShouldRestore bool - Opts *ManagerOptions - done chan struct{} + Name, File string + Interval time.Duration + Limit int + Rules []Rule + SourceTenants []string + ShouldRestore bool + Opts *ManagerOptions + EvaluationDelay *time.Duration + done chan struct{} } // NewGroup makes a new Group with the given name, options, and rules. @@ -299,6 +302,7 @@ func NewGroup(o GroupOptions) *Group { name: o.Name, file: o.File, interval: o.Interval, + evaluationDelay: o.EvaluationDelay, limit: o.Limit, rules: o.Rules, shouldRestore: o.ShouldRestore, @@ -583,6 +587,7 @@ func (g *Group) CopyState(from *Group) { // Eval runs a single evaluation cycle in which all rules are evaluated sequentially. func (g *Group) Eval(ctx context.Context, ts time.Time) { var samplesTotal float64 + evaluationDelay := g.EvaluationDelay() for i, rule := range g.rules { select { case <-g.done: @@ -604,7 +609,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { g.metrics.EvalTotal.WithLabelValues(GroupKey(g.File(), g.Name())).Inc() - vector, err := rule.Eval(ctx, ts, g.opts.QueryFunc, g.opts.ExternalURL, g.Limit()) + vector, err := rule.Eval(ctx, evaluationDelay, ts, g.opts.QueryFunc, g.opts.ExternalURL, g.Limit()) if err != nil { rule.SetHealth(HealthBad) rule.SetLastError(err) @@ -673,7 +678,7 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { for metric, lset := range g.seriesInPreviousEval[i] { if _, ok := seriesReturned[metric]; !ok { // Series no longer exposed, mark it stale. - _, err = app.Append(0, lset, timestamp.FromTime(ts), math.Float64frombits(value.StaleNaN)) + _, err = app.Append(0, lset, timestamp.FromTime(ts.Add(-evaluationDelay)), math.Float64frombits(value.StaleNaN)) switch errors.Cause(err) { case nil: case storage.ErrOutOfOrderSample, storage.ErrDuplicateSampleForTimestamp: @@ -692,14 +697,25 @@ func (g *Group) Eval(ctx context.Context, ts time.Time) { g.cleanupStaleSeries(ctx, ts) } +func (g *Group) EvaluationDelay() time.Duration { + if g.evaluationDelay != nil { + return *g.evaluationDelay + } + if g.opts.DefaultEvaluationDelay != nil { + return g.opts.DefaultEvaluationDelay() + } + return time.Duration(0) +} + func (g *Group) cleanupStaleSeries(ctx context.Context, ts time.Time) { if len(g.staleSeries) == 0 { return } app := g.opts.Appendable.Appender(ctx) + evaluationDelay := g.EvaluationDelay() for _, s := range g.staleSeries { // Rule that produced series no longer configured, mark it stale. - _, err := app.Append(0, s, timestamp.FromTime(ts), math.Float64frombits(value.StaleNaN)) + _, err := app.Append(0, s, timestamp.FromTime(ts.Add(-evaluationDelay)), math.Float64frombits(value.StaleNaN)) switch errors.Cause(err) { case nil: case storage.ErrOutOfOrderSample, storage.ErrDuplicateSampleForTimestamp: @@ -935,6 +951,7 @@ type ManagerOptions struct { ForGracePeriod time.Duration ResendDelay time.Duration GroupLoader GroupLoader + DefaultEvaluationDelay func() time.Duration Metrics *Metrics } @@ -1131,15 +1148,16 @@ func (m *Manager) LoadGroups( } groups[GroupKey(fn, rg.Name)] = NewGroup(GroupOptions{ - Name: rg.Name, - File: fn, - Interval: itv, - Limit: rg.Limit, - Rules: rules, - SourceTenants: rg.SourceTenants, - ShouldRestore: shouldRestore, - Opts: m.opts, - done: m.done, + Name: rg.Name, + File: fn, + Interval: itv, + Limit: rg.Limit, + Rules: rules, + SourceTenants: rg.SourceTenants, + ShouldRestore: shouldRestore, + Opts: m.opts, + EvaluationDelay: (*time.Duration)(rg.EvaluationDelay), + done: m.done, }) } } diff --git a/vendor/github.com/prometheus/prometheus/rules/recording.go b/vendor/github.com/prometheus/prometheus/rules/recording.go index 0681db9a2dd..13074158e20 100644 --- a/vendor/github.com/prometheus/prometheus/rules/recording.go +++ b/vendor/github.com/prometheus/prometheus/rules/recording.go @@ -73,8 +73,8 @@ func (rule *RecordingRule) Labels() labels.Labels { } // Eval evaluates the rule and then overrides the metric names and labels accordingly. -func (rule *RecordingRule) Eval(ctx context.Context, ts time.Time, query QueryFunc, _ *url.URL, limit int) (promql.Vector, error) { - vector, err := query(ctx, rule.vector.String(), ts) +func (rule *RecordingRule) Eval(ctx context.Context, evalDelay time.Duration, ts time.Time, query QueryFunc, _ *url.URL, limit int) (promql.Vector, error) { + vector, err := query(ctx, rule.vector.String(), ts.Add(-evalDelay)) if err != nil { return nil, err } diff --git a/vendor/github.com/prometheus/prometheus/web/api/v1/api.go b/vendor/github.com/prometheus/prometheus/web/api/v1/api.go index e6a6daffca8..e31acab9cf1 100644 --- a/vendor/github.com/prometheus/prometheus/web/api/v1/api.go +++ b/vendor/github.com/prometheus/prometheus/web/api/v1/api.go @@ -150,7 +150,6 @@ type TSDBAdminStats interface { CleanTombstones() error Delete(mint, maxt int64, ms ...*labels.Matcher) error Snapshot(dir string, withHead bool) error - Stats(statsByLabelName string) (*tsdb.Stats, error) WALReplayStatus() (tsdb.WALReplayStatus, error) } diff --git a/vendor/modules.txt b/vendor/modules.txt index 55cb888e41a..8e8f23b74e3 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -695,7 +695,7 @@ github.com/prometheus/node_exporter/https github.com/prometheus/procfs github.com/prometheus/procfs/internal/fs github.com/prometheus/procfs/internal/util -# github.com/prometheus/prometheus v1.8.2-0.20211217191541-41f1a8125e66 => github.com/grafana/mimir-prometheus v0.0.0-20220210151959-f8e3195f7500 +# github.com/prometheus/prometheus v1.8.2-0.20211217191541-41f1a8125e66 => github.com/grafana/mimir-prometheus v0.0.0-20220314132007-23ce9ad9f0ff ## explicit; go 1.16 github.com/prometheus/prometheus/config github.com/prometheus/prometheus/discovery @@ -1158,7 +1158,7 @@ gopkg.in/yaml.v2 gopkg.in/yaml.v3 # git.apache.org/thrift.git => github.com/apache/thrift v0.0.0-20180902110319-2566ecd5d999 # github.com/bradfitz/gomemcache => github.com/themihai/gomemcache v0.0.0-20180902122335-24332e2d58ab -# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20220210151959-f8e3195f7500 +# github.com/prometheus/prometheus => github.com/grafana/mimir-prometheus v0.0.0-20220314132007-23ce9ad9f0ff # github.com/hashicorp/go-immutable-radix => github.com/hashicorp/go-immutable-radix v1.2.0 # github.com/hashicorp/go-hclog => github.com/hashicorp/go-hclog v0.12.2 # github.com/hashicorp/memberlist v0.2.4 => github.com/grafana/memberlist v0.2.5-0.20211201083710-c7bc8e9df94b