Enable sharding for avg_over_time. (#10373)
**What this PR does / why we need it**:
The range aggregation `avg_over_time` is trivially shardable if there is no label reduction.

If there is one, it can be expressed as
```
sum by (method) (
    sum_over_time(
        {container="app"} | json | unwrap bytes [$__interval]
    )
)
/
sum by (method) (
    count_over_time(
        {container="app"} | json [$__interval]
    )
)
```

Note that `sum by`, `sum_over_time`, and `count_over_time` are themselves shardable, so the rewritten query can be sharded once more.
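
As a toy numeric check of this decomposition (all values invented for illustration, not taken from this PR), merging per-shard sums and counts and then dividing reproduces the global average exactly:

```go
package main

import "fmt"

func sum(xs []float64) float64 {
	var s float64
	for _, x := range xs {
		s += x
	}
	return s
}

func main() {
	// Samples for a single series, split across two shards.
	shard0 := []float64{2, 4}
	shard1 := []float64{6, 8}

	// avg_over_time evaluated over the unsharded data ...
	all := append(append([]float64{}, shard0...), shard1...)
	avg := sum(all) / float64(len(all))

	// ... equals the merged sum_over_time / count_over_time form.
	sharded := (sum(shard0) + sum(shard1)) / float64(len(shard0)+len(shard1))

	fmt.Println(avg == sharded, avg) // true 5
}
```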

**Checklist**
- [ ] Reviewed the
[`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md)
guide (**required**)
- [ ] Documentation added
- [x] Tests updated
- [x] `CHANGELOG.md` updated
- [ ] If the change is worth mentioning in the release notes, add
`add-to-release-notes` label
- [ ] Changes that require user attention or interaction to upgrade are
documented in `docs/sources/setup/upgrade/_index.md`
- [ ] For Helm chart changes bump the Helm chart version in
`production/helm/loki/Chart.yaml` and update
`production/helm/loki/CHANGELOG.md` and
`production/helm/loki/README.md`. [Example
PR](d10549e)

---------

Co-authored-by: Owen Diehl <ow.diehl@gmail.com>
jeschkies and owen-d committed Aug 29, 2023
1 parent 05195f2 commit f8658fd
Showing 9 changed files with 260 additions and 37 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -62,6 +62,7 @@
* [10308](https://github.com/grafana/loki/pull/10308) **bboreham** Tracing: elide small traces for Stats call.
* [10341](https://github.com/grafana/loki/pull/10341) **ashwanthgoli** Deprecate older index types and non-object stores - `aws-dynamo, gcp, gcp-columnkey, bigtable, bigtable-hashed, cassandra, grpc`
* [10344](https://github.com/grafana/loki/pull/10344) **ashwanthgoli** Compactor: deprecate `-boltdb.shipper.compactor.` prefix in favor of `-compactor.`.
* [10373](https://github.com/grafana/loki/pull/10373) **jeschkies** Loki: Shard `avg_over_time` range aggregations.

##### Fixes

14 changes: 8 additions & 6 deletions pkg/logql/downstream_test.go
@@ -23,7 +23,7 @@ func TestMappingEquivalence(t *testing.T) {
shards = 3
nStreams = 60
rounds = 20
streams = randomStreams(nStreams, rounds+1, shards, []string{"a", "b", "c", "d"})
streams = randomStreams(nStreams, rounds+1, shards, []string{"a", "b", "c", "d"}, true)
start = time.Unix(0, 0)
end = time.Unix(0, int64(time.Second*time.Duration(rounds)))
step = time.Second
@@ -51,6 +51,8 @@ func TestMappingEquivalence(t *testing.T) {
{`max(count(rate({a=~".+"}[1s])))`, false},
{`max(sum by (cluster) (rate({a=~".+"}[1s]))) / count(rate({a=~".+"}[1s]))`, false},
{`sum(rate({a=~".+"} |= "foo" != "foo"[1s]) or vector(1))`, false},
{`avg_over_time({a=~".+"} | logfmt | unwrap value [1s])`, false},
{`avg_over_time({a=~".+"} | logfmt | unwrap value [1s]) by (a)`, true},
// topk prefers already-seen values in tiebreakers. Since the test data generates
// the same log lines for each series & the resulting promql.Vectors aren't deterministically
// sorted by labels, we don't expect this to pass.
@@ -106,7 +108,7 @@ func TestShardCounter(t *testing.T) {
shards = 3
nStreams = 60
rounds = 20
streams = randomStreams(nStreams, rounds+1, shards, []string{"a", "b", "c", "d"})
streams = randomStreams(nStreams, rounds+1, shards, []string{"a", "b", "c", "d"}, false)
start = time.Unix(0, 0)
end = time.Unix(0, int64(time.Second*time.Duration(rounds)))
step = time.Second
@@ -168,7 +170,7 @@ func TestRangeMappingEquivalence(t *testing.T) {
shards = 3
nStreams = 60
rounds = 20
streams = randomStreams(nStreams, rounds+1, shards, []string{"a", "b", "c", "d"})
streams = randomStreams(nStreams, rounds+1, shards, []string{"a", "b", "c", "d"}, false)
start = time.Unix(0, 0)
end = time.Unix(0, int64(time.Second*time.Duration(rounds)))
step = time.Second
@@ -427,20 +429,20 @@ func TestRangeMappingEquivalence(t *testing.T) {
// approximatelyEquals ensures two responses are approximately equal,
// up to 6 decimals precision per sample
func approximatelyEquals(t *testing.T, as, bs promql.Matrix) {
require.Equal(t, len(as), len(bs))
require.Len(t, bs, len(as))

for i := 0; i < len(as); i++ {
a := as[i]
b := bs[i]
require.Equal(t, a.Metric, b.Metric)
require.Equal(t, len(a.Floats), len(b.Floats))
require.Lenf(t, b.Floats, len(a.Floats), "at step %d", i)

for j := 0; j < len(a.Floats); j++ {
aSample := &a.Floats[j]
aSample.F = math.Round(aSample.F*1e6) / 1e6
bSample := &b.Floats[j]
bSample.F = math.Round(bSample.F*1e6) / 1e6
}
require.Equal(t, a, b)
require.Equalf(t, a, b, "metric %s differs from %s at %d", a.Metric, b.Metric, i)
}
}
2 changes: 2 additions & 0 deletions pkg/logql/range_vector.go
@@ -422,6 +422,8 @@ func minOverTime(samples []promql.FPoint) float64 {
return min
}

// stdvarOverTime calculates the variance using Welford's online algorithm.
// See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
func stdvarOverTime(samples []promql.FPoint) float64 {
var aux, count, mean float64
for _, v := range samples {
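
The rest of the function body is elided in this view. For reference, a minimal self-contained sketch of Welford's update, matching the variable names above (`promql.FPoint` carries the sample value in its `F` field); this is an illustration under that assumption, not the committed code:

```go
package sketch

import "github.com/prometheus/prometheus/promql"

// stdvarOverTimeSketch computes the population variance in a single pass:
// it maintains the running mean and the accumulated sum of squared
// deviations (aux), then divides by the sample count.
func stdvarOverTimeSketch(samples []promql.FPoint) float64 {
	var aux, count, mean float64
	for _, v := range samples {
		count++
		delta := v.F - mean
		mean += delta / count
		aux += delta * (v.F - mean)
	}
	return aux / count
}
```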
84 changes: 65 additions & 19 deletions pkg/logql/shardmapper.go
@@ -311,6 +311,24 @@ func (m ShardMapper) mapLabelReplaceExpr(expr *syntax.LabelReplaceExpr, r *downs
return &cpy, bytesPerShard, nil
}

// These functions require a different merge strategy than the default
// concatenation.
// This is because the same label sets may exist on multiple shards when label-reducing parsing is applied or when
// grouping by some subset of the labels. In this case, the resulting vector may have multiple values for the same
// series and we need to combine them appropriately given a particular operation.
var rangeMergeMap = map[string]string{
// all these may be summed
syntax.OpRangeTypeCount: syntax.OpTypeSum,
syntax.OpRangeTypeRate: syntax.OpTypeSum,
syntax.OpRangeTypeBytes: syntax.OpTypeSum,
syntax.OpRangeTypeBytesRate: syntax.OpTypeSum,
syntax.OpRangeTypeSum: syntax.OpTypeSum,

// min & max require taking the min|max of the shards
syntax.OpRangeTypeMin: syntax.OpTypeMin,
syntax.OpRangeTypeMax: syntax.OpTypeMax,
}
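
To see why plain concatenation is not enough here, a toy illustration with invented values (not part of the diff): after label-reducing parsing, every shard can emit a sample for the same post-parse label set, and the operation chosen from the map above decides how those duplicates combine:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	// Two shards each contribute a value for the same post-parse series.
	series := `{method="GET"}`
	perShard := []float64{3, 5}

	sum, max := 0.0, math.Inf(-1)
	for _, v := range perShard {
		sum += v               // merge for count/rate/bytes/sum_over_time
		max = math.Max(max, v) // merge for max_over_time
	}
	fmt.Printf("%s sum=%v max=%v\n", series, sum, max) // {method="GET"} sum=8 max=5
}
```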

func (m ShardMapper) mapRangeAggregationExpr(expr *syntax.RangeAggregationExpr, r *downstreamRecorder) (syntax.SampleExpr, uint64, error) {
if !expr.Shardable() {
exprStats, err := m.shards.GetStats(expr)
@@ -332,24 +350,6 @@ func (m ShardMapper) mapRangeAggregationExpr(expr *syntax.RangeAggregationExpr,
return m.mapSampleExpr(expr, r)
}

// These functions require a different merge strategy than the default
// concatenation.
// This is because the same label sets may exist on multiple shards when label-reducing parsing is applied or when
// grouping by some subset of the labels. In this case, the resulting vector may have multiple values for the same
// series and we need to combine them appropriately given a particular operation.
mergeMap := map[string]string{
// all these may be summed
syntax.OpRangeTypeCount: syntax.OpTypeSum,
syntax.OpRangeTypeRate: syntax.OpTypeSum,
syntax.OpRangeTypeBytes: syntax.OpTypeSum,
syntax.OpRangeTypeBytesRate: syntax.OpTypeSum,
syntax.OpRangeTypeSum: syntax.OpTypeSum,

// min & max require taking the min|max of the shards
syntax.OpRangeTypeMin: syntax.OpTypeMin,
syntax.OpRangeTypeMax: syntax.OpTypeMax,
}

// range aggregation groupings default to `without ()` behavior
// so we explicitly set the wrapping vector aggregation to this
// for parity when it's not explicitly set
@@ -361,7 +361,7 @@ func (m ShardMapper) mapRangeAggregationExpr(expr *syntax.RangeAggregationExpr,
mapped, bytes, err := m.mapSampleExpr(expr, r)
// max_over_time(_) -> max without() (max_over_time(_) ++ max_over_time(_)...)
// max_over_time(_) by (foo) -> max by (foo) (max_over_time(_) by (foo) ++ max_over_time(_) by (foo)...)
merger, ok := mergeMap[expr.Operation]
merger, ok := rangeMergeMap[expr.Operation]
if !ok {
return nil, 0, fmt.Errorf(
"error while finding merge operation for %s", expr.Operation,
@@ -373,6 +373,52 @@ func (m ShardMapper) mapRangeAggregationExpr(expr *syntax.RangeAggregationExpr,
Operation: merger,
}, bytes, err

case syntax.OpRangeTypeAvg:
potentialConflict := syntax.ReducesLabels(expr)
if !potentialConflict && (expr.Grouping == nil || expr.Grouping.Noop()) {
return m.mapSampleExpr(expr, r)
}

// avg_over_time(_) by (foo) -> sum by (foo) (sum_over_time(_)) / sum by (foo) (count_over_time(_))
lhs, lhsBytesPerShard, err := m.mapVectorAggregationExpr(&syntax.VectorAggregationExpr{
Left: &syntax.RangeAggregationExpr{
Left: expr.Left,
Operation: syntax.OpRangeTypeSum,
},
Grouping: expr.Grouping,
Operation: syntax.OpTypeSum,
}, r)
if err != nil {
return nil, 0, err
}

// Strip unwrap from log range
countOverTimeSelector, err := expr.Left.WithoutUnwrap()
if err != nil {
return nil, 0, err
}

rhs, rhsBytesPerShard, err := m.mapVectorAggregationExpr(&syntax.VectorAggregationExpr{
Left: &syntax.RangeAggregationExpr{
Left: countOverTimeSelector,
Operation: syntax.OpRangeTypeCount,
},
Grouping: expr.Grouping,
Operation: syntax.OpTypeSum,
}, r)
if err != nil {
return nil, 0, err
}

// We take the maximum bytes per shard of both sides of the operation
bytesPerShard := uint64(math.Max(float64(lhsBytesPerShard), float64(rhsBytesPerShard)))

return &syntax.BinOpExpr{
SampleExpr: lhs,
RHS: rhs,
Op: syntax.OpTypeDiv,
}, bytesPerShard, nil

default:
// don't shard if there's not an appropriate optimization
exprStats, err := m.shards.GetStats(expr)
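
To make the mapping concrete, a small hedged sketch (with `[5m]` substituted for the description's `[$__interval]`, and the pre-v3 import path assumed) showing that the rewritten form from the PR description parses with the same `syntax.ParseExpr` the tests below use, yielding the `BinOpExpr` this case constructs:

```go
package main

import (
	"fmt"

	"github.com/grafana/loki/pkg/logql/syntax"
)

func main() {
	// Sketch only: expressions copied from the PR description.
	expr, err := syntax.ParseExpr(
		`sum by (method) (sum_over_time({container="app"} | json | unwrap bytes [5m]))` +
			` / ` +
			`sum by (method) (count_over_time({container="app"} | json [5m]))`,
	)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%T\n", expr) // expected: *syntax.BinOpExpr, as constructed above
}
```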
154 changes: 150 additions & 4 deletions pkg/logql/shardmapper_test.go
@@ -328,12 +328,40 @@ func TestMappingStrings(t *testing.T) {
)`,
},
{
in: `avg(avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m]))`,
out: `avg(avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m]))`,
in: `avg(avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m]))`,
out: `(
sum(
downstream<sum(avg_over_time({job=~"myapps.*"}|="stats" | json busy="utilization" | unwrap busy [5m])),shard=0_of_2>
++
downstream<sum(avg_over_time({job=~"myapps.*"}|="stats" | json busy="utilization" | unwrap busy [5m])),shard=1_of_2>)
/
sum(
downstream<count(avg_over_time({job=~"myapps.*"}|="stats" | json busy="utilization" | unwrap busy [5m])),shard=0_of_2>
++
downstream<count(avg_over_time({job=~"myapps.*"}|="stats" | json busy="utilization" |unwrap busy [5m])),shard=1_of_2>
)
)`,
},
{
in: `avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m])`,
out: `avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m])`,
in: `avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m])`,
out: `downstream<avg_over_time({job=~"myapps.*"}|= "stats" | json busy="utilization" | unwrap busy [5m]),shard=0_of_2>
++ downstream<avg_over_time({job=~"myapps.*"}|="stats" | json busy="utilization" | unwrap busy [5m]),shard=1_of_2>`,
},
{
in: `avg_over_time({job=~"myapps.*"} |= "stats" | json busy="utilization" | unwrap busy [5m]) by (cluster)`,
out: `(
sum by (cluster) (
downstream<sum by (cluster) (sum_over_time({job=~"myapps.*"}|="stats" | json busy="utilization" | unwrap busy [5m])),shard=0_of_2>
++
downstream<sum by (cluster) (sum_over_time({job=~"myapps.*"}|="stats" | json busy="utilization" | unwrap busy [5m])),shard=1_of_2>
)
/
sum by (cluster) (
downstream<sum by (cluster) (count_over_time({job=~"myapps.*"}|="stats" | json busy="utilization" [5m])),shard=0_of_2>
++
downstream<sum by (cluster) (count_over_time({job=~"myapps.*"}|="stats" | json busy="utilization" [5m])),shard=1_of_2>
)
)`,
},
// should be noop if VectorExpr
{
@@ -1180,6 +1208,124 @@ func TestMapping(t *testing.T) {
},
},
},
{
in: `avg_over_time({foo="bar"} | unwrap bytes [5m]) by (cluster)`,
expr: &syntax.BinOpExpr{
Op: syntax.OpTypeDiv,
SampleExpr: &syntax.VectorAggregationExpr{
Grouping: &syntax.Grouping{
Groups: []string{"cluster"},
},
Operation: syntax.OpTypeSum,
Left: &ConcatSampleExpr{
DownstreamSampleExpr: DownstreamSampleExpr{
shard: &astmapper.ShardAnnotation{
Shard: 0,
Of: 2,
},
SampleExpr: &syntax.VectorAggregationExpr{
Grouping: &syntax.Grouping{
Groups: []string{"cluster"},
},
Operation: syntax.OpTypeSum,
Left: &syntax.RangeAggregationExpr{
Operation: syntax.OpRangeTypeSum,
Left: &syntax.LogRange{
Left: &syntax.MatchersExpr{
Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "foo", "bar")},
},
Interval: 5 * time.Minute,
Unwrap: &syntax.UnwrapExpr{
Identifier: "bytes",
},
},
},
},
},
next: &ConcatSampleExpr{
DownstreamSampleExpr: DownstreamSampleExpr{
shard: &astmapper.ShardAnnotation{
Shard: 1,
Of: 2,
},
SampleExpr: &syntax.VectorAggregationExpr{
Grouping: &syntax.Grouping{
Groups: []string{"cluster"},
},
Operation: syntax.OpTypeSum,
Left: &syntax.RangeAggregationExpr{
Operation: syntax.OpRangeTypeSum,
Left: &syntax.LogRange{
Left: &syntax.MatchersExpr{
Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "foo", "bar")},
},
Interval: 5 * time.Minute,
Unwrap: &syntax.UnwrapExpr{
Identifier: "bytes",
},
},
},
},
},
next: nil,
},
},
},
RHS: &syntax.VectorAggregationExpr{
Operation: syntax.OpTypeSum,
Grouping: &syntax.Grouping{
Groups: []string{"cluster"},
},
Left: &ConcatSampleExpr{
DownstreamSampleExpr: DownstreamSampleExpr{
shard: &astmapper.ShardAnnotation{
Shard: 0,
Of: 2,
},
SampleExpr: &syntax.VectorAggregationExpr{
Grouping: &syntax.Grouping{
Groups: []string{"cluster"},
},
Operation: syntax.OpTypeSum,
Left: &syntax.RangeAggregationExpr{
Operation: syntax.OpRangeTypeCount,
Left: &syntax.LogRange{
Left: &syntax.MatchersExpr{
Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "foo", "bar")},
},
Interval: 5 * time.Minute,
},
},
},
},
next: &ConcatSampleExpr{
DownstreamSampleExpr: DownstreamSampleExpr{
shard: &astmapper.ShardAnnotation{
Shard: 1,
Of: 2,
},
SampleExpr: &syntax.VectorAggregationExpr{
Grouping: &syntax.Grouping{
Groups: []string{"cluster"},
},
Operation: syntax.OpTypeSum,
Left: &syntax.RangeAggregationExpr{
Operation: syntax.OpRangeTypeCount,
Left: &syntax.LogRange{
Left: &syntax.MatchersExpr{
Mts: []*labels.Matcher{mustNewMatcher(labels.MatchEqual, "foo", "bar")},
},
Interval: 5 * time.Minute,
},
},
},
},
next: nil,
},
},
},
},
},
} {
t.Run(tc.in, func(t *testing.T) {
ast, err := syntax.ParseExpr(tc.in)
