From a835d705231b214916196671dcb4bde70dc7f31c Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Tue, 23 Oct 2018 12:46:53 -0400 Subject: [PATCH 1/3] exec: add runTests helper This test helper runs tests over the given input tuples with differing batch sizes and random selection vectors. Release note: None --- pkg/sql/exec/main_test.go | 27 ++++++++++++++++ pkg/sql/exec/utils_test.go | 64 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 89 insertions(+), 2 deletions(-) create mode 100644 pkg/sql/exec/main_test.go diff --git a/pkg/sql/exec/main_test.go b/pkg/sql/exec/main_test.go new file mode 100644 index 000000000000..57cf26d61ce5 --- /dev/null +++ b/pkg/sql/exec/main_test.go @@ -0,0 +1,27 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package exec + +import ( + "os" + "testing" + + "github.com/cockroachdb/cockroach/pkg/util/randutil" +) + +func TestMain(m *testing.M) { + randutil.SeedForTests() + os.Exit(m.Run()) +} diff --git a/pkg/sql/exec/utils_test.go b/pkg/sql/exec/utils_test.go index 561207466e23..e4a4bb6cfe7b 100644 --- a/pkg/sql/exec/utils_test.go +++ b/pkg/sql/exec/utils_test.go @@ -16,9 +16,12 @@ package exec import ( "fmt" + "math/rand" "reflect" "testing" + "github.com/cockroachdb/cockroach/pkg/util/randutil" + "github.com/cockroachdb/cockroach/pkg/sql/exec/types" "github.com/pkg/errors" ) @@ -29,6 +32,30 @@ type tuple []interface{} // tuples represents a table of a single type. 
type tuples []tuple +// runTests is a helper that automatically runs your tests with varied batch +// sizes and with and without a random selection vector. +// Provide a test function that takes an input Operator, which will give back +// the tuples provided in batches. +func runTests( + t *testing.T, tups tuples, extraTypes []types.T, test func(t *testing.T, input Operator), +) { + rng, _ := randutil.NewPseudoRand() + + for _, batchSize := range []uint16{1, 2, 3, 16, 1024} { + for _, useSel := range []bool{false, true} { + t.Run(fmt.Sprintf("batchSize=%d/sel=%t", batchSize, useSel), func(t *testing.T) { + var tupleSource Operator + if useSel { + tupleSource = newOpTestSelInput(rng, batchSize, tups, extraTypes...) + } else { + tupleSource = newOpTestInput(batchSize, tups, extraTypes...) + } + test(t, tupleSource) + }) + } + } +} + // opTestInput is an Operator that columnarizes test input in the form of tuples // of arbitrary Go types. It's meant to be used in Operator unit tests in // conjunction with opTestOutput like the following: @@ -50,6 +77,9 @@ type opTestInput struct { batchSize uint16 tuples tuples batch ColBatch + useSel bool + rng *rand.Rand + selection []uint16 } var _ Operator = &opTestInput{} @@ -69,6 +99,20 @@ func newOpTestInput(batchSize uint16, tuples tuples, extraCols ...types.T) *opTe return ret } +func newOpTestSelInput( + rng *rand.Rand, batchSize uint16, tuples tuples, extraCols ...types.T, +) *opTestInput { + ret := &opTestInput{ + useSel: true, + rng: rng, + batchSize: batchSize, + tuples: tuples, + extraCols: extraCols, + } + ret.Init() + return ret +} + func (s *opTestInput) Init() { if len(s.tuples) == 0 { panic("empty tuple source") @@ -80,6 +124,11 @@ func (s *opTestInput) Init() { } s.typs = typs s.batch = NewMemBatch(append(typs, s.extraCols...)) + + s.selection = make([]uint16, ColBatchSize) + for i := range s.selection { + s.selection[i] = uint16(i) + } } func (s *opTestInput) Next() ColBatch { @@ -101,19 +150,30 @@ func (s 
*opTestInput) Next() ColBatch { tups[i], tupleLen)) } } + + if s.useSel { + s.rng.Shuffle(len(s.selection), func(i, j int) { + s.selection[i], s.selection[j] = s.selection[j], s.selection[i] + }) + s.batch.SetSelection(true) + copy(s.batch.Selection(), s.selection) + } else { + s.batch.SetSelection(false) + } + for i := range s.typs { vec := s.batch.ColVec(i) // Automatically convert the Go values into exec.Type slice elements using // reflection. This is slow, but acceptable for tests. col := reflect.ValueOf(vec.Col()) for j := uint16(0); j < batchSize; j++ { - col.Index(int(j)).Set( + outputIdx := s.selection[j] + col.Index(int(outputIdx)).Set( reflect.ValueOf(tups[j][i]).Convert(reflect.TypeOf(vec.Col()).Elem())) } } s.batch.SetLength(batchSize) - s.batch.SetSelection(false) return s.batch } From 1f42429096472a99a700b32464bec1685616e0df Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Mon, 22 Oct 2018 01:01:37 -0400 Subject: [PATCH 2/3] exec: add the boolean zero operator It zeroes a column. This should be execgen templated later. Release note: None --- pkg/sql/exec/zero.go | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 pkg/sql/exec/zero.go diff --git a/pkg/sql/exec/zero.go b/pkg/sql/exec/zero.go new file mode 100644 index 000000000000..1f45beebc224 --- /dev/null +++ b/pkg/sql/exec/zero.go @@ -0,0 +1,36 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+ +package exec + +var zeroBoolVec = make([]bool, ColBatchSize) + +// zeroBoolOp is an Operator that zeroes the bool column at colIdx in each non-empty batch from its input. +type zeroBoolOp struct { + input Operator + + colIdx int +} + +func (z zeroBoolOp) Next() ColBatch { + batch := z.input.Next() + if batch.Length() == 0 { + return batch + } + + copy(batch.ColVec(z.colIdx).Bool(), zeroBoolVec) + return batch +} + +func (zeroBoolOp) Init() {} From 7810171572e29284af5d2ed5f2014c3d1fb92306 Mon Sep 17 00:00:00 2001 From: Jordan Lewis Date: Mon, 22 Oct 2018 01:02:32 -0400 Subject: [PATCH 3/3] exec: add the int64 sorted distinct operator flow This operator runs sorted distinct over n columns, populating the selection vector with the indices of the tuples that are distinct in the input. This will be execgen templated later. ``` BenchmarkSortedDistinct-8 500000 3924 ns/op 6262.34 MB/s ``` Release note: None --- pkg/sql/exec/distinct.go | 157 ++++++++++++++++++++++++++++++++++ pkg/sql/exec/distinct_test.go | 137 +++++++++++++++++++++++++++++ 2 files changed, 294 insertions(+) create mode 100644 pkg/sql/exec/distinct.go create mode 100644 pkg/sql/exec/distinct_test.go diff --git a/pkg/sql/exec/distinct.go b/pkg/sql/exec/distinct.go new file mode 100644 index 000000000000..80f9553a2d96 --- /dev/null +++ b/pkg/sql/exec/distinct.go @@ -0,0 +1,157 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+ +package exec + +// sortedDistinctInt64Op runs a distinct on the column in sortedDistinctCol, +// writing true to the resultant bool column for every value that differs from +// the previous one. +type sortedDistinctInt64Op struct { + input Operator + + // sortedDistinctCol is the index of the column to distinct upon. + sortedDistinctCol int + + // outputColIdx is the index of the boolean output column in the input batch. + outputColIdx int + + // Set to true at runtime when we've seen the first row. Distinct always + // outputs the first row that it sees. + foundFirstRow bool + + // lastVal is the last value seen by the operator, so that the distincting + // still works across batch boundaries. + lastVal int64 // template +} + +var _ Operator = &sortedDistinctInt64Op{} + +func (p *sortedDistinctInt64Op) Init() {} + +func (p *sortedDistinctInt64Op) Next() ColBatch { + batch := p.input.Next() + if batch.Length() == 0 { + return batch + } + outputCol := batch.ColVec(p.outputColIdx).Bool() + col := batch.ColVec(p.sortedDistinctCol).Int64() + + // We always output the first row. + lastVal := p.lastVal + sel := batch.Selection() + if !p.foundFirstRow { + if sel != nil { + lastVal = col[sel[0]] + outputCol[sel[0]] = true + } else { + lastVal = col[0] + outputCol[0] = true + } + } + + startIdx := uint16(0) + if !p.foundFirstRow { + startIdx = 1 + } + + n := batch.Length() + if sel != nil { + // Bounds check elimination. + sel = sel[startIdx:n] + for _, i := range sel { + v := col[i] + // Note that not inlining this unique var actually makes a non-trivial + // performance difference. + unique := v != lastVal + outputCol[i] = outputCol[i] || unique + lastVal = v + } + } else { + // Bounds check elimination. + col = col[startIdx:n] + outputCol = outputCol[startIdx:n] + for i := range col { + v := col[i] + // Note that not inlining this unique var actually makes a non-trivial + // performance difference. 
+ unique := v != lastVal + outputCol[i] = outputCol[i] || unique + lastVal = v + } + } + + p.lastVal = lastVal + p.foundFirstRow = true + + return batch +} + +// This finalizer op transforms the vector in outputColIdx to the selection +// vector, by adding an index to the selection for each true value in the column +// at outputColIdx. +type sortedDistinctFinalizerOp struct { + input Operator + + // outputColIdx is the index of the boolean output column from previous + // distinct ops in the input batch. + outputColIdx int +} + +var _ Operator = &sortedDistinctFinalizerOp{} + +func (p *sortedDistinctFinalizerOp) Next() ColBatch { + // Loop until we have non-zero amount of output to return, or our input's been + // exhausted. + for { + batch := p.input.Next() + if batch.Length() == 0 { + return batch + } + outputCol := batch.ColVec(p.outputColIdx).Bool() + + // Convert outputCol to a selection vector by outputting the index of each + // tuple whose outputCol value is true. + // Note that, if the input already had a selection vector, the output + // selection vector will be a subset of the input selection vector. + idx := uint16(0) + n := batch.Length() + if sel := batch.Selection(); sel != nil { + for s := uint16(0); s < n; s++ { + i := sel[s] + if outputCol[i] { + sel[idx] = i + idx++ + } + } + } else { + batch.SetSelection(true) + sel := batch.Selection() + for i := uint16(0); i < n; i++ { + if outputCol[i] { + sel[idx] = i + idx++ + } + } + } + + if idx == 0 { + continue + } + + batch.SetLength(idx) + return batch + } +} + +func (p *sortedDistinctFinalizerOp) Init() {} diff --git a/pkg/sql/exec/distinct_test.go b/pkg/sql/exec/distinct_test.go new file mode 100644 index 000000000000..3cdae698d474 --- /dev/null +++ b/pkg/sql/exec/distinct_test.go @@ -0,0 +1,137 @@ +// Copyright 2018 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package exec + +import ( + "testing" + + "github.com/cockroachdb/cockroach/pkg/util/randutil" + + "github.com/cockroachdb/cockroach/pkg/sql/exec/types" +) + +func TestSortedDistinct(t *testing.T) { + tcs := []struct { + distinctCols []int + numCols int + tuples []tuple + expected []tuple + }{ + { + distinctCols: []int{0, 1, 2}, + numCols: 4, + tuples: tuples{ + {1, 2, 3, 4}, + {1, 2, 3, 5}, + {2, 2, 3, 4}, + {2, 3, 3, 4}, + {2, 3, 4, 4}, + {2, 3, 4, 4}, + }, + expected: tuples{ + {1, 2, 3, 4}, + {2, 2, 3, 4}, + {2, 3, 3, 4}, + {2, 3, 4, 4}, + }, + }, + } + + for _, tc := range tcs { + runTests(t, tc.tuples, []types.T{types.Bool}, func(t *testing.T, input Operator) { + zeroOp := &zeroBoolOp{ + input: input, + colIdx: tc.numCols, + } + zeroOp.Init() + + var lastOp Operator = zeroOp + for _, cIdx := range tc.distinctCols { + sdop := &sortedDistinctInt64Op{ + input: lastOp, + sortedDistinctCol: cIdx, + outputColIdx: tc.numCols, + } + sdop.Init() + lastOp = sdop + } + + finalizer := &sortedDistinctFinalizerOp{ + input: lastOp, + outputColIdx: tc.numCols, + } + out := newOpTestOutput(finalizer, []int{0, 1, 2, 3}, tc.expected) + + if err := out.Verify(); err != nil { + t.Fatal(err) + } + }) + } +} + +func BenchmarkSortedDistinct(b *testing.B) { + rng, _ := randutil.NewPseudoRand() + + batch := NewMemBatch([]types.T{types.Int64, types.Int64, types.Int64, types.Bool}) + aCol := batch.ColVec(1).Int64() + bCol := batch.ColVec(2).Int64() + lastA := int64(0) + lastB := int64(0) + for i := 0; i < ColBatchSize; i++ { + // 1/4 chance of changing each distinct 
col. + if rng.Float64() > 0.75 { + lastA++ + } + if rng.Float64() > 0.75 { + lastB++ + } + aCol[i] = lastA + bCol[i] = lastB + } + batch.SetLength(ColBatchSize) + source := newRepeatableBatchSource(batch) + source.Init() + + zeroOp := &zeroBoolOp{ + input: source, + colIdx: 3, + } + zeroOp.Init() + + sdop := &sortedDistinctInt64Op{ + sortedDistinctCol: 1, + outputColIdx: 3, + input: zeroOp, + } + sdop.Init() + + sdop = &sortedDistinctInt64Op{ + sortedDistinctCol: 2, + outputColIdx: 3, + input: sdop, + } + sdop.Init() + + finalizer := &sortedDistinctFinalizerOp{ + input: sdop, + outputColIdx: 3, + } + + // Don't count the artificial zeroOp'd bool column in the throughput; only the three 8-byte int64 columns are included. + b.SetBytes(int64(8 * ColBatchSize * 3)) + for i := 0; i < b.N; i++ { + finalizer.Next() + } +}