-
Notifications
You must be signed in to change notification settings - Fork 3.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
exec: add the int64 sorted distinct operator flow
This operator runs sorted distinct over n columns, populating the selection vector with the indices of the tuples that are distinct in the input. This will be execgen templated later.

```
BenchmarkSortedDistinct-8 500000 3924 ns/op 6262.34 MB/s
```

Release note: None
- Loading branch information
1 parent
1f42429
commit 7810171
Showing
2 changed files
with
294 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
// Copyright 2018 The Cockroach Authors. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
// implied. See the License for the specific language governing | ||
// permissions and limitations under the License. | ||
|
||
package exec | ||
|
||
// sortedDistinctInt64Op runs a distinct on the column in sortedDistinctCol, | ||
// writing true to the resultant bool column for every value that differs from | ||
// the previous one. | ||
type sortedDistinctInt64Op struct { | ||
input Operator | ||
|
||
// sortedDistinctCol is the index of the column to distinct upon. | ||
sortedDistinctCol int | ||
|
||
// outputColIdx is the index of the boolean output column in the input batch. | ||
outputColIdx int | ||
|
||
// Set to true at runtime when we've seen the first row. Distinct always | ||
// outputs the first row that it sees. | ||
foundFirstRow bool | ||
|
||
// lastVal is the last value seen by the operator, so that the distincting | ||
// still works across batch boundaries. | ||
lastVal int64 // template | ||
} | ||
|
||
var _ Operator = &sortedDistinctInt64Op{} | ||
|
||
func (p *sortedDistinctInt64Op) Init() {} | ||
|
||
func (p *sortedDistinctInt64Op) Next() ColBatch { | ||
batch := p.input.Next() | ||
if batch.Length() == 0 { | ||
return batch | ||
} | ||
outputCol := batch.ColVec(p.outputColIdx).Bool() | ||
col := batch.ColVec(p.sortedDistinctCol).Int64() | ||
|
||
// We always output the first row. | ||
lastVal := p.lastVal | ||
sel := batch.Selection() | ||
if !p.foundFirstRow { | ||
if sel != nil { | ||
lastVal = col[sel[0]] | ||
outputCol[sel[0]] = true | ||
} else { | ||
lastVal = col[0] | ||
outputCol[0] = true | ||
} | ||
} | ||
|
||
startIdx := uint16(0) | ||
if !p.foundFirstRow { | ||
startIdx = 1 | ||
} | ||
|
||
n := batch.Length() | ||
if sel != nil { | ||
// Bounds check elimination. | ||
sel = sel[startIdx:n] | ||
for _, i := range sel { | ||
v := col[i] | ||
// Note that not inlining this unique var actually makes a non-trivial | ||
// performance difference. | ||
unique := v != lastVal | ||
outputCol[i] = outputCol[i] || unique | ||
lastVal = v | ||
} | ||
} else { | ||
// Bounds check elimination. | ||
col = col[startIdx:n] | ||
outputCol = outputCol[startIdx:n] | ||
for i := range col { | ||
v := col[i] | ||
// Note that not inlining this unique var actually makes a non-trivial | ||
// performance difference. | ||
unique := v != lastVal | ||
outputCol[i] = outputCol[i] || unique | ||
lastVal = v | ||
} | ||
} | ||
|
||
p.lastVal = lastVal | ||
p.foundFirstRow = true | ||
|
||
return batch | ||
} | ||
|
||
// This finalizer op transforms the vector in outputColIdx to the selection | ||
// vector, by adding an index to the selection for each true value in the column | ||
// at outputColIdx. | ||
type sortedDistinctFinalizerOp struct { | ||
input Operator | ||
|
||
// outputColIdx is the index of the boolean output column from previous | ||
// distinct ops in the input batch. | ||
outputColIdx int | ||
} | ||
|
||
var _ Operator = &sortedDistinctFinalizerOp{} | ||
|
||
func (p *sortedDistinctFinalizerOp) Next() ColBatch { | ||
// Loop until we have non-zero amount of output to return, or our input's been | ||
// exhausted. | ||
for { | ||
batch := p.input.Next() | ||
if batch.Length() == 0 { | ||
return batch | ||
} | ||
outputCol := batch.ColVec(p.outputColIdx).Bool() | ||
|
||
// Convert outputCol to a selection vector by outputting the index of each | ||
// tuple whose outputCol value is true. | ||
// Note that, if the input already had a selection vector, the output | ||
// selection vector will be a subset of the input selection vector. | ||
idx := uint16(0) | ||
n := batch.Length() | ||
if sel := batch.Selection(); sel != nil { | ||
for s := uint16(0); s < n; s++ { | ||
i := sel[s] | ||
if outputCol[i] { | ||
sel[idx] = i | ||
idx++ | ||
} | ||
} | ||
} else { | ||
batch.SetSelection(true) | ||
sel := batch.Selection() | ||
for i := uint16(0); i < n; i++ { | ||
if outputCol[i] { | ||
sel[idx] = i | ||
idx++ | ||
} | ||
} | ||
} | ||
|
||
if idx == 0 { | ||
continue | ||
} | ||
|
||
batch.SetLength(idx) | ||
return batch | ||
} | ||
} | ||
|
||
func (p *sortedDistinctFinalizerOp) Init() {} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
// Copyright 2018 The Cockroach Authors. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | ||
// implied. See the License for the specific language governing | ||
// permissions and limitations under the License. | ||
|
||
package exec | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/util/randutil" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/sql/exec/types" | ||
) | ||
|
||
func TestSortedDistinct(t *testing.T) { | ||
tcs := []struct { | ||
distinctCols []int | ||
numCols int | ||
tuples []tuple | ||
expected []tuple | ||
}{ | ||
{ | ||
distinctCols: []int{0, 1, 2}, | ||
numCols: 4, | ||
tuples: tuples{ | ||
{1, 2, 3, 4}, | ||
{1, 2, 3, 5}, | ||
{2, 2, 3, 4}, | ||
{2, 3, 3, 4}, | ||
{2, 3, 4, 4}, | ||
{2, 3, 4, 4}, | ||
}, | ||
expected: tuples{ | ||
{1, 2, 3, 4}, | ||
{2, 2, 3, 4}, | ||
{2, 3, 3, 4}, | ||
{2, 3, 4, 4}, | ||
}, | ||
}, | ||
} | ||
|
||
for _, tc := range tcs { | ||
runTests(t, tc.tuples, []types.T{types.Bool}, func(t *testing.T, input Operator) { | ||
zeroOp := &zeroBoolOp{ | ||
input: input, | ||
colIdx: tc.numCols, | ||
} | ||
zeroOp.Init() | ||
|
||
var lastOp Operator = zeroOp | ||
for _, cIdx := range tc.distinctCols { | ||
sdop := &sortedDistinctInt64Op{ | ||
input: lastOp, | ||
sortedDistinctCol: cIdx, | ||
outputColIdx: tc.numCols, | ||
} | ||
sdop.Init() | ||
lastOp = sdop | ||
} | ||
|
||
finalizer := &sortedDistinctFinalizerOp{ | ||
input: lastOp, | ||
outputColIdx: tc.numCols, | ||
} | ||
out := newOpTestOutput(finalizer, []int{0, 1, 2, 3}, tc.expected) | ||
|
||
if err := out.Verify(); err != nil { | ||
t.Fatal(err) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func BenchmarkSortedDistinct(b *testing.B) { | ||
rng, _ := randutil.NewPseudoRand() | ||
|
||
batch := NewMemBatch([]types.T{types.Int64, types.Int64, types.Int64, types.Bool}) | ||
aCol := batch.ColVec(1).Int64() | ||
bCol := batch.ColVec(2).Int64() | ||
lastA := int64(0) | ||
lastB := int64(0) | ||
for i := 0; i < ColBatchSize; i++ { | ||
// 1/4 chance of changing each distinct col. | ||
if rng.Float64() > 0.75 { | ||
lastA++ | ||
} | ||
if rng.Float64() > 0.75 { | ||
lastB++ | ||
} | ||
aCol[i] = lastA | ||
bCol[i] = lastB | ||
} | ||
batch.SetLength(ColBatchSize) | ||
source := newRepeatableBatchSource(batch) | ||
source.Init() | ||
|
||
zeroOp := &zeroBoolOp{ | ||
input: source, | ||
colIdx: 3, | ||
} | ||
zeroOp.Init() | ||
|
||
sdop := &sortedDistinctInt64Op{ | ||
sortedDistinctCol: 1, | ||
outputColIdx: 3, | ||
input: zeroOp, | ||
} | ||
sdop.Init() | ||
|
||
sdop = &sortedDistinctInt64Op{ | ||
sortedDistinctCol: 2, | ||
outputColIdx: 3, | ||
input: sdop, | ||
} | ||
sdop.Init() | ||
|
||
finalizer := &sortedDistinctFinalizerOp{ | ||
input: sdop, | ||
outputColIdx: 3, | ||
} | ||
|
||
// don't count the artificial zeroOp'd column in the throughput | ||
b.SetBytes(int64(8 * ColBatchSize * 3)) | ||
for i := 0; i < b.N; i++ { | ||
finalizer.Next() | ||
} | ||
} |