Skip to content

Commit

Permalink
exec: add the int64 sorted distinct operator flow
Browse files Browse the repository at this point in the history
This operator runs sorted distinct over n columns, populating the
selection vector with the indices of the tuples that are distinct in the
input.

This will be execgen templated later.

```
BenchmarkSortedDistinct-8         500000              3924 ns/op        6262.34 MB/s
```

Release note: None
  • Loading branch information
jordanlewis committed Oct 23, 2018
1 parent 1f42429 commit 7810171
Show file tree
Hide file tree
Showing 2 changed files with 294 additions and 0 deletions.
157 changes: 157 additions & 0 deletions pkg/sql/exec/distinct.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
// Copyright 2018 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

package exec

// sortedDistinctInt64Op runs a distinct on the column in sortedDistinctCol,
// writing true to the resultant bool column for every value that differs from
// the previous one.
//
// The operator only ever ORs into the output column: entries that are already
// true are left true, so several of these operators can be chained over
// different columns while sharing a single output column. The operator never
// writes false itself.
type sortedDistinctInt64Op struct {
// input is the operator whose batches are read on each call to Next.
input Operator

// sortedDistinctCol is the index of the column to distinct upon.
sortedDistinctCol int

// outputColIdx is the index of the boolean output column in the input batch.
outputColIdx int

// Set to true at runtime when we've seen the first row. Distinct always
// outputs the first row that it sees. It is flipped after the first
// non-empty batch has been processed and never reset.
foundFirstRow bool

// lastVal is the last value seen by the operator, so that the distincting
// still works across batch boundaries. It is only meaningful once
// foundFirstRow is true.
lastVal int64 // template
}

// Compile-time assertion that *sortedDistinctInt64Op satisfies Operator.
var _ Operator = (*sortedDistinctInt64Op)(nil)

// Init is a no-op: the zero values of foundFirstRow and lastVal are already
// the correct starting state. It does not initialize the input operator.
func (p *sortedDistinctInt64Op) Init() {
}

// Next pulls the next batch from the input and marks, in the boolean column at
// outputColIdx, every selected tuple whose value in sortedDistinctCol differs
// from the value of the tuple before it. The very first tuple the operator
// ever sees is always marked. Marks are ORed into the column, so entries that
// were already true stay true. The batch itself (including its selection
// vector and length) is returned unchanged; a zero-length batch is passed
// through untouched.
func (p *sortedDistinctInt64Op) Next() ColBatch {
	batch := p.input.Next()
	n := batch.Length()
	if n == 0 {
		return batch
	}
	marks := batch.ColVec(p.outputColIdx).Bool()
	vals := batch.ColVec(p.sortedDistinctCol).Int64()
	sel := batch.Selection()

	// prev carries the previously seen value; it starts from the value
	// remembered at the end of the prior batch.
	prev := p.lastVal
	var start uint16
	if !p.foundFirstRow {
		// The first row ever seen is unconditionally distinct. Seed prev from
		// it and begin the comparison loop at the second tuple.
		first := uint16(0)
		if sel != nil {
			first = sel[0]
		}
		prev = vals[first]
		marks[first] = true
		start = 1
	}

	if sel != nil {
		// Slicing up front is for bounds check elimination.
		for _, i := range sel[start:n] {
			cur := vals[i]
			// Keeping this comparison in its own variable (rather than
			// inlining it below) makes a non-trivial performance difference.
			unique := cur != prev
			marks[i] = marks[i] || unique
			prev = cur
		}
	} else {
		// Slicing up front is for bounds check elimination.
		vals = vals[start:n]
		marks = marks[start:n]
		for i := range vals {
			cur := vals[i]
			// Keeping this comparison in its own variable (rather than
			// inlining it below) makes a non-trivial performance difference.
			unique := cur != prev
			marks[i] = marks[i] || unique
			prev = cur
		}
	}

	// Remember where we left off so the next batch compares against it.
	p.lastVal = prev
	p.foundFirstRow = true

	return batch
}

// sortedDistinctFinalizerOp transforms the vector in outputColIdx to the
// selection vector, by adding an index to the selection for each true value in
// the column at outputColIdx. Batches in which no tuple is marked true are
// dropped and the next input batch is tried instead.
type sortedDistinctFinalizerOp struct {
// input is the operator whose batches are read on each call to Next.
input Operator

// outputColIdx is the index of the boolean output column from previous
// distinct ops in the input batch.
outputColIdx int
}

// Compile-time assertion that *sortedDistinctFinalizerOp satisfies Operator.
var _ Operator = (*sortedDistinctFinalizerOp)(nil)

// Next returns the next input batch that has at least one tuple marked true in
// the column at outputColIdx, with its selection vector rewritten to contain
// exactly the indices of those tuples and its length set to their count.
// Input batches with no marked tuples are skipped. When the input returns a
// zero-length batch, that batch is returned as-is.
func (p *sortedDistinctFinalizerOp) Next() ColBatch {
	for {
		batch := p.input.Next()
		n := batch.Length()
		if n == 0 {
			// Input exhausted.
			return batch
		}
		marks := batch.ColVec(p.outputColIdx).Bool()

		// kept counts the tuples retained so far and doubles as the write
		// position in the selection vector.
		var kept uint16
		sel := batch.Selection()
		if sel == nil {
			// No selection vector yet: turn one on and fill it with the
			// positions of the marked tuples.
			batch.SetSelection(true)
			sel = batch.Selection()
			for i := uint16(0); i < n; i++ {
				if marks[i] {
					sel[kept] = i
					kept++
				}
			}
		} else {
			// Compact the existing selection vector in place, so the result
			// is a subset of the incoming selection. The write position never
			// passes the read position, so no unread entry is overwritten.
			for pos := uint16(0); pos < n; pos++ {
				i := sel[pos]
				if marks[i] {
					sel[kept] = i
					kept++
				}
			}
		}

		if kept != 0 {
			batch.SetLength(kept)
			return batch
		}
		// Nothing survived in this batch; pull another one.
	}
}

// Init is a no-op: the finalizer keeps no state between calls to Next. It does
// not initialize the input operator.
func (p *sortedDistinctFinalizerOp) Init() {
}
137 changes: 137 additions & 0 deletions pkg/sql/exec/distinct_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
// Copyright 2018 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

package exec

import (
"testing"

"github.com/cockroachdb/cockroach/pkg/util/randutil"

"github.com/cockroachdb/cockroach/pkg/sql/exec/types"
)

// TestSortedDistinct chains one sortedDistinctInt64Op per distinct column
// behind a zeroBoolOp, finishes the chain with a sortedDistinctFinalizerOp,
// and checks that only the first tuple of every run of equal distinct-column
// values is emitted.
func TestSortedDistinct(t *testing.T) {
	type testCase struct {
		distinctCols []int
		numCols      int
		rows         []tuple
		want         []tuple
	}

	cases := []testCase{
		{
			distinctCols: []int{0, 1, 2},
			numCols:      4,
			rows: tuples{
				{1, 2, 3, 4},
				{1, 2, 3, 5},
				{2, 2, 3, 4},
				{2, 3, 3, 4},
				{2, 3, 4, 4},
				{2, 3, 4, 4},
			},
			want: tuples{
				{1, 2, 3, 4},
				{2, 2, 3, 4},
				{2, 3, 3, 4},
				{2, 3, 4, 4},
			},
		},
	}

	for _, tc := range cases {
		runTests(t, tc.rows, []types.T{types.Bool}, func(t *testing.T, input Operator) {
			// The boolean scratch column sits right after the data columns
			// and is handled by a zeroBoolOp ahead of the distinct operators.
			zeroer := &zeroBoolOp{
				input:  input,
				colIdx: tc.numCols,
			}
			zeroer.Init()

			// Stack one distinct operator per column; all of them share the
			// same boolean output column.
			var chain Operator = zeroer
			for _, col := range tc.distinctCols {
				distinct := &sortedDistinctInt64Op{
					input:             chain,
					sortedDistinctCol: col,
					outputColIdx:      tc.numCols,
				}
				distinct.Init()
				chain = distinct
			}

			finalizer := &sortedDistinctFinalizerOp{
				input:        chain,
				outputColIdx: tc.numCols,
			}
			out := newOpTestOutput(finalizer, []int{0, 1, 2, 3}, tc.want)

			err := out.Verify()
			if err != nil {
				t.Fatal(err)
			}
		})
	}
}

// BenchmarkSortedDistinct measures the throughput of two chained
// sortedDistinctInt64Op operators followed by a sortedDistinctFinalizerOp,
// fed by a repeatable source that replays one full batch of three int64
// columns (columns 1 and 2 are the distinct columns) plus a boolean scratch
// column.
func BenchmarkSortedDistinct(b *testing.B) {
	// The second return value is not needed here (presumably the seed that
	// was used — confirm against randutil).
	rng, _ := randutil.NewPseudoRand()

	batch := NewMemBatch([]types.T{types.Int64, types.Int64, types.Int64, types.Bool})
	aCol := batch.ColVec(1).Int64()
	bCol := batch.ColVec(2).Int64()
	lastA := int64(0)
	lastB := int64(0)
	for i := 0; i < ColBatchSize; i++ {
		// 1/4 chance of changing each distinct col.
		if rng.Float64() > 0.75 {
			lastA++
		}
		if rng.Float64() > 0.75 {
			lastB++
		}
		aCol[i] = lastA
		bCol[i] = lastB
	}
	batch.SetLength(ColBatchSize)
	source := newRepeatableBatchSource(batch)
	source.Init()

	zeroOp := &zeroBoolOp{
		input:  source,
		colIdx: 3,
	}
	zeroOp.Init()

	sdop := &sortedDistinctInt64Op{
		sortedDistinctCol: 1,
		outputColIdx:      3,
		input:             zeroOp,
	}
	sdop.Init()

	sdop = &sortedDistinctInt64Op{
		sortedDistinctCol: 2,
		outputColIdx:      3,
		input:             sdop,
	}
	sdop.Init()

	finalizer := &sortedDistinctFinalizerOp{
		input:        sdop,
		outputColIdx: 3,
	}
	// Initialize the finalizer like every other operator in the chain.
	finalizer.Init()

	// don't count the artificial zeroOp'd column in the throughput
	b.SetBytes(int64(8 * ColBatchSize * 3))
	// Exclude the batch generation and operator construction above from the
	// measured time.
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		finalizer.Next()
	}
}

0 comments on commit 7810171

Please sign in to comment.