Skip to content

Commit

Permalink
Zip UDF fast path breaks with non-dense in order elements (#3564)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #3564

The fuzzer discovered an issue with the Zip UDF
#3542

I hadn't accounted for the fact that just because all of the arrays are flat and the same size, doesn't mean they all have the same offsets into their elements Vectors.

E.g. with 3 arrays of size 3, the offsets could be 0, 3, 6 or 6, 3, 0 or 0, 4, 7 (if there are unused elements)

When this is the case the fast path added to the Zip UDF produces incorrect results.

The fix is just to validate the offsets are the same in addition to the sizes.

Reviewed By: kagamiori

Differential Revision: D42183237

fbshipit-source-id: 5aa720d9fb10eaa777f186dcc08d056e639a8ee6
  • Loading branch information
Kevin Wilfong authored and facebook-github-bot committed Dec 21, 2022
1 parent bacce93 commit 797711e
Show file tree
Hide file tree
Showing 2 changed files with 184 additions and 5 deletions.
24 changes: 19 additions & 5 deletions velox/functions/prestosql/Zip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/
#include <boost/algorithm/string/join.hpp>
#include <limits>
#include "velox/expression/Expr.h"
#include "velox/expression/VectorFunction.h"
#include "velox/functions/lib/LambdaFunctionUtil.h"
Expand Down Expand Up @@ -88,15 +89,21 @@ class ZipFunction : public exec::VectorFunction {
// This is true if for all rows, all the arrays within a row are the same
// size.
bool allSameSize = true;
// This is true if for all rows, all the arrays within a row have the same
// starting offset in their elements Vector.
bool allSameOffsets = true;

// Determine what the size of the resultant elements will be so we can
// reserve enough space.
auto getMaxArraySize = [&](vector_size_t row) -> vector_size_t {
vector_size_t maxSize = 0;
vector_size_t offset = -1;
for (int i = 0; i < numInputArrays; i++) {
vector_size_t size = rawSizes[i][indices[i][row]];
allSameSize &= i == 0 || maxSize == size;
allSameOffsets &= i == 0 || offset == rawOffsets[i][indices[i][row]];
maxSize = std::max(maxSize, size);
offset = rawOffsets[i][indices[i][row]];
}
return maxSize;
};
Expand All @@ -110,7 +117,7 @@ class ZipFunction : public exec::VectorFunction {
rawResultArraySizes[row] = maxSize;
});

if (allSameSize) {
if (allSameSize && allSameOffsets) {
// This is true if all input vectors have the "flat" Array encoding.
bool allFlat = true;
for (const auto& arg : args) {
Expand All @@ -119,21 +126,28 @@ class ZipFunction : public exec::VectorFunction {

if (allFlat) {
// Fast path if all input Vectors are flat and for all rows, all arrays
// within a row are the same size. In this case we don't have to add
// nulls, or decode the arrays, we can just pass in the element Vectors
// as is to be the fields of the output Rows.
// within a row are the same size and start at the same offset. In this
// case we don't have to add nulls, or decode the arrays, we can just
// pass in the element Vectors as is to be the fields of the output
// Rows.
std::vector<VectorPtr> elements;
elements.reserve(args.size());
// Since the offsets and sizes are all the same, using the minimum size
// is big enough to contain all elements, while also guaranteeing all
// child Vectors in the RowVector are at least this big.
vector_size_t minElementsSize =
std::numeric_limits<vector_size_t>::max();
for (const auto& arg : args) {
elements.push_back(arg->as<ArrayVector>()->elements());
minElementsSize = std::min(minElementsSize, elements.back()->size());
}

auto rowType = outputType->childAt(0);
auto rowVector = std::make_shared<RowVector>(
pool,
rowType,
BufferPtr(nullptr),
resultElementsSize,
minElementsSize,
std::move(elements));

// Now convert these to an Array
Expand Down
165 changes: 165 additions & 0 deletions velox/functions/prestosql/tests/ZipTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,4 +251,169 @@ TEST_F(ZipTest, dictionaryArrays) {

assertEqualVectors(expected, result);
}

/// Test if we can zip two flat vectors of arrays with rows starting at
/// different offsets
TEST_F(ZipTest, flatArraysWithDifferentOffsets) {
auto firstElements = makeFlatVector<int32_t>({0, 1, 2, 3, 4, 5, 6, 7, 8});
auto firstVector = makeArrayVector({0, 3, 6}, firstElements);

// Use different offsets.
auto secondElements =
makeFlatVector<int32_t>({10, 11, 12, 13, 14, 15, 16, 17, 18});
const auto size = 3;
BufferPtr offsetsBuffer = allocateOffsets(size, pool_.get());
BufferPtr sizesBuffer = allocateSizes(size, pool_.get());
auto rawOffsets = offsetsBuffer->asMutable<vector_size_t>();
auto rawSizes = sizesBuffer->asMutable<vector_size_t>();

rawOffsets[0] = 6;
rawOffsets[1] = 3;
rawOffsets[2] = 0;

for (int i = 0; i < size; i++) {
rawSizes[i] = 3;
}

auto secondVector = std::make_shared<ArrayVector>(
pool_.get(),
ARRAY(INTEGER()),
nullptr,
size,
offsetsBuffer,
sizesBuffer,
secondElements);

auto result = evaluate<ArrayVector>(
"zip(c0, c1)",
makeRowVector({
firstVector,
secondVector,
}));

auto firstResult = makeFlatVector<int32_t>({0, 1, 2, 3, 4, 5, 6, 7, 8});
auto secondResult =
makeFlatVector<int32_t>({16, 17, 18, 13, 14, 15, 10, 11, 12});

auto rowVector = makeRowVector({firstResult, secondResult});

// create the expected ArrayVector
auto expected = makeArrayVector({0, 3, 6}, rowVector);

assertEqualVectors(expected, result);
}

/// Test if we can zip two flat vectors of arrays with overlapping ranges of
/// elements.
TEST_F(ZipTest, flatArraysWithOverlappingRanges) {
const auto size = 3;
BufferPtr offsetsBuffer = allocateOffsets(size, pool_.get());
BufferPtr sizesBuffer = allocateSizes(size, pool_.get());
auto rawOffsets = offsetsBuffer->asMutable<vector_size_t>();
auto rawSizes = sizesBuffer->asMutable<vector_size_t>();

rawOffsets[0] = 0;
rawOffsets[1] = 1;
rawOffsets[2] = 2;

for (int i = 0; i < size; i++) {
rawSizes[i] = 3;
}

auto firstElements = makeFlatVector<int32_t>({0, 1, 2, 3, 4});
auto firstVector = std::make_shared<ArrayVector>(
pool_.get(),
ARRAY(INTEGER()),
nullptr,
size,
offsetsBuffer,
sizesBuffer,
firstElements);

auto secondElements = makeFlatVector<int32_t>({10, 11, 12, 13, 14});
auto secondVector = std::make_shared<ArrayVector>(
pool_.get(),
ARRAY(INTEGER()),
nullptr,
size,
offsetsBuffer,
sizesBuffer,
secondElements);

auto result = evaluate<ArrayVector>(
"zip(c0, c1)",
makeRowVector({
firstVector,
secondVector,
}));

auto firstResult = makeFlatVector<int32_t>({0, 1, 2, 1, 2, 3, 2, 3, 4});
auto secondResult =
makeFlatVector<int32_t>({10, 11, 12, 11, 12, 13, 12, 13, 14});

auto rowVector = makeRowVector({firstResult, secondResult});

// create the expected ArrayVector
auto expected = makeArrayVector({0, 3, 6}, rowVector);

assertEqualVectors(expected, result);
}

/// Test if we can zip two flat vectors of arrays with a gap in the range of
/// elements.
TEST_F(ZipTest, flatArraysWithGapInElements) {
const auto size = 3;
BufferPtr offsetsBuffer = allocateOffsets(size, pool_.get());
BufferPtr sizesBuffer = allocateSizes(size, pool_.get());
auto rawOffsets = offsetsBuffer->asMutable<vector_size_t>();
auto rawSizes = sizesBuffer->asMutable<vector_size_t>();

rawOffsets[0] = 0;
rawOffsets[1] = 6;
rawOffsets[2] = 9;

for (int i = 0; i < size; i++) {
rawSizes[i] = 3;
}

auto firstElements =
makeFlatVector<int32_t>({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
auto firstVector = std::make_shared<ArrayVector>(
pool_.get(),
ARRAY(INTEGER()),
nullptr,
size,
offsetsBuffer,
sizesBuffer,
firstElements);

auto secondElements =
makeFlatVector<int32_t>({10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21});
auto secondVector = std::make_shared<ArrayVector>(
pool_.get(),
ARRAY(INTEGER()),
nullptr,
size,
offsetsBuffer,
sizesBuffer,
secondElements);

auto result = evaluate<ArrayVector>(
"zip(c0, c1)",
makeRowVector({
firstVector,
secondVector,
}));

auto firstResult = makeFlatVector<int32_t>({0, 1, 2, 6, 7, 8, 9, 10, 11});
auto secondResult =
makeFlatVector<int32_t>({10, 11, 12, 16, 17, 18, 19, 20, 21});

auto rowVector = makeRowVector({firstResult, secondResult});

// create the expected ArrayVector
auto expected = makeArrayVector({0, 3, 6}, rowVector);

assertEqualVectors(expected, result);
}
} // namespace

0 comments on commit 797711e

Please sign in to comment.