Skip to content

Commit

Permalink
refactor shuffle unit tests; replace with DecodedArgs
Browse files Browse the repository at this point in the history
  • Loading branch information
darrenfu committed Jan 19, 2023
1 parent cc0cbd7 commit 2fb6732
Show file tree
Hide file tree
Showing 2 changed files with 218 additions and 207 deletions.
34 changes: 24 additions & 10 deletions velox/functions/prestosql/ArrayShuffle.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,21 @@ namespace facebook::velox::functions {
namespace {
// See documentation at
// https://prestodb.io/docs/current/functions/array.html#shuffle
//
// This function will shuffle identical arrays independently, i.e. even when
// the input has duplicate rows represented using constant and dictionary
// encoding, the output is flat and likely yields different values.
//
// E.g.1: constant encoding
// Input: ConstantVector(base=ArrayVector[{1,2,3}], length=3, index=0)
// Possible Output: ArrayVector[{1,3,2},{2,3,1},{3,2,1}]
//
// E.g.2: dict encoding
// Input: DictionaryVector(
// dictionaryValues=ArrayVector[{1,2,3},{4,5},{1,2,3}],
// dictionaryIndices=[1,2,0])
// Possible Output: ArrayVector[{5,4},{2,1,3},{1,3,2}]
//
class ArrayShuffleFunction : public exec::VectorFunction {
public:
bool isDeterministic() const override {
Expand All @@ -37,19 +52,18 @@ class ArrayShuffleFunction : public exec::VectorFunction {
VectorPtr& result) const override {
VELOX_CHECK_EQ(args.size(), 1);

// For deterministic single-arg functions, expression evaluation will peel
// off encodings and guarantee that we will only see flat or constant
// inputs. But this guarantee does not exist for non-deterministic
// functions like shuffle(). Hence, we need to use DecodedVector to handle
// all the possible dict encodings including constant, flat or dictionary.
exec::LocalDecodedVector localDecodedVector(context, *args[0], rows);
const auto& decodedArray = *localDecodedVector.get();
auto arrayVector = decodedArray.base()->as<ArrayVector>();
// This is a non-deterministic function, which violates the guarantee on a
// deterministic single-arg function that the expression evaluation will
// peel off encodings, and we will only see flat or constant inputs. Hence,
// we need to use DecodedVector to handle ALL encodings.
exec::DecodedArgs decodedArgs(rows, args, context);
auto decodedArg = decodedArgs.at(0);
auto arrayVector = decodedArg->base()->as<ArrayVector>();
auto elementsVector = arrayVector->elements();

vector_size_t numElements = 0;
context.applyToSelectedNoThrow(rows, [&](auto row) {
const auto size = arrayVector->sizeAt(decodedArray.index(row));
const auto size = arrayVector->sizeAt(decodedArg->index(row));
numElements += size;
});

Expand All @@ -65,7 +79,7 @@ class ArrayShuffleFunction : public exec::VectorFunction {
vector_size_t newOffset = 0;
std::mt19937 randGen(std::random_device{}());
context.applyToSelectedNoThrow(rows, [&](auto row) {
vector_size_t arrayRow = decodedArray.index(row);
vector_size_t arrayRow = decodedArg->index(row);
vector_size_t size = arrayVector->sizeAt(arrayRow);
vector_size_t offset = arrayVector->offsetAt(arrayRow);

Expand Down
Loading

0 comments on commit 2fb6732

Please sign in to comment.