diff --git a/velox/docs/functions/array.rst b/velox/docs/functions/array.rst index 6fef18ed9827..519133fe67c2 100644 --- a/velox/docs/functions/array.rst +++ b/velox/docs/functions/array.rst @@ -156,6 +156,14 @@ Array Functions Returns an array which has the reversed order of the input array. +.. function:: shuffle(array(E)) -> array(E) + + Generate a random permutation of the given ``array``:: + + SELECT shuffle(ARRAY [1, 2, 3]); -- [3, 1, 2] or any other random permutation + SELECT shuffle(ARRAY [0, 0, 0]); -- [0, 0, 0] + SELECT shuffle(ARRAY [1, NULL, 1, NULL, 2]); -- [2, NULL, NULL, NULL, 1] or any other random permutation + .. function:: slice(array(E), start, length) -> array(E) Returns a subarray starting from index ``start``(or starting from the end diff --git a/velox/functions/prestosql/ArrayShuffle.cpp b/velox/functions/prestosql/ArrayShuffle.cpp new file mode 100644 index 000000000000..0736ddde9bc7 --- /dev/null +++ b/velox/functions/prestosql/ArrayShuffle.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include "velox/expression/EvalCtx.h" +#include "velox/expression/Expr.h" +#include "velox/expression/VectorFunction.h" + +namespace facebook::velox::functions { +namespace { +// See documentation at +// https://prestodb.io/docs/current/functions/array.html#shuffle +// +// This function will shuffle identical arrays independently, i.e. even when +// the input has duplicate rows represented using constant and dictionary +// encoding, the output is flat and likely yields different values. +// +// E.g.1: constant encoding +// Input: ConstantVector(base=ArrayVector[{1,2,3}], length=3, index=0) +// Possible Output: ArrayVector[{1,3,2},{2,3,1},{3,2,1}] +// +// E.g.2: dict encoding +// Input: DictionaryVector( +// dictionaryValues=ArrayVector[{1,2,3},{4,5},{1,2,3}], +// dictionaryIndices=[1,2,0]) +// Possible Output: ArrayVector[{5,4},{2,1,3},{1,3,2}] +// +class ArrayShuffleFunction : public exec::VectorFunction { + public: + bool isDeterministic() const override { + return false; + } + + void apply( + const SelectivityVector& rows, + std::vector& args, + const TypePtr& /*outputType*/, + exec::EvalCtx& context, + VectorPtr& result) const override { + VELOX_CHECK_EQ(args.size(), 1); + + // This is a non-deterministic function, which violates the guarantee on a + // deterministic single-arg function that the expression evaluation will + // peel off encodings, and we will only see flat or constant inputs. Hence, + // we need to use DecodedVector to handle ALL encodings. + exec::DecodedArgs decodedArgs(rows, args, context); + auto decodedArg = decodedArgs.at(0); + auto arrayVector = decodedArg->base()->as(); + auto elementsVector = arrayVector->elements(); + + vector_size_t numElements = 0; + context.applyToSelectedNoThrow(rows, [&](auto row) { + const auto size = arrayVector->sizeAt(decodedArg->index(row)); + numElements += size; + }); + + // Allocate new buffer to hold shuffled indices. + BufferPtr shuffledIndices = allocateIndices(numElements, context.pool()); + BufferPtr offsets = allocateOffsets(rows.size(), context.pool()); + BufferPtr sizes = allocateSizes(rows.size(), context.pool()); + + vector_size_t* rawIndices = shuffledIndices->asMutable(); + vector_size_t* rawOffsets = offsets->asMutable(); + vector_size_t* rawSizes = sizes->asMutable(); + + vector_size_t newOffset = 0; + std::mt19937 randGen(std::random_device{}()); + context.applyToSelectedNoThrow(rows, [&](auto row) { + vector_size_t arrayRow = decodedArg->index(row); + vector_size_t size = arrayVector->sizeAt(arrayRow); + vector_size_t offset = arrayVector->offsetAt(arrayRow); + + std::iota(rawIndices + newOffset, rawIndices + newOffset + size, offset); + std::shuffle( + rawIndices + newOffset, rawIndices + newOffset + size, randGen); + + rawSizes[row] = size; + rawOffsets[row] = newOffset; + newOffset += size; + }); + + auto resultElements = BaseVector::wrapInDictionary( + nullptr, shuffledIndices, numElements, elementsVector); + auto localResult = std::make_shared( + context.pool(), + arrayVector->type(), + nullptr, + rows.size(), + std::move(offsets), + std::move(sizes), + std::move(resultElements)); + + context.moveOrCopyResult(localResult, rows, result); + } +}; + +std::vector> signatures() { + return {// array(T) -> array(T) + exec::FunctionSignatureBuilder() + .typeVariable("T") + .returnType("array(T)") + .argumentType("array(T)") + .build()}; +} + +} // namespace + +// Register function. +VELOX_DECLARE_VECTOR_FUNCTION( + udf_array_shuffle, + signatures(), + std::make_unique()); +} // namespace facebook::velox::functions diff --git a/velox/functions/prestosql/CMakeLists.txt b/velox/functions/prestosql/CMakeLists.txt index 7f40ecd77948..7bb46d693199 100644 --- a/velox/functions/prestosql/CMakeLists.txt +++ b/velox/functions/prestosql/CMakeLists.txt @@ -23,6 +23,7 @@ add_library( ArrayDuplicates.cpp ArrayIntersectExcept.cpp ArrayPosition.cpp + ArrayShuffle.cpp ArraySort.cpp ArraySum.cpp Comparisons.cpp diff --git a/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp b/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp index 03739bc0b974..c0c0ab185983 100644 --- a/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/ArrayFunctionsRegistration.cpp @@ -71,6 +71,7 @@ void registerArrayFunctions() { VELOX_REGISTER_VECTOR_FUNCTION(udf_zip, "zip"); VELOX_REGISTER_VECTOR_FUNCTION(udf_zip_with, "zip_with"); VELOX_REGISTER_VECTOR_FUNCTION(udf_array_position, "array_position"); + VELOX_REGISTER_VECTOR_FUNCTION(udf_array_shuffle, "shuffle"); VELOX_REGISTER_VECTOR_FUNCTION(udf_array_sort, "array_sort"); VELOX_REGISTER_VECTOR_FUNCTION(udf_array_sum, "array_sum"); VELOX_REGISTER_VECTOR_FUNCTION(udf_repeat, "repeat"); diff --git a/velox/functions/prestosql/tests/ArrayShuffleTest.cpp b/velox/functions/prestosql/tests/ArrayShuffleTest.cpp new file mode 100644 index 000000000000..a30ef8a96237 --- /dev/null +++ b/velox/functions/prestosql/tests/ArrayShuffleTest.cpp @@ -0,0 +1,297 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include "velox/expression/VectorReaders.h" +#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h" + +using namespace facebook::velox; +using namespace facebook::velox::test; +using namespace facebook::velox::functions::test; + +namespace { +template +std::string printArray(const std::vector>& input) { + std::stringstream out; + out << "["; + std::for_each(input.begin(), input.end(), [&out](const auto& e) { + if (e.has_value()) { + out << e.value(); + } else { + out << "NULL"; + } + out << ", "; + }); + out << "]"; + return out.str(); +} + +template +std::string printArray( + const std::vector>>>& input) { + std::stringstream out; + out << "["; + std::for_each(input.begin(), input.end(), [&out](const auto& e) { + if (e.has_value()) { + out << printArray(e.value()); + } else { + out << "NULL"; + } + out << ", "; + }); + out << "]"; + return out.str(); +} + +template +std::string printArray(const std::vector& input) { + std::stringstream out; + out << "["; + std::for_each( + input.begin(), input.end(), [&out](const auto& e) { out << e << ", "; }); + out << "]"; + return out.str(); +} +} // namespace + +namespace { +class ArrayShuffleTest : public FunctionBaseTest { + protected: + template + void testShuffle(const VectorPtr& input) { + DecodedVector decodedExpected(*input.get()); + exec::VectorReader> readerExpected(&decodedExpected); + + auto actualVector = + evaluate("shuffle(C0)", makeRowVector({input})); + // Validate each row from the actual decoded ArrayVector is a permutation + // of the corresponding row from the expected decoded ArrayVector. + DecodedVector decodedActual(*actualVector.get()); + exec::VectorReader> readerActual(&decodedActual); + + for (auto i = 0; i < input->size(); i++) { + // Must materialize into std::vector, otherwise it will throw error + // because ArrayView doesn't support std::is_permutation() yet. + auto actualArray = readerActual[i].materialize(); + auto expectedArray = readerExpected[i].materialize(); + + // Assert the two vectors contain the same elements (ignoring order). + ASSERT_TRUE(std::is_permutation( + actualArray.begin(), actualArray.end(), expectedArray.begin())) + << "Actual array " << printArray(actualArray) << " at " << i + << " must produce a permutation of expected array " + << printArray(expectedArray); + } + } + + /// To test shuffle's randomness, we need the following steps: + /// 1. Shuffle kNumShuffleTimes times with range-based input + /// {0, 1, ..., kNumDistinctValues-1}, and it should be at least + /// kUniquenessRate% times different in total. + /// 2. Verify that results are a permutation of the input, i.e. + /// no value is missing and no extra value is present. + /// NOTE: + /// The combination of tests above is a straightforward way of + /// verifying the shuffle's (1) randomness and (2) and correctness. + /// However, it doesn't guarantee (3) uniformity. + void testShuffleRandomness(VectorEncoding::Simple encoding) { + // Generate a range-based array N {0, 1, ..., n-1} as the input for + // test shuffle randomness purpose. + // 1. For flat encoding: we generate an ArrayVector with all identical + // elements (N) and its size equals to t. + // 2. For constant encoding: we generate a ConstantVector with + // its valueVector = N and its size equals to t. + // 3. For dict encoding: we generate a DictionaryVector with dictValues + // being an ArrayVector (with all identical elements: N and size=2t) and + // indices with size=2t (by duplicating the first half of N). + const int32_t kNumShuffleTimes = 100; + const int32_t kNumDistinctValues = 10; + const double kUniquenessRate = .7; + + std::vector inputData(kNumDistinctValues); + std::iota(inputData.begin(), inputData.end(), 0); + + VectorPtr inputVector; + switch (encoding) { + case VectorEncoding::Simple::FLAT: { + std::vector> flatData(kNumShuffleTimes); + std::fill(flatData.begin(), flatData.end(), inputData); + inputVector = makeArrayVector(flatData); + break; + } + case VectorEncoding::Simple::CONSTANT: { + auto valueVector = makeArrayVector({inputData}); + inputVector = + BaseVector::wrapInConstant(kNumShuffleTimes, 0, valueVector); + break; + } + case VectorEncoding::Simple::DICTIONARY: { + vector_size_t dictSize = kNumShuffleTimes * 2; + std::vector> baseData(dictSize); + std::fill(baseData.begin(), baseData.end(), inputData); + auto dictValues = makeArrayVector(baseData); + + std::vector indicesData(dictSize); + // Test duplicate indices. + std::iota( + indicesData.begin(), indicesData.begin() + kNumShuffleTimes, 0); + std::iota(indicesData.begin() + kNumShuffleTimes, indicesData.end(), 0); + auto indices = makeIndices(indicesData); + inputVector = wrapInDictionary(indices, dictValues); + break; + } + default: + VELOX_FAIL( + "Unsupported vector encoding: {}", + VectorEncoding::mapSimpleToName(encoding)); + } + + DecodedVector decodedExpected(*inputVector.get()); + exec::VectorReader> readerExpected(&decodedExpected); + + using materialize_t = + typename exec::ArrayView::materialize_t; + folly::F14FastSet distinctValueSet; + + auto actualVector = + evaluate("shuffle(C0)", makeRowVector({inputVector})); + + DecodedVector decodedActual(*actualVector.get()); + exec::VectorReader> readerActual(&decodedActual); + + for (auto i = 0; i < actualVector->size(); i++) { + auto actualArray = readerActual.readNullFree(i).materialize(); + auto expectedArray = readerExpected.readNullFree(i).materialize(); + + ASSERT_TRUE(std::is_permutation( + actualArray.begin(), actualArray.end(), expectedArray.begin())) + << "Actual " << inputVector->encoding() << " array " + << printArray(actualArray) << " at " << i + << " must produce a permutation of expected array " + << printArray(expectedArray); + + distinctValueSet.insert(actualArray); + } + + // Shuffled arrays should be kUniquenessRate% different in total. + const int32_t kThreshold = (int32_t)(kNumShuffleTimes * kUniquenessRate); + auto numDistinctValues = distinctValueSet.size(); + ASSERT_TRUE(numDistinctValues >= kThreshold) + << "Shuffle " << inputVector->encoding() + << " array must yield >= " << kThreshold + << " distinct values, but only got " << numDistinctValues; + } +}; +} // namespace + +TEST_F(ArrayShuffleTest, bigintArrays) { + auto input = makeNullableArrayVector( + {{}, + {std::nullopt}, + {std::nullopt, std::nullopt}, + {-1, 0, 1}, + {std::nullopt, 0, 0}, + {std::numeric_limits::max(), + std::numeric_limits::min(), + 0, + 1, + 2, + 3}}); + testShuffle(input); +} + +TEST_F(ArrayShuffleTest, nestedArrays) { + using innerArrayType = std::vector>; + using outerArrayType = + std::vector>>>; + innerArrayType a{1, 2, 3, 4}; + innerArrayType b{5, 6}; + innerArrayType c{6, 7, 8}; + outerArrayType row1{{a}, {b}}; + outerArrayType row2{std::nullopt, std::nullopt, {a}, {b}, {c}}; + outerArrayType row3{{{}}}; + outerArrayType row4{{{std::nullopt}}}; + auto input = + makeNullableNestedArrayVector({{row1}, {row2}, {row3}, {row4}}); + + testShuffle>(input); +} + +TEST_F(ArrayShuffleTest, sortAndShuffle) { + auto input = makeNullableArrayVector( + {{-1, 0, std::nullopt, 1, std::nullopt}, + {4, 1, 5, 3, 2}, + {std::numeric_limits::max(), + std::numeric_limits::min(), + 4, + 1, + 3, + 2, + std::nullopt}}); + auto inputVector = makeRowVector({input}); + auto result1 = evaluate("array_sort(C0)", inputVector); + auto result2 = evaluate("array_sort(shuffle(C0))", inputVector); + + assertEqualVectors(result1, result2); +} + +TEST_F(ArrayShuffleTest, constantEncoding) { + vector_size_t size = 100; + // Test empty array, array with null element, + // array with duplicate elements, and array with distinct values. + auto valueVector = makeNullableArrayVector( + {{}, {std::nullopt, 0}, {5, 5}, {1, 2, 3}}); + + for (auto i = 0; i < valueVector->size(); i++) { + auto input = BaseVector::wrapInConstant(size, i, valueVector); + testShuffle(input); + } +} + +TEST_F(ArrayShuffleTest, dictEncoding) { + // Test dict with repeated elements: {1,2,3} x 3, {4,5} x 2. + auto base = makeNullableArrayVector( + {{0}, + {1, 2, 3}, + {4, 5, std::nullopt}, + {1, 2, 3}, + {1, 2, 3}, + {4, 5, std::nullopt}}); + // Test repeated index elements and indices filtering (filter out element at + // index 0). + auto indices = makeIndices({3, 3, 4, 2, 2, 1, 1, 1}); + auto input = wrapInDictionary(indices, base); + + testShuffle(input); +} + +TEST_F(ArrayShuffleTest, flatEncodingRandomness) { + testShuffleRandomness(VectorEncoding::Simple::FLAT); +} + +TEST_F(ArrayShuffleTest, constantEncodingRandomness) { + testShuffleRandomness(VectorEncoding::Simple::CONSTANT); +} + +TEST_F(ArrayShuffleTest, dictEncodingRandomness) { + testShuffleRandomness(VectorEncoding::Simple::DICTIONARY); +} diff --git a/velox/functions/prestosql/tests/CMakeLists.txt b/velox/functions/prestosql/tests/CMakeLists.txt index 4fb0aec33bfd..aeaccf61e678 100644 --- a/velox/functions/prestosql/tests/CMakeLists.txt +++ b/velox/functions/prestosql/tests/CMakeLists.txt @@ -31,6 +31,7 @@ add_executable( ArrayMinTest.cpp ArrayPositionTest.cpp ArraySortTest.cpp + ArrayShuffleTest.cpp ArraysOverlapTest.cpp ArraySumTest.cpp BitwiseTest.cpp