From 8996be8f329241fed0e92abcd95e390548d3366c Mon Sep 17 00:00:00 2001 From: Darren Fu Date: Wed, 18 Jan 2023 23:56:12 -0800 Subject: [PATCH] refactor shuffle unit tests; replace with DecodedArgs --- velox/functions/prestosql/ArrayShuffle.cpp | 34 +- .../prestosql/tests/ArrayShuffleTest.cpp | 397 +++++++++--------- 2 files changed, 218 insertions(+), 213 deletions(-) diff --git a/velox/functions/prestosql/ArrayShuffle.cpp b/velox/functions/prestosql/ArrayShuffle.cpp index 158705e613f6..dc755599bbf8 100644 --- a/velox/functions/prestosql/ArrayShuffle.cpp +++ b/velox/functions/prestosql/ArrayShuffle.cpp @@ -23,6 +23,21 @@ namespace facebook::velox::functions { namespace { // See documentation at // https://prestodb.io/docs/current/functions/array.html#shuffle +// +// This function will shuffle identical arrays independently, i.e. even when +// the input has duplicate rows represented using constant and dictionary +// encoding, the output is flat and likely yields different values. +// +// E.g.1: constant encoding +// Input: ConstantVector(base=ArrayVector[{1,2,3}], length=3, index=0) +// Possible Output: ArrayVector[{1,3,2},{2,3,1},{3,2,1}] +// +// E.g.2: dict encoding +// Input: DictionaryVector( +// dictionaryValues=ArrayVector[{1,2,3},{4,5},{1,2,3}], +// dictionaryIndices=[1,2,0]) +// Possible Output: ArrayVector[{5,4},{2,1,3},{1,3,2}] +// class ArrayShuffleFunction : public exec::VectorFunction { public: bool isDeterministic() const override { @@ -37,19 +52,18 @@ class ArrayShuffleFunction : public exec::VectorFunction { VectorPtr& result) const override { VELOX_CHECK_EQ(args.size(), 1); - // For deterministic single-arg functions, expression evaluation will peel - // off encodings and guarantee that we will only see flat or constant - // inputs. But this guarantee does not exist for non-deterministic - // functions like shuffle(). Hence, we need to use DecodedVector to handle - // all the possible dict encodings including constant, flat or dictionary. - exec::LocalDecodedVector localDecodedVector(context, *args[0], rows); - const auto& decodedArray = *localDecodedVector.get(); - auto arrayVector = decodedArray.base()->as(); + // This is a non-deterministic function, which violates the guarantee on a + // deterministic single-arg function that the expression evaluation will + // peel off encodings, and we will only see flat or constant inputs. Hence, + // we need to use DecodedVector to handle ALL encodings. + exec::DecodedArgs decodedArgs(rows, args, context); + auto decodedArg = decodedArgs.at(0); + auto arrayVector = decodedArg->base()->as(); auto elementsVector = arrayVector->elements(); vector_size_t numElements = 0; context.applyToSelectedNoThrow(rows, [&](auto row) { - const auto size = arrayVector->sizeAt(decodedArray.index(row)); + const auto size = arrayVector->sizeAt(decodedArg->index(row)); numElements += size; }); @@ -65,7 +79,7 @@ class ArrayShuffleFunction : public exec::VectorFunction { vector_size_t newOffset = 0; std::mt19937 randGen(std::random_device{}()); context.applyToSelectedNoThrow(rows, [&](auto row) { - vector_size_t arrayRow = decodedArray.index(row); + vector_size_t arrayRow = decodedArg->index(row); vector_size_t size = arrayVector->sizeAt(arrayRow); vector_size_t offset = arrayVector->offsetAt(arrayRow); diff --git a/velox/functions/prestosql/tests/ArrayShuffleTest.cpp b/velox/functions/prestosql/tests/ArrayShuffleTest.cpp index 12445c3e5fe5..60e5cf03dd0a 100644 --- a/velox/functions/prestosql/tests/ArrayShuffleTest.cpp +++ b/velox/functions/prestosql/tests/ArrayShuffleTest.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -28,12 +29,6 @@ using namespace facebook::velox::test; using namespace facebook::velox::functions::test; namespace { -DecodedVector* decode(DecodedVector& decoder, const BaseVector& vector) { - SelectivityVector rows(vector.size()); - decoder.decode(vector, rows); - return &decoder; -} - template std::string printArray(const std::vector>& input) { std::stringstream out; @@ -77,57 +72,100 @@ std::string printArray(const std::vector& input) { return out.str(); } +template +exec::ArrayView read( + exec::VectorReader>& reader, + size_t offset) { + if constexpr (returnsOptionalValues) { + return reader[offset]; + } else { + return reader.readNullFree(offset); + } +} +} // namespace + +namespace { class ArrayShuffleTest : public FunctionBaseTest { protected: - constexpr static const int32_t kNumOfEvalTimes = 100; + constexpr static const int32_t kNumShuffleTimes = 100; constexpr static const int32_t kNums = 10; constexpr static const double kUniquenessRate = .7; - void testTwoExprsEquals( - const std::string& expression1, - const std::string& expression2, - const std::vector& input) { - auto result1 = evaluate(expression1, makeRowVector(input)); - auto result2 = evaluate(expression2, makeRowVector(input)); - assertEqualVectors(result1, result2); - } - - template - void decodeElementsAndTestIsAPermutation( - const BaseVector& actualVector, - const std::vector>>& expected) { - DecodedVector decoded; - exec::VectorReader reader(decode(decoded, actualVector)); - - for (auto i = 0; i < actualVector.size(); i++) { - ASSERT_TRUE(reader.isSet(i)) - << "Actual array at " << i - << " is a null, which we don't need to test here"; - auto actualArray = reader[i].materialize(); - // 3. assert the two vectors containing the same elements (ignoring order) - LOG(WARNING) << "Actual array " << printArray(actualArray) << " at " << i - << " vs expected array " << printArray(expected[i]); + template + void testShuffle(const VectorPtr& input) { + DecodedVector decodedExpected; + decodedExpected.decode(*input.get()); + exec::VectorReader> readerExpected(&decodedExpected); + + auto actualVector = + evaluate("shuffle(C0)", makeRowVector({input})); + // Validate each row from the actual decoded ArrayVector is a permutation + // of the corresponding row from the expected decoded ArrayVector. + DecodedVector decodedActual; + decodedActual.decode(*actualVector.get()); + exec::VectorReader> readerActual(&decodedActual); + + for (auto i = 0; i < input->size(); i++) { + // Must materialize into std::vector, otherwise it will throw error + // because ArrayView doesn't support std::is_permutation() yet. + auto actualArray = + read(readerActual, i).materialize(); + auto expectedArray = + read(readerExpected, i).materialize(); + + // Assert the two vectors containing the same elements (ignoring order). ASSERT_TRUE(std::is_permutation( - actualArray.begin(), actualArray.end(), expected[i].begin())) + actualArray.begin(), actualArray.end(), expectedArray.begin())) << "Actual array " << printArray(actualArray) << " at " << i - << " must produce a permutation of the expected array " - << printArray(expected[i]); + << " must produce a permutation of expected array " + << printArray(expectedArray); } } - template > - void testIsPermutation( - const std::vector>>& expected, - const std::string& expression, - const std::vector& input) { - auto arrayVectorResult = - evaluate(expression, makeRowVector(input)); - decodeElementsAndTestIsAPermutation( - *arrayVectorResult.get(), expected); + /// Generate a range-based array N {0, 1, ..., n-1} as the input for + /// test shuffle randomness purpose. + /// 1. For flat encoding: we generate an t-size ArrayVector with identical + /// elements (N). + /// 2. For constant encoding: we generate an t-size ConstantVector with + /// valueVector = N. + /// 3. For dict encoding: we generate a DictionaryVector with a 2t-size base + /// ArrayVector (with and identical elements: N) and a t-size indices. + VectorPtr generateVector( + const VectorEncoding::Simple& encoding, + const int32_t n = kNums, + const int32_t t = kNumShuffleTimes) { + std::vector inputData(n); + std::iota(inputData.begin(), inputData.end(), 0); + + if (encoding == VectorEncoding::Simple::FLAT) { + std::vector> flatData(t); + std::fill(flatData.begin(), flatData.end(), inputData); + return makeArrayVector(flatData); + } + if (encoding == VectorEncoding::Simple::CONSTANT) { + vector_size_t size = t; + auto valueVector = makeArrayVector({inputData}); + return BaseVector::wrapInConstant(size, 0, valueVector); + } + if (encoding == VectorEncoding::Simple::DICTIONARY) { + vector_size_t baseSize = t * 2; + std::vector> baseData(baseSize); + std::fill(baseData.begin(), baseData.end(), inputData); + auto base = makeArrayVector(baseData); + + std::vector indicesData(t); + std::iota(indicesData.begin(), indicesData.end(), 0); + auto indices = makeIndices(indicesData); + return wrapInDictionary(indices, base); + } + // Never reach here. + std::stringstream out; + out << encoding; + VELOX_FAIL("Unsupported encoding: " + out.str()); } - /// To test a shuffle, we will do the following steps: - /// 1. run the test for T times with range-based input {0, 1, ..., n-1}, + /// To test shuffle's randomness, we need the following steps: + /// 1. shuffle T times with range-based input {0, 1, ..., n-1}, /// and it should be at least X% times different in total. /// Let's say T = 100, n = 10, X = 70. /// 2. test if each shuffle array is a permutation of the origin array, @@ -138,188 +176,141 @@ class ArrayShuffleTest : public FunctionBaseTest { /// verifying the shuffle's (1) randomness and (2) and correctness. However, /// it doesn't guarantee the uniformity. We will avoid over testing with /// leveraging standard deviation, etc. - void testRandomness( - const int32_t t = kNumOfEvalTimes, - const int32_t n = kNums) { - // generate a range-based array {0, 1, ..., n-1} and shuffle it - std::vector actual(n); - std::iota(actual.begin(), actual.end(), 0); - std::vector expected(actual); - std::mt19937 rng(std::random_device{}()); - std::shuffle(actual.begin(), actual.end(), rng); - - const int32_t kThreshold = (int32_t)(n * kUniquenessRate); - auto actualVector = makeArrayVector({actual}); - folly::F14FastSet> distinctResults; - - for (auto i = 0; i < t; i++) { - auto arrayVectorResult = - evaluate("shuffle(C0)", makeRowVector({actualVector})); - // decode element values from output ArrayVector - DecodedVector decoded; - exec::VectorReader reader(decode(decoded, *arrayVectorResult)); - auto arrayReader = reader[0].castTo>(); - std::vector shuffledResult(kNums); - for (auto offset = 0; offset < arrayReader.size(); offset++) { - shuffledResult[offset] = arrayReader[offset].value(); - } - // assert the two vectors containing the same elements (ignoring order) - ASSERT_TRUE(std::is_permutation( - shuffledResult.begin(), shuffledResult.end(), expected.begin())) - << "Actual array " << printArray(shuffledResult) << " at " << i - << " must produce a permutation of the expected array " - << printArray(expected); - distinctResults.insert(shuffledResult); - } + void testShuffleRandomness(const VectorPtr& input) { + DecodedVector decodedExpected; + decodedExpected.decode(*input.get()); + exec::VectorReader> readerExpected(&decodedExpected); - // shuffled arrays should be X% different in total. - int32_t numOfDistincts = distinctResults.size(); - ASSERT_TRUE(numOfDistincts >= kThreshold) - << "Shuffle must produce at least " << kThreshold - << " distinct results out of " << t << " total results, but got only " - << numOfDistincts; - } + using materialize_t = + typename exec::ArrayView::materialize_t; + folly::F14FastSet distinctValueSet; - void testBigintNullable() { - auto actual = makeNullableArrayVector( - {{}, - {std::nullopt}, - {std::nullopt, std::nullopt, std::nullopt}, - {-1, 0, std::nullopt, 1, std::nullopt}, - {0, std::nullopt, 0, 0}, - { - std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max(), - 1, - 0, - -1, - -1, - std::nullopt, - std::nullopt, - }}); - std::vector>> expected{ - {}, - {std::nullopt}, - {std::nullopt, std::nullopt, std::nullopt}, - {-1, 0, 1, std::nullopt, std::nullopt}, - {std::nullopt, 0, 0, 0}, - {std::nullopt, - std::nullopt, - -1, - -1, - 0, - 1, - std::numeric_limits::max(), - std::numeric_limits::max(), - std::numeric_limits::max()}}; - testIsPermutation(expected, "shuffle(C0)", {actual}); - } + auto actualVector = + evaluate("shuffle(C0)", makeRowVector({input})); - void testNestedArrayNullable() { - using innerArrayType = std::vector>; - using outerArrayType = - std::vector>>>; - innerArrayType a{1, 2, 3, 4}; - innerArrayType b{5, 6}; - innerArrayType c{6, 7, 8}; - outerArrayType row1{{a}, {b}}; - outerArrayType row2{std::nullopt, std::nullopt, {a}, {b}, {c}}; - outerArrayType row3{{{}}}; - outerArrayType row4{{{std::nullopt}}}; - auto actual = makeNullableNestedArrayVector( - {{row1}, {row2}, {row3}, {row4}}); - std::vector expected{ - {{{b}, {a}}}, - {{{b}, {a}, {c}, std::nullopt, std::nullopt}}, - {row3}, - {row4}}; - testIsPermutation>>( - expected, "shuffle(C0)", {actual}); - } + DecodedVector decodedActual; + decodedActual.decode(*actualVector.get()); + exec::VectorReader> readerActual(&decodedActual); - void testSortedArrayAndSortedShuffleArray() { - auto actual = makeNullableArrayVector( - {{-1, 0, std::nullopt, 1, std::nullopt}, - {4, 1, 5, 3, 2}, - {std::numeric_limits::max(), - std::numeric_limits::min(), - 4, - 1, - 3, - 2, - std::nullopt}}); - testTwoExprsEquals("array_sort(C0)", "array_sort(shuffle(C0))", {actual}); - } + for (auto i = 0; i < actualVector->size(); i++) { + auto actualArray = read(readerActual, i).materialize(); + auto expectedArray = + read(readerExpected, i).materialize(); - void testConstantEncoding() { - vector_size_t size = 1'000; - auto valueVector = makeArrayVector({{}, {4, 5}, {1, 2, 3}}); - - auto evaluateConstant = [&](vector_size_t row, const VectorPtr& vector) { - return evaluate( - "shuffle(c0)", - makeRowVector({BaseVector::wrapInConstant(size, row, vector)})); - }; - - std::vector>> expected(size); - std::vector>> data{ - {}, {4, 5}, {1, 2, 3}}; - - auto result = evaluateConstant(0, valueVector); - std::fill(expected.begin(), expected.end(), data[0]); - decodeElementsAndTestIsAPermutation>( - *result.get(), expected); - - result = evaluateConstant(1, valueVector); - std::fill(expected.begin(), expected.end(), data[1]); - decodeElementsAndTestIsAPermutation>( - *result.get(), expected); - - result = evaluateConstant(2, valueVector); - std::fill(expected.begin(), expected.end(), data[2]); - decodeElementsAndTestIsAPermutation>( - *result.get(), expected); - } - - void testDictEncoding() { - // Testing the cases of repeated elements and filtering. - auto base = makeArrayVector( - {{0}, {1, 2, 3}, {4, 5}, {1, 2, 3}, {1, 2, 3}, {4, 5}}); - std::vector indicesArray({3, 4, 1, 2}); - auto indices = makeIndices(indicesArray); - auto input = wrapInDictionary(indices, base); + ASSERT_TRUE(std::is_permutation( + actualArray.begin(), actualArray.end(), expectedArray.begin())) + << "Actual array " << printArray(actualArray) << " at " << i + << " must produce a permutation of expected array " + << printArray(expectedArray); - std::vector>> expected{ - {2, 3, 1}, {1, 2, 3}, {1, 3, 2}, {4, 5}}; + distinctValueSet.insert(actualArray); + } - auto vectorResult = evaluate("shuffle(C0)", makeRowVector({input})); - decodeElementsAndTestIsAPermutation>( - *vectorResult.get(), expected); + // Shuffled arrays should be X% different in total. + const int32_t kThreshold = + (int32_t)(actualVector->size() * kUniquenessRate); + auto numDistinctValues = distinctValueSet.size(); + ASSERT_TRUE(numDistinctValues >= kThreshold) + << "Shuffle " << input->encoding() + << " array must yield >= " << kThreshold + << " distinct values, but only got " << numDistinctValues; } }; } // namespace TEST_F(ArrayShuffleTest, normalArrays) { - testBigintNullable(); + auto input = makeNullableArrayVector( + {{}, + {std::nullopt}, + {std::nullopt, std::nullopt}, + {-1, 0, 1}, + {std::nullopt, 0, 0}, + {std::numeric_limits::max(), + std::numeric_limits::min(), + 0, + 1, + 2, + 3}}); + testShuffle(input); } TEST_F(ArrayShuffleTest, nestedArrays) { - testNestedArrayNullable(); + using innerArrayType = std::vector>; + using outerArrayType = + std::vector>>>; + innerArrayType a{1, 2, 3, 4}; + innerArrayType b{5, 6}; + innerArrayType c{6, 7, 8}; + outerArrayType row1{{a}, {b}}; + outerArrayType row2{std::nullopt, std::nullopt, {a}, {b}, {c}}; + outerArrayType row3{{{}}}; + outerArrayType row4{{{std::nullopt}}}; + auto input = + makeNullableNestedArrayVector({{row1}, {row2}, {row3}, {row4}}); + + testShuffle, true>(input); } TEST_F(ArrayShuffleTest, sortAndShuffle) { - testSortedArrayAndSortedShuffleArray(); + auto input = makeNullableArrayVector( + {{-1, 0, std::nullopt, 1, std::nullopt}, + {4, 1, 5, 3, 2}, + {std::numeric_limits::max(), + std::numeric_limits::min(), + 4, + 1, + 3, + 2, + std::nullopt}}); + auto inputVector = makeRowVector({input}); + auto result1 = evaluate("array_sort(C0)", inputVector); + auto result2 = evaluate("array_sort(shuffle(C0))", inputVector); + + assertEqualVectors(result1, result2); } TEST_F(ArrayShuffleTest, constantEncoding) { - testConstantEncoding(); + vector_size_t size = 100; + // Considering the cases of empty array, nullable array, duplicate-element + // array, and distinct-value array. + auto valueVector = makeNullableArrayVector( + {{}, {std::nullopt, 0}, {5, 5}, {1, 2, 3}}); + + for (auto i = 0; i < valueVector->size(); i++) { + auto input = BaseVector::wrapInConstant(size, i, valueVector); + testShuffle(input); + } } TEST_F(ArrayShuffleTest, dictEncoding) { - testDictEncoding(); + // Considering the case of repeated elements: {1,2,3} x 3, {4,5} x 2. + auto base = makeNullableArrayVector( + {{0}, + {1, 2, 3}, + {4, 5, std::nullopt}, + {1, 2, 3}, + {1, 2, 3}, + {4, 5, std::nullopt}}); + // Considering the case of indices filtering (filter out element at index 0) + std::vector indicesArray({3, 4, 1, 2}); + + auto indices = makeIndices(indicesArray); + auto input = wrapInDictionary(indices, base); + + testShuffle(input); } -TEST_F(ArrayShuffleTest, randomness) { - testRandomness(); +TEST_F(ArrayShuffleTest, flatEncodingRandomness) { + auto inputVector = generateVector(VectorEncoding::Simple::FLAT); + testShuffleRandomness(inputVector); } + +TEST_F(ArrayShuffleTest, constantEncodingRandomness) { + auto inputVector = generateVector(VectorEncoding::Simple::CONSTANT); + testShuffleRandomness(inputVector); +} + +TEST_F(ArrayShuffleTest, dictEncodingRandomness) { + auto inputVector = generateVector(VectorEncoding::Simple::DICTIONARY); + testShuffleRandomness(inputVector); +} \ No newline at end of file