diff --git a/velox/docs/functions/presto/map.rst b/velox/docs/functions/presto/map.rst index 28b5c2edc2f2..9c77af3312ff 100644 --- a/velox/docs/functions/presto/map.rst +++ b/velox/docs/functions/presto/map.rst @@ -27,17 +27,17 @@ Map Functions See also :func:`map_agg` for creating a map as an aggregation. +.. function:: map_concat(map1(K,V), map2(K,V), ..., mapN(K,V)) -> map(K,V) + + Returns the union of all the given maps. If a key is found in multiple given maps, + that key's value in the resulting map comes from the last one of those maps. + .. function:: map_entries(map(K,V)) -> array(row(K,V)) Returns an array of all entries in the given map. :: SELECT map_entries(MAP(ARRAY[1, 2], ARRAY['x', 'y'])); -- [ROW(1, 'x'), ROW(2, 'y')] -.. function:: map_concat(map1(K,V), map2(K,V), ..., mapN(K,V)) -> map(K,V) - - Returns the union of all the given maps. If a key is found in multiple given maps, - that key's value in the resulting map comes from the last one of those maps. - .. function:: map_filter(map(K,V), function(K,V,boolean)) -> map(K,V) Constructs a map from those entries of ``map`` for which ``function`` returns true:: @@ -46,6 +46,12 @@ Map Functions SELECT map_filter(MAP(ARRAY[10, 20, 30], ARRAY['a', NULL, 'c']), (k, v) -> v IS NOT NULL); -- {10 -> a, 30 -> c} SELECT map_filter(MAP(ARRAY['k1', 'k2', 'k3'], ARRAY[20, 3, 15]), (k, v) -> v > 10); -- {k1 -> 20, k3 -> 15} +.. function:: map_from_entries(array(row(K, V))) -> map(K, V) + + Returns a map created from the given array of entries. :: + + SELECT map_from_entries(ARRAY[(1, 'x'), (2, 'y')]); -- {1 -> 'x', 2 -> 'y'} + .. function:: map_keys(x(K,V)) -> array(K) Returns all the keys in the map ``x``. @@ -54,14 +60,6 @@ Map Functions Returns all the values in the map ``x``. -.. function:: subscript(map(K, V), key) -> V - :noindex: - - Returns value for given ``key``. Throws if the key is not contained in the map. - Corresponds to SQL subscript operator []. - - SELECT name_to_age_map['Bob'] AS bob_age; - .. 
function:: map_zip_with(map(K,V1), map(K,V2), function(K,V1,V2,V3)) -> map(K,V3) Merges the two given maps into a single map by applying ``function`` to the pair of values with the same key. @@ -77,6 +75,14 @@ Map Functions MAP(ARRAY['a', 'b', 'c'], ARRAY[1, 2, 3]), (k, v1, v2) -> k || CAST(v1/v2 AS VARCHAR)); +.. function:: subscript(map(K, V), key) -> V + :noindex: + + Returns value for given ``key``. Throws if the key is not contained in the map. + Corresponds to SQL subscript operator []. + + SELECT name_to_age_map['Bob'] AS bob_age; + .. function:: transform_keys(map(K1,V), function(K1,V,K2)) -> map(K2,V) Returns a map that applies ``function`` to each entry of ``map`` and transforms the keys:: diff --git a/velox/functions/prestosql/CMakeLists.txt b/velox/functions/prestosql/CMakeLists.txt index 63a558189235..c47b42234d01 100644 --- a/velox/functions/prestosql/CMakeLists.txt +++ b/velox/functions/prestosql/CMakeLists.txt @@ -38,6 +38,7 @@ add_library( JsonFunctions.cpp Map.cpp MapEntries.cpp + MapFromEntries.cpp MapKeysAndValues.cpp MapZipWith.cpp Not.cpp 
diff --git a/velox/functions/prestosql/MapFromEntries.cpp b/velox/functions/prestosql/MapFromEntries.cpp
new file mode 100644
index 000000000000..ebf028999d26
--- /dev/null
+++ b/velox/functions/prestosql/MapFromEntries.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/expression/EvalCtx.h"
+#include "velox/expression/Expr.h"
+#include "velox/expression/VectorFunction.h"
+#include "velox/functions/lib/CheckDuplicateKeys.h"
+#include "velox/functions/lib/RowsTranslationUtil.h"
+
+namespace facebook::velox::functions {
+namespace {
+/// Implements map_from_entries(array(row(K,V))) -> map(K,V): turns each array
+/// of (key, value) rows into a map. A null entry or a null key is reported as a
+/// user error, and the result is passed through checkDuplicateKeys().
+///
+/// See documentation at https://prestodb.io/docs/current/functions/map.html
+class MapFromEntriesFunction : public exec::VectorFunction {
+ public:
+  /// Evaluates the function for 'rows' of the single array argument and stores
+  /// the resulting map vector in 'result'.
+  void apply(
+      const SelectivityVector& rows,
+      std::vector<VectorPtr>& args,
+      const TypePtr& outputType,
+      exec::EvalCtx& context,
+      VectorPtr& result) const override {
+    VELOX_CHECK_EQ(args.size(), 1);
+    auto& arg = args[0];
+    VectorPtr localResult;
+
+    // Input can be constant or flat.
+    if (arg->isConstantEncoding()) {
+      auto* constantArray = arg->as<ConstantVector<ComplexType>>();
+      const auto& flatArray = constantArray->valueVector();
+      const auto flatIndex = constantArray->index();
+
+      // Convert only the one referenced row of the underlying array, then
+      // expose it again as a constant covering all of 'rows'.
+      exec::LocalSelectivityVector singleRow(context, flatIndex + 1);
+      singleRow->clearAll();
+      singleRow->setValid(flatIndex, true);
+      singleRow->updateBounds();
+
+      localResult = applyFlat(
+          *singleRow.get(), flatArray->as<ArrayVector>(), outputType, context);
+      localResult =
+          BaseVector::wrapInConstant(rows.size(), flatIndex, localResult);
+    } else {
+      localResult =
+          applyFlat(rows, arg->as<ArrayVector>(), outputType, context);
+    }
+
+    context.moveOrCopyResult(localResult, rows, result);
+  }
+
+  /// Returns the supported signatures: array(unknown) and array(row(K,V)).
+  static std::vector<std::shared_ptr<exec::FunctionSignature>> signatures() {
+    return {// array(unknown) -> map(unknown, unknown)
+            exec::FunctionSignatureBuilder()
+                .returnType("map(unknown, unknown)")
+                .argumentType("array(unknown)")
+                .build(),
+            // array(row(K,V)) -> map(K,V)
+            exec::FunctionSignatureBuilder()
+                .knownTypeVariable("K")
+                .typeVariable("V")
+                .returnType("map(K,V)")
+                .argumentType("array(row(K,V))")
+                .build()};
+  }
+
+ private:
+  /// Handles the array(unknown) signature, whose elements are not a RowVector
+  /// and therefore have no key/value children to reuse. A null entry is
+  /// rejected exactly like in the row case; every remaining row yields an empty
+  /// map. NOTE(review): assumes UNKNOWN-typed elements can only be null, so a
+  /// row that passes validation has no entries - confirm.
+  static VectorPtr applyUnknown(
+      const SelectivityVector& rows,
+      const ArrayVector* inputArray,
+      const DecodedVector& decodedEntries,
+      const TypePtr& outputType,
+      exec::EvalCtx& context) {
+    context.applyToSelectedNoThrow(rows, [&](vector_size_t row) {
+      const auto size = inputArray->sizeAt(row);
+      const auto offset = inputArray->offsetAt(row);
+      for (vector_size_t i = 0; i < size; ++i) {
+        VELOX_USER_CHECK(
+            !decodedEntries.isNullAt(offset + i), "map entry cannot be null");
+      }
+    });
+
+    // Freshly allocated offsets and sizes are zero-filled, i.e. all maps are
+    // empty, so empty key and value vectors are sufficient.
+    return std::make_shared<MapVector>(
+        context.pool(),
+        outputType,
+        inputArray->nulls(),
+        rows.end(),
+        allocateOffsets(rows.end(), context.pool()),
+        allocateSizes(rows.end(), context.pool()),
+        BaseVector::create(outputType->childAt(0), 0, context.pool()),
+        BaseVector::create(outputType->childAt(1), 0, context.pool()));
+  }
+
+  /// Converts the selected 'rows' of a flat 'inputArray' into a MapVector that
+  /// shares the array's nulls, offsets and sizes buffers.
+  VectorPtr applyFlat(
+      const SelectivityVector& rows,
+      const ArrayVector* inputArray,
+      const TypePtr& outputType,
+      exec::EvalCtx& context) const {
+    auto& inputRowVector = inputArray->elements();
+    exec::LocalDecodedVector decodedRow(context);
+    decodedRow.get()->decode(*inputRowVector);
+
+    // array(unknown) elements are not rows; casting them to RowVector below
+    // would yield a null pointer.
+    if (inputRowVector->typeKind() == TypeKind::UNKNOWN) {
+      return applyUnknown(
+          rows, inputArray, *decodedRow.get(), outputType, context);
+    }
+
+    auto rowVector = decodedRow->base()->as<RowVector>();
+    VELOX_CHECK_NOT_NULL(rowVector);
+    auto rowKeyVector = rowVector->childAt(0);
+
+    // Validate all map entries and map keys are not null.
+    if (decodedRow->mayHaveNulls() || rowKeyVector->mayHaveNulls()) {
+      context.applyToSelectedNoThrow(rows, [&](vector_size_t row) {
+        auto size = inputArray->sizeAt(row);
+        auto offset = inputArray->offsetAt(row);
+        for (auto i = 0; i < size; ++i) {
+          bool isMapEntryNull = decodedRow->isNullAt(offset + i);
+          VELOX_USER_CHECK(!isMapEntryNull, "map entry cannot be null");
+          bool isMapKeyNull =
+              rowKeyVector->isNullAt(decodedRow->index(offset + i));
+          VELOX_USER_CHECK(!isMapKeyNull, "map key cannot be null");
+        }
+      });
+    }
+
+    // The children of the base row vector can be used directly unless the
+    // elements are wrapped in an encoding; then apply the same wrapping.
+    VectorPtr wrappedKeys;
+    VectorPtr wrappedValues;
+    if (decodedRow->isIdentityMapping()) {
+      wrappedKeys = rowVector->childAt(0);
+      wrappedValues = rowVector->childAt(1);
+    } else {
+      wrappedKeys = decodedRow->wrap(
+          rowVector->childAt(0), *inputRowVector, inputRowVector->size());
+      wrappedValues = decodedRow->wrap(
+          rowVector->childAt(1), *inputRowVector, inputRowVector->size());
+    }
+
+    // To avoid creating new buffers, we try to reuse the input's buffers
+    // as many as possible.
+    auto mapVector = std::make_shared<MapVector>(
+        context.pool(),
+        outputType,
+        inputArray->nulls(),
+        rows.end(),
+        inputArray->offsets(),
+        inputArray->sizes(),
+        wrappedKeys,
+        wrappedValues);
+
+    checkDuplicateKeys(mapVector, rows, context);
+    return mapVector;
+  }
+};
+} // namespace
+
+VELOX_DECLARE_VECTOR_FUNCTION(
+    udf_map_from_entries,
+    MapFromEntriesFunction::signatures(),
+    std::make_unique<MapFromEntriesFunction>());
+} // namespace facebook::velox::functions
diff --git a/velox/functions/prestosql/registration/MapFunctionsRegistration.cpp b/velox/functions/prestosql/registration/MapFunctionsRegistration.cpp index 64387269ae79..7781e27c8173 100644 --- a/velox/functions/prestosql/registration/MapFunctionsRegistration.cpp +++ b/velox/functions/prestosql/registration/MapFunctionsRegistration.cpp @@ -26,6 +26,8 @@ void registerMapFunctions(const std::string& prefix) { udf_transform_values, prefix + "transform_values"); VELOX_REGISTER_VECTOR_FUNCTION(udf_map, prefix + "map"); VELOX_REGISTER_VECTOR_FUNCTION(udf_map_entries, prefix + "map_entries"); + VELOX_REGISTER_VECTOR_FUNCTION( + udf_map_from_entries, prefix + "map_from_entries"); VELOX_REGISTER_VECTOR_FUNCTION(udf_map_keys, prefix + "map_keys"); VELOX_REGISTER_VECTOR_FUNCTION(udf_map_values, prefix + "map_values"); VELOX_REGISTER_VECTOR_FUNCTION(udf_map_zip_with, prefix + "map_zip_with"); diff --git a/velox/functions/prestosql/tests/CMakeLists.txt b/velox/functions/prestosql/tests/CMakeLists.txt index a72acbfee748..8b11ce47c71a 100644 --- a/velox/functions/prestosql/tests/CMakeLists.txt +++ b/velox/functions/prestosql/tests/CMakeLists.txt @@ -56,6 +56,7 @@ add_executable( JsonFunctionsTest.cpp MapEntriesTest.cpp MapFilterTest.cpp + MapFromEntriesTest.cpp MapKeysAndValuesTest.cpp MapTest.cpp MapZipWithTest.cpp diff --git a/velox/functions/prestosql/tests/MapFromEntriesTest.cpp b/velox/functions/prestosql/tests/MapFromEntriesTest.cpp new file mode 100644 index 000000000000..be77f088d529 --- /dev/null +++ 
b/velox/functions/prestosql/tests/MapFromEntriesTest.cpp
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "velox/common/base/tests/GTestUtils.h"
+#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"
+#include "velox/vector/tests/TestingDictionaryArrayElementsFunction.h"
+
+using namespace facebook::velox;
+using namespace facebook::velox::test;
+using namespace facebook::velox::functions::test;
+
+namespace {
+// Shorthand that wraps the entries of one expected map row in an optional,
+// so it can be listed next to std::nullopt rows.
+std::optional<std::vector<std::pair<int32_t, std::optional<int32_t>>>> O(
+    const std::vector<std::pair<int32_t, std::optional<int32_t>>>& vector) {
+  return std::make_optional(vector);
+}
+} // namespace
+
+namespace {
+class MapFromEntriesTest : public FunctionBaseTest {
+ protected:
+  /// Create an MAP vector of size 1 using specified 'keys' and 'values' vector.
+  VectorPtr makeSingleRowMapVector(
+      const VectorPtr& keys,
+      const VectorPtr& values) {
+    BufferPtr offsets = allocateOffsets(1, pool());
+    BufferPtr sizes = allocateSizes(1, pool());
+    sizes->asMutable<vector_size_t>()[0] = keys->size();
+
+    return std::make_shared<MapVector>(
+        pool(),
+        MAP(keys->type(), values->type()),
+        nullptr,
+        1,
+        offsets,
+        sizes,
+        keys,
+        values);
+  }
+
+  /// Evaluates map_from_entries('funcArg') over 'input', optionally wrapped in
+  /// try(), and asserts that the result equals 'expected'.
+  void verifyMapFromEntries(
+      const std::vector<VectorPtr>& input,
+      const VectorPtr& expected,
+      const std::string& funcArg = "C0",
+      bool wrappedWithTry = false) {
+    const std::string expr = wrappedWithTry
+        ? fmt::format("try(map_from_entries({}))", funcArg)
+        : fmt::format("map_from_entries({})", funcArg);
+    auto result = evaluate(expr, makeRowVector(input));
+    assertEqualVectors(expected, result);
+  }
+
+  // Evaluate an expression only, usually expect error thrown.
+  void evaluateExpr(
+      const std::string& expression,
+      const std::vector<VectorPtr>& input) {
+    evaluate(expression, makeRowVector(input));
+  }
+};
+} // namespace
+
+TEST_F(MapFromEntriesTest, intKeyAndVarcharValue) {
+  auto rowType = ROW({INTEGER(), VARCHAR()});
+  std::vector<std::vector<variant>> data = {
+      {variant::row({1, "red"}),
+       variant::row({2, "blue"}),
+       variant::row({3, "green"})}};
+  auto input = makeArrayOfRowVector(rowType, data);
+  auto expected = makeMapVector<int32_t, StringView>(
+      {{{1, "red"_sv}, {2, "blue"_sv}, {3, "green"_sv}}});
+  verifyMapFromEntries({input}, expected);
+}
+
+TEST_F(MapFromEntriesTest, nullMapEntries) {
+  auto rowType = ROW({INTEGER(), INTEGER()});
+  std::vector<std::vector<variant>> data = {
+      {variant(TypeKind::ROW)}, {variant::row({1, 11})}};
+  auto input = makeArrayOfRowVector(rowType, data);
+  VELOX_ASSERT_THROW(
+      evaluateExpr("map_from_entries(C0)", {input}),
+      "map entry cannot be null");
+  auto expected =
+      makeNullableMapVector<int32_t, int32_t>({std::nullopt, O({{1, 11}})});
+  verifyMapFromEntries({input}, expected, "C0", true);
+}
+
+TEST_F(MapFromEntriesTest, nullKeys) {
+  auto rowType = ROW({INTEGER(), INTEGER()});
+  std::vector<std::vector<variant>> data = {
+      {variant::row({variant::null(TypeKind::INTEGER), 0})},
+      {variant::row({1, 11})}};
+  auto input = makeArrayOfRowVector(rowType, data);
+  VELOX_ASSERT_THROW(
+      evaluateExpr("map_from_entries(C0)", {input}), "map key cannot be null");
+  auto expected =
+      makeNullableMapVector<int32_t, int32_t>({std::nullopt, O({{1, 11}})});
+  verifyMapFromEntries({input}, expected, "C0", true);
+}
+
+TEST_F(MapFromEntriesTest, duplicateKeys) {
+  auto rowType = ROW({INTEGER(), INTEGER()});
+  std::vector<std::vector<variant>> data = {
+      {variant::row({1, 10}), variant::row({1, 11})}, {variant::row({2, 22})}};
+  auto input = makeArrayOfRowVector(rowType, data);
+  VELOX_ASSERT_THROW(
+      evaluateExpr("map_from_entries(C0)", {input}),
+      "Duplicate map keys (1) are not allowed");
+  auto expected =
+      makeNullableMapVector<int32_t, int32_t>({std::nullopt, O({{2, 22}})});
+  verifyMapFromEntries({input}, expected, "C0", true);
+}
+
+TEST_F(MapFromEntriesTest, nullValues) {
+  auto rowType = ROW({INTEGER(), INTEGER()});
+  std::vector<std::vector<variant>> data = {
+      {variant::row({1, variant::null(TypeKind::INTEGER)}),
+       variant::row({2, 22}),
+       variant::row({3, 33})}};
+  auto input = makeArrayOfRowVector(rowType, data);
+  auto expected =
+      makeMapVector<int32_t, int32_t>({{{1, std::nullopt}, {2, 22}, {3, 33}}});
+  verifyMapFromEntries({input}, expected);
+}
+
+TEST_F(MapFromEntriesTest, constant) {
+  const vector_size_t kConstantSize = 1'000;
+  auto rowType = ROW({VARCHAR(), INTEGER()});
+  std::vector<std::vector<variant>> data = {
+      {variant::row({"red", 1}),
+       variant::row({"blue", 2}),
+       variant::row({"green", 3})},
+      {variant::row({"red shiny car ahead", 4}),
+       variant::row({"blue clear sky above", 5})},
+      {variant::row({"r", 11}),
+       variant::row({"g", 22}),
+       variant::row({"b", 33})}};
+  auto input = makeArrayOfRowVector(rowType, data);
+
+  auto evaluateConstant = [&](vector_size_t row, const VectorPtr& vector) {
+    return evaluate(
+        "map_from_entries(C0)",
+        makeRowVector(
+            {BaseVector::wrapInConstant(kConstantSize, row, vector)}));
+  };
+
+  auto result = evaluateConstant(0, input);
+  auto expected = BaseVector::wrapInConstant(
+      kConstantSize,
+      0,
+      makeSingleRowMapVector(
+          makeFlatVector<StringView>({"red"_sv, "blue"_sv, "green"_sv}),
+          makeFlatVector<int32_t>({1, 2, 3})));
+  test::assertEqualVectors(expected, result);
+
+  result = evaluateConstant(1, input);
+  expected = BaseVector::wrapInConstant(
+      kConstantSize,
+      0,
+      makeSingleRowMapVector(
+          makeFlatVector<StringView>(
+              {"red shiny car ahead"_sv, "blue clear sky above"_sv}),
+          makeFlatVector<int32_t>({4, 5})));
+  test::assertEqualVectors(expected, result);
+
+  result = evaluateConstant(2, input);
+  expected = BaseVector::wrapInConstant(
+      kConstantSize,
+      0,
+      makeSingleRowMapVector(
+          makeFlatVector<StringView>({"r"_sv, "g"_sv, "b"_sv}),
+          makeFlatVector<int32_t>({11, 22, 33})));
+  test::assertEqualVectors(expected, result);
+}
+
+TEST_F(MapFromEntriesTest, dictionaryEncodedElementsInFlat) {
+  exec::registerVectorFunction(
+      "testing_dictionary_array_elements",
+      test::TestingDictionaryArrayElementsFunction::signatures(),
+      std::make_unique<test::TestingDictionaryArrayElementsFunction>());
+
+  auto rowType = ROW({INTEGER(), VARCHAR()});
+  std::vector<std::vector<variant>> data = {
+      {variant::row({1, "red"}),
+       variant::row({2, "blue"}),
+       variant::row({3, "green"})}};
+  auto input = makeArrayOfRowVector(rowType, data);
+  auto expected = makeMapVector<int32_t, StringView>(
+      {{{1, "red"_sv}, {2, "blue"_sv}, {3, "green"_sv}}});
+  verifyMapFromEntries(
+      {input}, expected, "testing_dictionary_array_elements(C0)");
+}
+
+TEST_F(MapFromEntriesTest, outputSizeIsBoundBySelectedRows) {
+  // This test makes sure that map_from_entries output vector size is
+  // `rows.end()` instead of `rows.size()`.
+
+  auto rowType = ROW({INTEGER(), INTEGER()});
+  auto function =
+      exec::getVectorFunction("map_from_entries", {ARRAY(rowType)}, {});
+
+  std::vector<std::vector<variant>> data = {
+      {variant::row({1, 11}), variant::row({2, 22}), variant::row({3, 33})},
+      {variant::row({4, 44}), variant::row({5, 55})},
+      {variant::row({6, 66})}};
+  auto array = makeArrayOfRowVector(rowType, data);
+
+  auto rowVector = makeRowVector({array});
+
+  // Only the first 2 rows selected.
+  SelectivityVector rows(2);
+  // This is larger than input array size but rows beyond the input vector size
+  // are not selected.
+  rows.resize(1000, false);
+
+  ASSERT_EQ(rows.size(), 1000);
+  ASSERT_EQ(rows.end(), 2);
+  ASSERT_EQ(array->size(), 3);
+
+  auto typedExpr =
+      makeTypedExpr("map_from_entries(c0)", asRowType(rowVector->type()));
+  std::vector<VectorPtr> results(1);
+
+  exec::ExprSet exprSet({typedExpr}, &execCtx_);
+  exec::EvalCtx evalCtx(&execCtx_, &exprSet, rowVector.get());
+  exprSet.eval(rows, evalCtx, results);
+
+  ASSERT_EQ(results[0]->size(), 2);
+}
diff --git a/velox/vector/tests/utils/VectorTestBase.h b/velox/vector/tests/utils/VectorTestBase.h index 12232e8bc24b..4f2bc4de4a0e 100644 --- a/velox/vector/tests/utils/VectorTestBase.h +++ b/velox/vector/tests/utils/VectorTestBase.h @@ -244,9 +244,7 @@ class VectorTestBase { // Create an ArrayVector from nested std::vectors of variants. // Example: - // auto arrayVector = makeArrayOfRowVector( - // ROW({INTEGER(), VARCHAR()}), - // { + // auto arrayVector = makeArrayOfRowVector({ // {variant::row({1, "red"}), variant::row({1, "blue"})}, // {}, // {variant::row({3, "green"})},