From e15dbf06f0b78102914762ac797caef62f1c224a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 22 Feb 2021 09:31:35 -0700 Subject: [PATCH 01/22] Rename aggregation::Kind::COLLECT to aggregation::Kind::COLLECT_LIST. This paves the way for the upcoming aggregation::Kind::COLLECT_SET. --- cpp/include/cudf/aggregation.hpp | 6 +++--- .../cudf/detail/aggregation/aggregation.hpp | 12 +++++------ cpp/src/aggregation/aggregation.cpp | 2 +- cpp/src/groupby/sort/groupby.cu | 4 ++-- cpp/src/rolling/rolling_detail.cuh | 20 +++++++++---------- cpp/src/rolling/rolling_detail.hpp | 12 +++++------ cpp/tests/groupby/group_collect_test.cpp | 2 +- 7 files changed, 29 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index a81b6ebc8a1..1311fa3f27b 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -74,7 +74,7 @@ class aggregation { NUNIQUE, ///< count number of unique elements NTH_ELEMENT, ///< get the nth element ROW_NUMBER, ///< get row-number of current index (relative to rolling window) - COLLECT, ///< collect values into a list + COLLECT_LIST, ///< collect values into a list LEAD, ///< window function, accesses row at specified offset following current row LAG, ///< window function, accesses row at specified offset preceding current row PTX, ///< PTX UDF based reduction @@ -205,9 +205,9 @@ std::unique_ptr make_nth_element_aggregation( std::unique_ptr make_row_number_aggregation(); /** - * @brief Factory to create a COLLECT aggregation + * @brief Factory to create a COLLECT_LIST aggregation * - * `COLLECT` returns a list column of all included elements in the group/series. + * `COLLECT_LIST` returns a list column of all included elements in the group/series. * * If `null_handling` is set to `EXCLUDE`, null elements are dropped from each * of the list rows. 
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 1cafad25c9c..df5a0040ce5 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -320,11 +320,11 @@ struct udf_aggregation final : derived_aggregation { }; /** - * @brief Derived aggregation class for specifying COLLECT aggregation + * @brief Derived aggregation class for specifying COLLECT_LIST aggregation */ struct collect_list_aggregation final : derived_aggregation { explicit collect_list_aggregation(null_policy null_handling = null_policy::INCLUDE) - : derived_aggregation{COLLECT}, _null_handling{null_handling} + : derived_aggregation{COLLECT_LIST}, _null_handling{null_handling} { } null_policy _null_handling; ///< include or exclude nulls @@ -514,9 +514,9 @@ struct target_type_impl { using type = cudf::size_type; }; -// Always use list for COLLECT +// Always use list for COLLECT_LIST template -struct target_type_impl { +struct target_type_impl { using type = cudf::list_view; }; @@ -617,8 +617,8 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::ROW_NUMBER: return f.template operator()(std::forward(args)...); - case aggregation::COLLECT: - return f.template operator()(std::forward(args)...); + case aggregation::COLLECT_LIST: + return f.template operator()(std::forward(args)...); case aggregation::LEAD: return f.template operator()(std::forward(args)...); case aggregation::LAG: diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 04dc8776d20..e5487b2d2f8 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -125,7 +125,7 @@ std::unique_ptr make_row_number_aggregation() { return std::make_unique(aggregation::ROW_NUMBER); } -/// Factory to create a COLLECT aggregation +/// Factory to create 
a COLLECT_LIST aggregation std::unique_ptr make_collect_aggregation(null_policy null_handling) { return std::make_unique(null_handling); diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 5c54dd3cb4c..04ccb1244a7 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -401,12 +401,12 @@ void store_result_functor::operator()(aggregation cons } template <> -void store_result_functor::operator()(aggregation const& agg) +void store_result_functor::operator()(aggregation const& agg) { auto null_handling = static_cast(agg)._null_handling; CUDF_EXPECTS(null_handling == null_policy::INCLUDE, - "null exclusion is not supported on groupby COLLECT aggregation."); + "null exclusion is not supported on groupby COLLECT_LIST aggregation."); if (cache.has_result(col_idx, agg)) return; diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index dcc48aafb39..c476601aa64 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -315,7 +315,7 @@ template ::value and !(op == aggregation::COUNT_VALID || op == aggregation::COUNT_ALL || op == aggregation::ROW_NUMBER || op == aggregation::LEAD || - op == aggregation::LAG || op == aggregation::COLLECT)>* = nullptr> + op == aggregation::LAG || op == aggregation::COLLECT_LIST)>* = nullptr> bool __device__ process_rolling_window(column_device_view input, column_device_view ignored_default_outputs, mutable_column_device_view output, @@ -814,7 +814,7 @@ struct rolling_window_launcher { typename PrecedingWindowIterator, typename FollowingWindowIterator> std::enable_if_t> operator()(column_view const& input, column_view const& default_outputs, @@ -897,11 +897,11 @@ struct rolling_window_launcher { } /** - * @brief Creates the offsets child of the result of the `COLLECT` window aggregation + * @brief Creates the offsets child of the result of the `COLLECT_LIST` window aggregation * * Given the input column, the 
preceding/following window bounds, and `min_periods`, * the sizes of each list row may be computed. These values can then be used to - * calculate the offsets for the result of `COLLECT`. + * calculate the offsets for the result of `COLLECT_LIST`. * * Note: If `min_periods` exceeds the number of observations for a window, the size * is set to `0` (since the result is `null`). @@ -945,7 +945,7 @@ struct rolling_window_launcher { } /** - * @brief Generate mapping of each row in the COLLECT result's child column + * @brief Generate mapping of each row in the COLLECT_LIST result's child column * to the index of the row it belongs to. * * If @@ -1030,7 +1030,7 @@ struct rolling_window_launcher { /** * @brief Create gather map to generate the child column of the result of - * the `COLLECT` window aggregation. + * the `COLLECT_LIST` window aggregation. */ template std::unique_ptr create_collect_gather_map(column_view const& child_offsets, @@ -1064,7 +1064,7 @@ struct rolling_window_launcher { } /** - * @brief Count null entries in result of COLLECT. + * @brief Count null entries in result of COLLECT_LIST. */ size_type count_child_nulls(column_view const& input, std::unique_ptr const& gather_map, @@ -1139,7 +1139,7 @@ struct rolling_window_launcher { } template - std::enable_if_t<(op == aggregation::COLLECT), std::unique_ptr> operator()( + std::enable_if_t<(op == aggregation::COLLECT_LIST), std::unique_ptr> operator()( column_view const& input, column_view const& default_outputs, PrecedingIter preceding_begin_raw, @@ -1150,7 +1150,7 @@ struct rolling_window_launcher { rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(default_outputs.is_empty(), - "COLLECT window function does not support default values."); + "COLLECT_LIST window function does not support default values."); if (input.is_empty()) return empty_like(input); @@ -1370,7 +1370,7 @@ std::unique_ptr rolling_window(column_view const& input, auto input_col = cudf::is_dictionary(input.type()) ? 
dictionary_column_view(input).get_indices_annotated() : input; - auto output = cudf::type_dispatcher(input_col.type(), + auto output = cudf::type_dispatcher(input_col.type(), dispatch_rolling{}, input_col, default_outputs, diff --git a/cpp/src/rolling/rolling_detail.hpp b/cpp/src/rolling/rolling_detail.hpp index d7fa92f1978..18bd0ea2217 100644 --- a/cpp/src/rolling/rolling_detail.hpp +++ b/cpp/src/rolling/rolling_detail.hpp @@ -41,7 +41,7 @@ static constexpr bool is_rolling_supported() (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or (op == aggregation::MEAN) or (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or - (op == aggregation::LAG) or (op == aggregation::COLLECT); + (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); constexpr bool is_valid_numeric_agg = (cudf::is_numeric() or cudf::is_duration() or @@ -54,23 +54,23 @@ static constexpr bool is_rolling_supported() return (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or - (op == aggregation::LAG) or (op == aggregation::COLLECT); + (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); } else if (cudf::is_fixed_point()) { return (op == aggregation::SUM) or (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or (op == aggregation::ROW_NUMBER) or (op == aggregation::LEAD) or - (op == aggregation::LAG) or (op == aggregation::COLLECT); + (op == aggregation::LAG) or (op == aggregation::COLLECT_LIST); } else if (std::is_same()) { return (op == aggregation::MIN) or (op == aggregation::MAX) or (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT); + (op == aggregation::ROW_NUMBER) or (op == 
aggregation::COLLECT_LIST); } else if (std::is_same()) { return (op == aggregation::COUNT_VALID) or (op == aggregation::COUNT_ALL) or - (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT); + (op == aggregation::ROW_NUMBER) or (op == aggregation::COLLECT_LIST); } else if (std::is_same()) { // TODO: Add support for COUNT_VALID, COUNT_ALL, ROW_NUMBER. - return op == aggregation::COLLECT; + return op == aggregation::COLLECT_LIST; } else { return false; } diff --git a/cpp/tests/groupby/group_collect_test.cpp b/cpp/tests/groupby/group_collect_test.cpp index 9edd0a6932a..6f37337be41 100644 --- a/cpp/tests/groupby/group_collect_test.cpp +++ b/cpp/tests/groupby/group_collect_test.cpp @@ -124,7 +124,7 @@ TYPED_TEST(groupby_collect_test, CollectFailsWithNullExclusion) agg_requests[0].aggregations.push_back(cudf::make_collect_aggregation(null_policy::EXCLUDE)); CUDF_EXPECT_THROW_MESSAGE(gby.aggregate(agg_requests), - "null exclusion is not supported on groupby COLLECT aggregation."); + "null exclusion is not supported on groupby COLLECT_LIST aggregation."); } } // namespace test From 73a0fa3abb50a7aae315ac05d72fd4e9ca47a1fa Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 22 Feb 2021 09:47:25 -0700 Subject: [PATCH 02/22] Rename functions `*_collect_*` into `*_collect_list_*` functions --- cpp/include/cudf/aggregation.hpp | 2 +- cpp/src/aggregation/aggregation.cpp | 2 +- cpp/tests/groupby/group_collect_test.cpp | 25 ++-- cpp/tests/rolling/collect_list_test.cpp | 146 +++++++++++------------ 4 files changed, 88 insertions(+), 87 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 1311fa3f27b..4d65db856e7 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -214,7 +214,7 @@ std::unique_ptr make_row_number_aggregation(); * * @param null_handling Indicates whether to include/exclude nulls in list elements. 
*/ -std::unique_ptr make_collect_aggregation( +std::unique_ptr make_collect_list_aggregation( null_policy null_handling = null_policy::INCLUDE); /// Factory to create a LAG aggregation diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index e5487b2d2f8..fbb41b6a6f2 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -126,7 +126,7 @@ std::unique_ptr make_row_number_aggregation() return std::make_unique(aggregation::ROW_NUMBER); } /// Factory to create a COLLECT_LIST aggregation -std::unique_ptr make_collect_aggregation(null_policy null_handling) +std::unique_ptr make_collect_list_aggregation(null_policy null_handling) { return std::make_unique(null_handling); } diff --git a/cpp/tests/groupby/group_collect_test.cpp b/cpp/tests/groupby/group_collect_test.cpp index 6f37337be41..def20b909c1 100644 --- a/cpp/tests/groupby/group_collect_test.cpp +++ b/cpp/tests/groupby/group_collect_test.cpp @@ -26,15 +26,15 @@ namespace cudf { namespace test { template -struct groupby_collect_test : public cudf::test::BaseFixture { +struct groupby_collect_list_test : public cudf::test::BaseFixture { }; using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(groupby_collect_test, FixedWidthTypesNotBool); +TYPED_TEST_CASE(groupby_collect_list_test, FixedWidthTypesNotBool); -TYPED_TEST(groupby_collect_test, CollectWithoutNulls) +TYPED_TEST(groupby_collect_list_test, CollectWithoutNulls) { using K = int32_t; using V = TypeParam; @@ -45,11 +45,11 @@ TYPED_TEST(groupby_collect_test, CollectWithoutNulls) fixed_width_column_wrapper expect_keys{1, 2}; lists_column_wrapper expect_vals{{1, 2, 3}, {4, 5, 6}}; - auto agg = cudf::make_collect_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } -TYPED_TEST(groupby_collect_test, CollectWithNulls) +TYPED_TEST(groupby_collect_list_test, CollectWithNulls) { using K = int32_t; 
using V = TypeParam; @@ -64,11 +64,11 @@ TYPED_TEST(groupby_collect_test, CollectWithNulls) lists_column_wrapper expect_vals{ {{1, 2}, validity.begin()}, {{3, 4}, validity.begin()}, {{5, 6}, validity.begin()}}; - auto agg = cudf::make_collect_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } -TYPED_TEST(groupby_collect_test, CollectLists) +TYPED_TEST(groupby_collect_list_test, CollectLists) { using K = int32_t; using V = TypeParam; @@ -83,11 +83,11 @@ TYPED_TEST(groupby_collect_test, CollectLists) lists_column_wrapper expect_vals{ {{1, 2}, {3, 4}}, {{5, 6, 7}, LCW{}}, {{9, 10}, {11}}}; - auto agg = cudf::make_collect_aggregation(); + auto agg = cudf::make_collect_list_aggregation(); test_single_agg(keys, values, expect_keys, expect_vals, std::move(agg)); } -TYPED_TEST(groupby_collect_test, dictionary) +TYPED_TEST(groupby_collect_list_test, dictionary) { using K = int32_t; using V = TypeParam; @@ -105,10 +105,11 @@ TYPED_TEST(groupby_collect_test, dictionary) 0, rmm::device_buffer{0}); - test_single_agg(keys, vals, expect_keys, expect_vals->view(), cudf::make_collect_aggregation()); + test_single_agg( + keys, vals, expect_keys, expect_vals->view(), cudf::make_collect_list_aggregation()); } -TYPED_TEST(groupby_collect_test, CollectFailsWithNullExclusion) +TYPED_TEST(groupby_collect_list_test, CollectFailsWithNullExclusion) { using K = int32_t; using V = TypeParam; @@ -121,7 +122,7 @@ TYPED_TEST(groupby_collect_test, CollectFailsWithNullExclusion) std::vector agg_requests(1); agg_requests[0].values = values; - agg_requests[0].aggregations.push_back(cudf::make_collect_aggregation(null_policy::EXCLUDE)); + agg_requests[0].aggregations.push_back(cudf::make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_EXPECT_THROW_MESSAGE(gby.aggregate(agg_requests), "null exclusion is not supported on groupby COLLECT_LIST aggregation."); diff --git 
a/cpp/tests/rolling/collect_list_test.cpp b/cpp/tests/rolling/collect_list_test.cpp index 6a3a80601d0..de179223d68 100644 --- a/cpp/tests/rolling/collect_list_test.cpp +++ b/cpp/tests/rolling/collect_list_test.cpp @@ -64,7 +64,7 @@ TYPED_TEST(TypedCollectListTest, BasicRollingWindow) static_cast(foll_column).size()); auto const result_column_based_window = - rolling_window(input_column, prev_column, foll_column, 1, make_collect_aggregation()); + rolling_window(input_column, prev_column, foll_column, 1, make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ @@ -79,11 +79,11 @@ TYPED_TEST(TypedCollectListTest, BasicRollingWindow) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view()); auto const result_fixed_window = - rolling_window(input_column, 2, 1, 1, make_collect_aggregation()); + rolling_window(input_column, 2, 1, 1, make_collect_list_aggregation()); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_fixed_window->view()); auto const result_with_nulls_excluded = - rolling_window(input_column, 2, 1, 1, make_collect_aggregation(null_policy::EXCLUDE)); + rolling_window(input_column, 2, 1, 1, make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -104,7 +104,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputLists) static_cast(foll_column).size()); auto const result_column_based_window = - rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); + rolling_window(input_column, prev_column, foll_column, 0, make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ @@ -120,7 +120,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputLists) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_column_based_window->view()); auto const result_with_nulls_excluded = rolling_window( - 
input_column, prev_column, foll_column, 0, make_collect_aggregation(null_policy::EXCLUDE)); + input_column, prev_column, foll_column, 0, make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -138,7 +138,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputListsAtEnds) auto foll_column = fixed_width_column_wrapper{0, 1, 1, 1, 1, 0}; auto const result = - rolling_window(input_column, prev_column, foll_column, 0, make_collect_aggregation()); + rolling_window(input_column, prev_column, foll_column, 0, make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}}.release(); @@ -146,7 +146,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithEmptyOutputListsAtEnds) CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result->view()); auto const result_with_nulls_excluded = rolling_window( - input_column, prev_column, foll_column, 0, make_collect_aggregation(null_policy::EXCLUDE)); + input_column, prev_column, foll_column, 0, make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -164,11 +164,11 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) auto const input_column = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5}; auto const num_elements = static_cast(input_column).size(); - auto preceding = 2; - auto following = 1; - auto min_periods = 3; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}}, @@ -183,7 
+183,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); @@ -191,8 +191,8 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) following = 2; min_periods = 4; - auto result_2 = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto result_2 = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_2 = lists_column_wrapper{ {{}, {0, 1, 2, 3}, {1, 2, 3, 4}, {2, 3, 4, 5}, {}, {}}, cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { @@ -206,7 +206,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowHonoursMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(), result_2_with_nulls_excluded->view()); @@ -228,11 +228,11 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) { // One result row at each end should be null. 
- auto preceding = 2; - auto following = 1; - auto min_periods = 3; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_child_values = std::vector{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5}; auto expected_result_child_validity = std::vector{1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1}; @@ -265,7 +265,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); auto expected_result_child_values = std::vector{0, 2, 2, 3, 2, 3, 3, 5}; auto expected_result_child = fixed_width_column_wrapper( @@ -287,11 +287,11 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) { // First result row, and the last two result rows should be null. 
- auto preceding = 2; - auto following = 2; - auto min_periods = 4; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 2; + auto min_periods = 4; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_child_values = std::vector{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5}; auto expected_result_child_validity = std::vector{1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1}; @@ -325,7 +325,7 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); auto expected_result_child_values = std::vector{0, 2, 3, 2, 3, 2, 3, 5}; auto expected_result_child = fixed_width_column_wrapper( @@ -358,11 +358,11 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings) auto const input_column = strings_column_wrapper{"0", "1", "2", "3", "4", "5"}; auto const num_elements = static_cast(input_column).size(); - auto preceding = 2; - auto following = 1; - auto min_periods = 3; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {{}, {"0", "1", "2"}, {"1", "2", "3"}, {"2", "3", "4"}, {"3", "4", "5"}, {}}, @@ -377,7 +377,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); @@ -385,8 +385,8 @@ 
TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings) following = 2; min_periods = 4; - auto result_2 = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto result_2 = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_2 = lists_column_wrapper{ {{}, {"0", "1", "2", "3"}, {"1", "2", "3", "4"}, {"2", "3", "4", "5"}, {}, {}}, cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { @@ -400,7 +400,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result_2->view(), result_2_with_nulls_excluded->view()); @@ -421,11 +421,11 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal) { // One result row at each end should be null. - auto preceding = 2; - auto following = 1; - auto min_periods = 3; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_child_values = std::vector{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5}; auto expected_result_child = @@ -451,7 +451,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); @@ -459,11 +459,11 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal) { // First result row, and the last two result rows should be null. 
- auto preceding = 2; - auto following = 2; - auto min_periods = 4; - auto const result = - rolling_window(input_column, preceding, following, min_periods, make_collect_aggregation()); + auto preceding = 2; + auto following = 2; + auto min_periods = 4; + auto const result = rolling_window( + input_column, preceding, following, min_periods, make_collect_list_aggregation()); auto expected_result_child_values = std::vector{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5}; auto expected_result_child = @@ -489,7 +489,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); @@ -515,7 +515,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {10, 11}, @@ -536,7 +536,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -563,7 +563,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowWithNulls) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto expected_child = fixed_width_column_wrapper{ {10, 11, 10, 11, 12, 11, 12, 13, 12, 13, 14, 13, 14, 20, 21, 20, 21, 22, 21, 22, 23, 22, 23}, @@ -587,7 +587,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowWithNulls) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); auto expected_child = fixed_width_column_wrapper{ 
10, 10, 12, 12, 13, 12, 13, 14, 13, 14, 20, 20, 22, 22, 23, 22, 23}; @@ -627,7 +627,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {10, 11, 12, 13}, @@ -650,7 +650,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -678,7 +678,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNulls) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto null_at_0 = iterator_with_null_at(0); auto null_at_1 = iterator_with_null_at(1); @@ -705,7 +705,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNulls) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); // After null exclusion, `11`, `21`, and `null` should not appear. 
auto const expected_result_with_nulls_excluded = lists_column_wrapper{ @@ -744,7 +744,7 @@ TEST_F(CollectListTest, BasicGroupedTimeRangeRollingWindowOnStrings) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {"10", "11", "12", "13"}, @@ -767,7 +767,7 @@ TEST_F(CollectListTest, BasicGroupedTimeRangeRollingWindowOnStrings) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -793,7 +793,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNulls) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto null_at_0 = iterator_with_null_at(0); auto null_at_1 = iterator_with_null_at(1); @@ -821,7 +821,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNulls) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); // After null exclusion, `11`, `21`, and `null` should not appear. 
auto const expected_result_with_nulls_excluded = lists_column_wrapper{ @@ -868,7 +868,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindowOnStructs) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto expected_numeric_column = fixed_width_column_wrapper{ 10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, 14, 10, 11, 12, @@ -898,7 +898,7 @@ TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindowOnStructs) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -928,7 +928,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithMinPeriods) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {{10, 11, 12, 13}, @@ -954,7 +954,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -984,7 +984,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNullsAndMinPer preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto null_at_1 = iterator_with_null_at(1); @@ -1013,7 +1013,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNullsAndMinPer preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); // After null exclusion, `11`, `21`, and `null` should not appear. 
auto const expected_result_with_nulls_excluded = lists_column_wrapper{ @@ -1056,7 +1056,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithMinPeriods) preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto const expected_result = lists_column_wrapper{ {{"10", "11", "12", "13"}, @@ -1082,7 +1082,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithMinPeriods) preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } @@ -1110,7 +1110,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNullsAndMinPer preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto null_at_1 = iterator_with_null_at(1); @@ -1139,7 +1139,7 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNullsAndMinPer preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); // After null exclusion, `11`, `21`, and `null` should not appear. 
auto const expected_result_with_nulls_excluded = lists_column_wrapper{ @@ -1190,7 +1190,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowOnStructsWithMinPe preceding, following, min_periods, - make_collect_aggregation()); + make_collect_list_aggregation()); auto expected_numeric_column = fixed_width_column_wrapper{ 10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, 14, 10, 11, 12, 13, 14, 10, 11, 12, 13, 14}; @@ -1226,7 +1226,7 @@ TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowOnStructsWithMinPe preceding, following, min_periods, - make_collect_aggregation(null_policy::EXCLUDE)); + make_collect_list_aggregation(null_policy::EXCLUDE)); CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_result->view(), result_with_nulls_excluded->view()); } From 1ec959fd948e21a5a5c1979a01ab3f01617a0ad4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 12 Mar 2021 08:26:19 -0700 Subject: [PATCH 03/22] Add collect_set type and factory function --- cpp/include/cudf/aggregation.hpp | 15 +++++++++++++ .../cudf/detail/aggregation/aggregation.hpp | 21 +++++++++++++++++++ cpp/src/aggregation/aggregation.cpp | 5 +++++ 3 files changed, 41 insertions(+) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 4d65db856e7..6fda8bcbd50 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -75,6 +75,7 @@ class aggregation { NTH_ELEMENT, ///< get the nth element ROW_NUMBER, ///< get row-number of current index (relative to rolling window) COLLECT_LIST, ///< collect values into a list + COLLECT_SET, ///< collect values into a list without duplicate entries LEAD, ///< window function, accesses row at specified offset following current row LAG, ///< window function, accesses row at specified offset preceding current row PTX, ///< PTX UDF based reduction @@ -217,6 +218,20 @@ std::unique_ptr make_row_number_aggregation(); std::unique_ptr make_collect_list_aggregation( null_policy null_handling = 
null_policy::INCLUDE); +/** + * @brief Factory to create a COLLECT_SET aggregation + * + * `COLLECT_SET` returns a lists column of all included elements in the group/series. Within each + * list, the duplicated entries are dropped out such that each entry appears only once. + * + * If `null_handling` is set to `EXCLUDE`, null elements are dropped from each + * of the list rows. + * + * @param null_handling Indicates whether to include/exclude nulls in list elements. + */ +std::unique_ptr make_collect_set_aggregation( + null_policy null_handling = null_policy::INCLUDE); + /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset); diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index df5a0040ce5..69fc25602a7 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -340,6 +340,27 @@ struct collect_list_aggregation final : derived_aggregation size_t hash_impl() const { return std::hash{}(static_cast(_null_handling)); } }; +/** + * @brief Derived aggregation class for specifying COLLECT_SET aggregation + */ +struct collect_set_aggregation final : derived_aggregation { + explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE) + : derived_aggregation{COLLECT_SET}, _null_handling{null_handling} + { + } + null_policy _null_handling; ///< include or exclude nulls + + protected: + friend class derived_aggregation; + + bool operator==(nunique_aggregation const& other) const + { + return _null_handling == other._null_handling; + } + + size_t hash_impl() const { return std::hash{}(static_cast(_null_handling)); } +}; + /** * @brief Sentinel value used for `ARGMAX` aggregation. 
* diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index fbb41b6a6f2..2b02b2824fe 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -130,6 +130,11 @@ std::unique_ptr make_collect_list_aggregation(null_policy null_hand { return std::make_unique(null_handling); } +/// Factory to create a COLLECT_SET aggregation +std::unique_ptr make_collect_set_aggregation(null_policy null_handling) +{ + return std::make_unique(null_handling); +} /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset) { From 401290ab1d40d9a9a60ac51b3aff9a96881f7625 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 12 Mar 2021 09:04:34 -0700 Subject: [PATCH 04/22] Update copyright year --- cpp/include/cudf/aggregation.hpp | 2 +- cpp/include/cudf/detail/aggregation/aggregation.hpp | 2 +- cpp/src/aggregation/aggregation.cpp | 2 +- cpp/src/groupby/groupby.cu | 2 +- cpp/tests/groupby/group_collect_test.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index 6fda8bcbd50..bd291346c8d 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 69fc25602a7..11fc59b3ddc 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 2b02b2824fe..31d9e53cd73 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 3166b2be4d4..6b0b8e69b33 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/groupby/group_collect_test.cpp b/cpp/tests/groupby/group_collect_test.cpp index def20b909c1..8a578ea0c0f 100644 --- a/cpp/tests/groupby/group_collect_test.cpp +++ b/cpp/tests/groupby/group_collect_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 8ceda4481ca28dc79a0af4408a796eb31bcf7541 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 16 Mar 2021 13:37:11 -0600 Subject: [PATCH 05/22] Initially implement tests for collect_set --- cpp/tests/groupby/collect_set_test.cpp | 123 ++++ cpp/tests/rolling/collect_set_test.cpp | 974 +++++++++++++++++++++++++ 2 files changed, 1097 insertions(+) create mode 100644 cpp/tests/groupby/collect_set_test.cpp create mode 100644 cpp/tests/rolling/collect_set_test.cpp diff --git a/cpp/tests/groupby/collect_set_test.cpp b/cpp/tests/groupby/collect_set_test.cpp new file mode 100644 index 00000000000..6e92b171d94 --- /dev/null +++ b/cpp/tests/groupby/collect_set_test.cpp @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include + +namespace cudf { +namespace test { + +#define COL_K cudf::test::fixed_width_column_wrapper +#define COL_V cudf::test::fixed_width_column_wrapper +#define LCL_V cudf::test::lists_column_wrapper +#define DCL_V cudf::test::dictionary_column_wrapper +#define VALIDITY std::initializer_list +#define COLLECT_SET cudf::make_collect_list_aggregation() + +template +struct CollectSetTest : public cudf::test::BaseFixture { +}; + +using FixedWidthTypesNotBool = cudf::test::Concat; +TYPED_TEST_CASE(CollectSetTest, FixedWidthTypesNotBool); + +TYPED_TEST(CollectSetTest, ExceptionCases) +{ + std::vector agg_requests(1); + agg_requests[0].values = COL_V{{1, 2, 3, 4, 5, 6}, {true, false, true, false, true, false}}; + agg_requests[0].aggregations.push_back(cudf::make_collect_list_aggregation(null_policy::EXCLUDE)); + + // groupby cannot exclude nulls + groupby::groupby gby{table_view{{COL_K{1, 1, 2, 2, 3, 3}}}}; + EXPECT_THROW(gby.aggregate(agg_requests), cudf::logic_error); +} + +// TODO: Fix those cases to handle empty and simple input +TYPED_TEST(CollectSetTest, DISABLED_TrivialCases) +{ + // Empty input + test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET); + + // Single key input + { + COL_K keys{1}; + COL_K keys_expected{1}; + COL_V vals{100}; + COL_V vals_expected{{100}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } + + // Simple input + { + COL_K keys{1, 2}; + COL_K keys_expected{1, 2}; + COL_V vals{100, 200}; + LCL_V vals_expected{{100}, {200}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } +} + +TYPED_TEST(CollectSetTest, TypicalCases) +{ + // Hard-coded case + { + COL_K keys{1, 1, 2, 2, 3, 3}; + COL_K keys_expected{1, 2, 3}; + LCL_V vals{{1, 2}, {3, 4}, {5, 6, 7}, {}, {9, 10}, {11}}; + LCL_V vals_expected{{{1, 2}, {3, 4}}, {{5, 6, 7}, {}}, {{9, 10}, {11}}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } 
+ + // Procedurally generated test + { + COL_K keys{1, 1, 2, 2, 3, 3}; + COL_K keys_expected{1, 2, 3}; + LCL_V vals{{1, 2}, {3, 4}, {5, 6, 7}, {}, {9, 10}, {11}}; + LCL_V vals_expected{{{1, 2}, {3, 4}}, {{5, 6, 7}, {}}, {{9, 10}, {11}}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } +} + +TYPED_TEST(CollectSetTest, CollectWithNulls) +{ + // Hard-coded case + { + COL_K keys{1, 1, 2, 2, 3, 3}; + COL_K keys_expected{1, 2, 3}; + COL_V vals{{1, 2, 3, 4, 5, 6}, {true, false, true, false, true, false}}; + LCL_V vals_expected{{{1, 2}, VALIDITY{true, false}}, + {{3, 4}, VALIDITY{true, false}}, + {{5, 6}, VALIDITY{true, false}}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } + + // Procedurally generated test + { + // + } +} + +} // namespace test +} // namespace cudf + +CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/rolling/collect_set_test.cpp b/cpp/tests/rolling/collect_set_test.cpp new file mode 100644 index 00000000000..511795ebe43 --- /dev/null +++ b/cpp/tests/rolling/collect_set_test.cpp @@ -0,0 +1,974 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +//#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf { +namespace test { + +#define COL_V cudf::test::fixed_width_column_wrapper +#define COL_S cudf::test::fixed_width_column_wrapper +#define LCL_V cudf::test::lists_column_wrapper +#define COLLECT_SET cudf::make_collect_list_aggregation() +#define COLLECT_SET_NULLS_EXCLUDED cudf::make_collect_list_aggregation(cudf::null_policy::EXCLUDE) + +void test_equivalent(std::unique_ptr const& lhs, + std::unique_ptr const& rhs) +{ + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(lhs->view(), rhs->view()); +} + +void test_equivalent(cudf::column_view const& lhs, cudf::column_view const& rhs) +{ + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(lhs, rhs); +} + +struct CollectSetTest : public cudf::test::BaseFixture { +}; + +template +struct TypedCollectListTest : public CollectSetTest { +}; + +using TypesForTest = cudf::test::Concat; + +TYPED_TEST_CASE(TypedCollectListTest, TypesForTest); + +TYPED_TEST(TypedCollectListTest, BasicRollingWindow) +{ + auto const input = COL_V{10, 11, 12, 13, 14}; + auto const expected_result = + LCL_V{ + {10, 11}, + {10, 11, 12}, + {11, 12, 13}, + {12, 13, 14}, + {13, 14}, + } + .release(); + + // Rolling window with variable window sizes + test_equivalent( + expected_result, + rolling_window(input, COL_S{1, 2, 2, 2, 2}, COL_S{1, 1, 1, 1, 0}, 1, COLLECT_SET)); + + // Rolling window with fixed window size + test_equivalent(expected_result, rolling_window(input, 2, 1, 1, COLLECT_SET)); + + // Rolling window with nulls excluded + test_equivalent(expected_result, rolling_window(input, 2, 1, 1, COLLECT_SET_NULLS_EXCLUDED)); +} + +TYPED_TEST(TypedCollectListTest, EmptyOutputLists) +{ + auto const input = COL_V{10, 11, 12, 13, 14, 15}; + auto const expected_result = + LCL_V{ + {10, 11}, + {10, 11, 12}, + {11, 12, 13}, + {}, + {13, 14, 15}, + {14, 15}, + } + .release(); + auto const prev = COL_S{1, 
2, 2, 0, 2, 2}; + auto const next = COL_S{1, 1, 1, 0, 1, 0}; + + // Rolling window with variable window sizes + test_equivalent(expected_result, rolling_window(input, prev, next, 0, COLLECT_SET)); + + // Rolling window with nulls excluded + test_equivalent(expected_result, + rolling_window(input, prev, next, 0, COLLECT_SET_NULLS_EXCLUDED)); +} + +TYPED_TEST(TypedCollectListTest, EmptyOutputListsAtEnds) +{ + auto const input = COL_V{0, 1, 2, 3, 4, 5}; + auto const expected_result = LCL_V{{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}}.release(); + auto const prev = COL_S{0, 2, 2, 2, 2, 0}; + auto const next = COL_S{0, 1, 1, 1, 1, 0}; + + // Rolling window with variable window sizes + test_equivalent(expected_result, rolling_window(input, prev, next, 0, COLLECT_SET)); + + // Rolling window with nulls excluded + test_equivalent(expected_result, + rolling_window(input, prev, next, 0, COLLECT_SET_NULLS_EXCLUDED)); +} + +#if 0 +TEST_F(CollectSetTest, RollingWindowHonoursMinPeriodsOnStrings) +{ + // Test that when the number of observations is fewer than min_periods, + // the result is null. 
+ + using namespace cudf; + using namespace cudf::test; + + auto const input = strings_column_wrapper{"0", "1", "2", "3", "4", "5"}; + auto const num_elements = static_cast(input).size(); + + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = rolling_window(input, preceding, following, min_periods, COLLECT_SET); + + auto const expected_result = lists_column_wrapper{ + {{}, {"0", "1", "2"}, {"1", "2", "3"}, {"2", "3", "4"}, {"3", "4", "5"}, {}}, + cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + return i != 0 && i != (num_elements - 1); + })}.release(); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + rolling_window(input, preceding, following, min_periods, COLLECT_SET_NULLS_EXCLUDED); + + test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); + + preceding = 2; + following = 2; + min_periods = 4; + + auto result_2 = rolling_window(input, preceding, following, min_periods, COLLECT_SET); + auto expected_result_2 = lists_column_wrapper{ + {{}, {"0", "1", "2", "3"}, {"1", "2", "3", "4"}, {"2", "3", "4", "5"}, {}, {}}, + cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { + return i != 0 && i < 4; + })}.release(); + + test_equivalent(expected_result_2->view(), result_2->view()); + + auto result_2_with_nulls_excluded = + rolling_window(input, preceding, following, min_periods, COLLECT_SET_NULLS_EXCLUDED); + + test_equivalent(expected_result_2->view(), result_2_with_nulls_excluded->view()); +} + +TEST_F(CollectSetTest, RollingWindowHonoursMinPeriodsWithDecimal) +{ + // Test that when the number of observations is fewer than min_periods, + // the result is null. 
+ + using namespace cudf; + using namespace cudf::test; + + auto const input_iter = + cudf::detail::make_counting_transform_iterator(0, thrust::identity{}); + auto const input = + fixed_point_column_wrapper{input_iter, input_iter + 6, numeric::scale_type{0}}; + + { + // One result row at each end should be null. + auto preceding = 2; + auto following = 1; + auto min_periods = 3; + auto const result = rolling_window(input, preceding, following, min_periods, COLLECT_SET); + + auto expected_result_child_values = std::vector{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5}; + auto expected_result_child = + fixed_point_column_wrapper{expected_result_child_values.begin(), + expected_result_child_values.end(), + numeric::scale_type{0}}; + auto expected_offsets = COL_S{0, 0, 3, 6, 9, 12, 12}.release(); + auto expected_num_rows = expected_offsets->size() - 1; + auto null_mask_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, [expected_num_rows](auto i) { return i != 0 && i != (expected_num_rows - 1); }); + + auto expected_result = make_lists_column( + expected_num_rows, + std::move(expected_offsets), + expected_result_child.release(), + 2, + cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows)); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + rolling_window(input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); + } + + { + // First result row, and the last two result rows should be null. 
+ auto preceding = 2; + auto following = 2; + auto min_periods = 4; + auto const result = rolling_window(input, preceding, following, min_periods, COLLECT_SET); + + auto expected_result_child_values = std::vector{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5}; + auto expected_result_child = + fixed_point_column_wrapper{expected_result_child_values.begin(), + expected_result_child_values.end(), + numeric::scale_type{0}}; + auto expected_offsets = COL_S{0, 0, 4, 8, 12, 12, 12}.release(); + auto expected_num_rows = expected_offsets->size() - 1; + auto null_mask_iter = cudf::detail::make_counting_transform_iterator( + size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); + + auto expected_result = make_lists_column( + expected_num_rows, + std::move(expected_offsets), + expected_result_child.release(), + 3, + cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows)); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + rolling_window(input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); + } +} + +TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow) +{ + using namespace cudf; + using namespace cudf::test; + + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input = COL_V{10, 11, 12, 13, 14, 20, 21, 22, 23}; + + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = grouped_rolling_window(table_view{std::vector{group_column}}, + input, + preceding, + following, + min_periods, + COLLECT_SET); + + auto const expected_result = LCL_V{ + {10, 11}, + {10, 11, 12}, + {11, 12, 13}, + {12, 13, 14}, + {13, 14}, + {20, 21}, + {20, 21, 22}, + {21, 22, 23}, + {22, 23}}.release(); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + 
grouped_rolling_window(table_view{std::vector{group_column}}, + input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); +} + +TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowWithNulls) +{ + using namespace cudf; + using namespace cudf::test; + + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input = COL_V{{10, 11, 12, 13, 14, 20, 21, 22, 23}, {1, 0, 1, 1, 1, 1, 0, 1, 1}}; + + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + + { + // Nulls included. + auto const result = grouped_rolling_window(table_view{std::vector{group_column}}, + input, + preceding, + following, + min_periods, + COLLECT_SET); + + auto expected_child = COL_V{ + {10, 11, 10, 11, 12, 11, 12, 13, 12, 13, 14, 13, 14, 20, 21, 20, 21, 22, 21, 22, 23, 22, 23}, + {1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1}}; + + auto expected_offsets = fixed_width_column_wrapper{0, 2, 5, 8, 11, 13, 15, 18, 21, 23}; + + auto expected_result = make_lists_column(static_cast(group_column).size(), + expected_offsets.release(), + expected_child.release(), + 0, + {}); + + test_equivalent(expected_result->view(), result->view()); + } + + { + // Nulls excluded. 
+ auto const result = grouped_rolling_window(table_view{std::vector{group_column}}, + input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + auto expected_child = COL_V{10, 10, 12, 12, 13, 12, 13, 14, 13, 14, 20, 20, 22, 22, 23, 22, 23}; + + auto expected_offsets = fixed_width_column_wrapper{0, 1, 3, 5, 8, 10, 11, 13, 15, 17}; + + auto expected_result = make_lists_column(static_cast(group_column).size(), + expected_offsets.release(), + expected_child.release(), + 0, + {}); + + test_equivalent(expected_result->view(), result->view()); + } +} + +TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow) +{ + using namespace cudf; + using namespace cudf::test; + + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input = COL_V{10, 11, 12, 13, 14, 20, 21, 22, 23}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET); + + auto const expected_result = LCL_V{ + {10, 11, 12, 13}, + {10, 11, 12, 13}, + {10, 11, 12, 13, 14}, + {10, 11, 12, 13, 14}, + {10, 11, 12, 13, 14}, + {20}, + {21, 22}, + {21, 22, 23}, + {21, 22, 23}}.release(); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); +} + +TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNulls) +{ + using namespace cudf; + using namespace cudf::test; + + auto const time_column = 
fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input = COL_V{{10, 11, 12, 13, 14, 20, 21, 22, 23}, {1, 0, 1, 1, 1, 1, 0, 1, 1}}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET); + + auto null_at_0 = iterator_with_null_at(0); + auto null_at_1 = iterator_with_null_at(1); + + // In the results, `11` and `21` should be nulls. + auto const expected_result = LCL_V{ + {{10, 11, 12, 13}, null_at_1}, + {{10, 11, 12, 13}, null_at_1}, + {{10, 11, 12, 13, 14}, null_at_1}, + {{10, 11, 12, 13, 14}, null_at_1}, + {{10, 11, 12, 13, 14}, null_at_1}, + {{20}, null_at_1}, + {{21, 22}, null_at_0}, + {{21, 22, 23}, null_at_0}, + {{21, 22, 23}, null_at_0}}.release(); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + // After null exclusion, `11`, `21`, and `null` should not appear. 
+ auto const expected_result_with_nulls_excluded = LCL_V{ + {10, 12, 13}, + {10, 12, 13}, + {10, 12, 13, 14}, + {10, 12, 13, 14}, + {10, 12, 13, 14}, + {20}, + {22}, + {22, 23}, + {22, 23}}.release(); + + test_equivalent(expected_result_with_nulls_excluded->view(), result_with_nulls_excluded->view()); +} + +TEST_F(CollectSetTest, BasicGroupedTimeRangeRollingWindowOnStrings) +{ + using namespace cudf; + using namespace cudf::test; + + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input = strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET); + + auto const expected_result = lists_column_wrapper{ + {"10", "11", "12", "13"}, + {"10", "11", "12", "13"}, + {"10", "11", "12", "13", "14"}, + {"10", "11", "12", "13", "14"}, + {"10", "11", "12", "13", "14"}, + {"20"}, + {"21", "22"}, + {"21", "22", "23"}, + {"21", "22", "23"}}.release(); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); +} + +TEST_F(CollectSetTest, GroupedTimeRangeRollingWindowOnStringsWithNulls) +{ + using namespace cudf; + using namespace cudf::test; + + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input = 
strings_column_wrapper{{"10", "11", "12", "13", "14", "20", "21", "22", "23"}, + {1, 0, 1, 1, 1, 1, 0, 1, 1}}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET); + + auto null_at_0 = iterator_with_null_at(0); + auto null_at_1 = iterator_with_null_at(1); + + // In the results, `11` and `21` should be nulls. + auto const expected_result = lists_column_wrapper{ + {{"10", "11", "12", "13"}, null_at_1}, + {{"10", "11", "12", "13"}, null_at_1}, + {{"10", "11", "12", "13", "14"}, null_at_1}, + {{"10", "11", "12", "13", "14"}, null_at_1}, + {{"10", "11", "12", "13", "14"}, null_at_1}, + {"20"}, + {{"21", "22"}, null_at_0}, + {{"21", "22", "23"}, null_at_0}, + {{"21", "22", "23"}, + null_at_0}}.release(); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + // After null exclusion, `11`, `21`, and `null` should not appear. 
+ auto const expected_result_with_nulls_excluded = lists_column_wrapper{ + {"10", "12", "13"}, + {"10", "12", "13"}, + {"10", "12", "13", "14"}, + {"10", "12", "13", "14"}, + {"10", "12", "13", "14"}, + {"20"}, + {"22"}, + {"22", "23"}, + {"22", "23"}}.release(); + + test_equivalent(expected_result_with_nulls_excluded->view(), result_with_nulls_excluded->view()); +} + +TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindowOnStructs) +{ + using namespace cudf; + using namespace cudf::test; + + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto numeric_member_column = COL_V{10, 11, 12, 13, 14, 20, 21, 22, 23}; + auto string_member_column = + strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; + auto struct_members = std::vector>{}; + struct_members.emplace_back(numeric_member_column.release()); + struct_members.emplace_back(string_member_column.release()); + auto const struct_column = make_structs_column(9, std::move(struct_members), 0, {}); + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 1; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + struct_column->view(), + preceding, + following, + min_periods, + COLLECT_SET); + + auto expected_numeric_column = + COL_V{10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, 14, 10, 11, 12, + 13, 14, 10, 11, 12, 13, 14, 20, 21, 22, 21, 22, 23, 21, 22, 23}; + + auto expected_string_column = strings_column_wrapper{ + "10", "11", "12", "13", "10", "11", "12", "13", "10", "11", "12", "13", "14", "10", "11", "12", + "13", "14", "10", "11", "12", "13", "14", "20", "21", "22", "21", "22", "23", "21", "22", "23"}; + + auto expected_struct_members = std::vector>{}; + expected_struct_members.emplace_back(expected_numeric_column.release()); + 
expected_struct_members.emplace_back(expected_string_column.release()); + + auto expected_structs_column = make_structs_column(32, std::move(expected_struct_members), 0, {}); + auto expected_offsets_column = COL_S{0, 4, 8, 13, 18, 23, 24, 26, 29, 32}.release(); + auto expected_result = make_lists_column( + 9, std::move(expected_offsets_column), std::move(expected_structs_column), 0, {}); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + struct_column->view(), + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); +} + +TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithMinPeriods) +{ + // Test that min_periods is honoured. + // i.e. output row is null when min_periods exceeds number of observations. + using namespace cudf; + using namespace cudf::test; + + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input = COL_V{10, 11, 12, 13, 14, 20, 21, 22, 23}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 4; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET); + + auto const expected_result = LCL_V{ + {{10, 11, 12, 13}, + {10, 11, 12, 13}, + {10, 11, 12, 13, 14}, + {10, 11, 12, 13, 14}, + {10, 11, 12, 13, 14}, + {}, + {}, + {}, + {}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i < 5; + })}.release(); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + 
grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); +} + +TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNullsAndMinPeriods) +{ + // Test that min_periods is honoured. + // i.e. output row is null when min_periods exceeds number of observations. + using namespace cudf; + using namespace cudf::test; + + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input = COL_V{{10, 11, 12, 13, 14, 20, 21, 22, 23}, {1, 0, 1, 1, 1, 1, 0, 1, 1}}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 4; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET); + + auto null_at_1 = iterator_with_null_at(1); + + // In the results, `11` and `21` should be nulls. + auto const expected_result = LCL_V{ + {{{10, 11, 12, 13}, null_at_1}, + {{10, 11, 12, 13}, null_at_1}, + {{10, 11, 12, 13, 14}, null_at_1}, + {{10, 11, 12, 13, 14}, null_at_1}, + {{10, 11, 12, 13, 14}, null_at_1}, + {}, + {}, + {}, + {}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i < 5; + })}.release(); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + // After null exclusion, `11`, `21`, and `null` should not appear. 
+ auto const expected_result_with_nulls_excluded = LCL_V{ + {{10, 12, 13}, + {10, 12, 13}, + {10, 12, 13, 14}, + {10, 12, 13, 14}, + {10, 12, 13, 14}, + {}, + {}, + {}, + {}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i < 5; })}.release(); + + test_equivalent(expected_result_with_nulls_excluded->view(), result_with_nulls_excluded->view()); +} + +TEST_F(CollectSetTest, GroupedTimeRangeRollingWindowOnStringsWithMinPeriods) +{ + // Test that min_periods is honoured. + // i.e. output row is null when min_periods exceeds number of observations. + using namespace cudf; + using namespace cudf::test; + + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input = strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 4; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET); + + auto const expected_result = lists_column_wrapper{ + {{"10", "11", "12", "13"}, + {"10", "11", "12", "13"}, + {"10", "11", "12", "13", "14"}, + {"10", "11", "12", "13", "14"}, + {"10", "11", "12", "13", "14"}, + {}, + {}, + {}, + {}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i < 5; + })}.release(); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); +} + +TEST_F(CollectSetTest, 
GroupedTimeRangeRollingWindowOnStringsWithNullsAndMinPeriods) +{ + // Test that min_periods is honoured. + // i.e. output row is null when min_periods exceeds number of observations. + using namespace cudf; + using namespace cudf::test; + + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto const input = strings_column_wrapper{{"10", "11", "12", "13", "14", "20", "21", "22", "23"}, + {1, 0, 1, 1, 1, 1, 0, 1, 1}}; + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 4; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET); + + auto null_at_1 = iterator_with_null_at(1); + + // In the results, `11` and `21` should be nulls. + auto const expected_result = lists_column_wrapper{ + {{{"10", "11", "12", "13"}, null_at_1}, + {{"10", "11", "12", "13"}, null_at_1}, + {{"10", "11", "12", "13", "14"}, null_at_1}, + {{"10", "11", "12", "13", "14"}, null_at_1}, + {{"10", "11", "12", "13", "14"}, null_at_1}, + {}, + {}, + {}, + {}}, + cudf::detail::make_counting_transform_iterator(0, [](auto i) { + return i < 5; + })}.release(); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + input, + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + // After null exclusion, `11`, `21`, and `null` should not appear. 
+ auto const expected_result_with_nulls_excluded = lists_column_wrapper{ + {{"10", "12", "13"}, + {"10", "12", "13"}, + {"10", "12", "13", "14"}, + {"10", "12", "13", "14"}, + {"10", "12", "13", "14"}, + {}, + {}, + {}, + {}}, + cudf::detail::make_counting_transform_iterator( + 0, [](auto i) { return i < 5; })}.release(); + + test_equivalent(expected_result_with_nulls_excluded->view(), result_with_nulls_excluded->view()); +} + +TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowOnStructsWithMinPeriods) +{ + // Test that min_periods is honoured. + // i.e. output row is null when min_periods exceeds number of observations. + using namespace cudf; + using namespace cudf::test; + + auto const time_column = fixed_width_column_wrapper{ + 1, 1, 2, 2, 3, 1, 4, 5, 6}; + auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; + auto numeric_member_column = COL_V{10, 11, 12, 13, 14, 20, 21, 22, 23}; + auto string_member_column = + strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; + auto struct_members = std::vector>{}; + struct_members.emplace_back(numeric_member_column.release()); + struct_members.emplace_back(string_member_column.release()); + auto const struct_column = make_structs_column(9, std::move(struct_members), 0, {}); + auto const preceding = 2; + auto const following = 1; + auto const min_periods = 4; + auto const result = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + struct_column->view(), + preceding, + following, + min_periods, + COLLECT_SET); + + auto expected_numeric_column = COL_V{10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, + 14, 10, 11, 12, 13, 14, 10, 11, 12, 13, 14}; + + auto expected_string_column = + strings_column_wrapper{"10", "11", "12", "13", "10", "11", "12", "13", "10", "11", "12", "13", + "14", "10", "11", "12", "13", "14", "10", "11", "12", "13", "14"}; + + auto expected_struct_members = std::vector>{}; + 
expected_struct_members.emplace_back(expected_numeric_column.release()); + expected_struct_members.emplace_back(expected_string_column.release()); + + auto expected_structs_column = make_structs_column(23, std::move(expected_struct_members), 0, {}); + auto expected_offsets_column = COL_S{0, 4, 8, 13, 18, 23, 23, 23, 23, 23}.release(); + auto expected_validity_iter = + cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 5; }); + auto expected_null_mask = + cudf::test::detail::make_null_mask(expected_validity_iter, expected_validity_iter + 9); + auto expected_result = make_lists_column(9, + std::move(expected_offsets_column), + std::move(expected_structs_column), + 4, + std::move(expected_null_mask)); + + test_equivalent(expected_result->view(), result->view()); + + auto const result_with_nulls_excluded = + grouped_time_range_rolling_window(table_view{std::vector{group_column}}, + time_column, + cudf::order::ASCENDING, + struct_column->view(), + preceding, + following, + min_periods, + COLLECT_SET_NULLS_EXCLUDED); + + test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); +} + +#endif +} // namespace test +} // namespace cudf +CUDF_TEST_PROGRAM_MAIN() From 35ac84c9a88f4e9c59720ac8eac35e8032ef649b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 16 Mar 2021 14:11:03 -0600 Subject: [PATCH 06/22] Rewrite tests for groupby collect_set --- cpp/tests/groupby/collect_set_test.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/tests/groupby/collect_set_test.cpp b/cpp/tests/groupby/collect_set_test.cpp index 6e92b171d94..29f6bc412b5 100644 --- a/cpp/tests/groupby/collect_set_test.cpp +++ b/cpp/tests/groupby/collect_set_test.cpp @@ -28,9 +28,8 @@ namespace test { #define COL_K cudf::test::fixed_width_column_wrapper #define COL_V cudf::test::fixed_width_column_wrapper #define LCL_V cudf::test::lists_column_wrapper -#define DCL_V cudf::test::dictionary_column_wrapper #define VALIDITY 
std::initializer_list -#define COLLECT_SET cudf::make_collect_list_aggregation() +#define COLLECT_SET cudf::make_collect_set_aggregation() template struct CollectSetTest : public cudf::test::BaseFixture { From 931e01cb622e69458283fee4aedd695e188f538a Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 16 Mar 2021 16:00:39 -0600 Subject: [PATCH 07/22] Implement sort-based groupby collect_set --- .../cudf/detail/aggregation/aggregation.hpp | 8 ++++++++ cpp/src/groupby/sort/groupby.cu | 19 +++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 11fc59b3ddc..2b283a84214 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -541,6 +541,12 @@ struct target_type_impl { using type = cudf::list_view; }; +// Always use list for COLLECT_SET +template +struct target_type_impl { + using type = cudf::list_view; +}; + // Always use Source for LEAD template struct target_type_impl { @@ -640,6 +646,8 @@ CUDA_HOST_DEVICE_CALLABLE decltype(auto) aggregation_dispatcher(aggregation::Kin return f.template operator()(std::forward(args)...); case aggregation::COLLECT_LIST: return f.template operator()(std::forward(args)...); + case aggregation::COLLECT_SET: + return f.template operator()(std::forward(args)...); case aggregation::LEAD: return f.template operator()(std::forward(args)...); case aggregation::LAG: diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 04ccb1244a7..1c62c516cb7 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -19,16 +19,15 @@ #include #include -#include #include #include #include #include #include -#include #include #include #include +#include #include #include #include @@ -65,6 +64,7 @@ struct store_result_functor { template void operator()(aggregation const& agg) { + 
CUDF_FAIL("Unsupported aggregation."); } private: @@ -416,6 +416,21 @@ void store_result_functor::operator()(aggregation con cache.add_result(col_idx, agg, std::move(result)); }; +template <> +void store_result_functor::operator()(aggregation const& agg) +{ + auto null_handling = + static_cast(agg)._null_handling; + CUDF_EXPECTS(null_handling == null_policy::INCLUDE, + "null exclusion is not supported on groupby COLLECT_SET aggregation."); + + if (cache.has_result(col_idx, agg)) { return; } + auto const result = detail::group_collect( + get_grouped_values(), helper.group_offsets(), helper.num_groups(), stream, mr); + cache.add_result( + col_idx, agg, lists::drop_list_duplicates(lists_column_view(result->view()), stream, mr)); +}; + } // namespace detail // Sort-based groupby From 9db7c588e89a3e6c051fb653c880b4baa2d4410d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 16 Mar 2021 16:33:29 -0600 Subject: [PATCH 08/22] Add detail API for drop_list_duplicates that accepts stream parameter --- conda/recipes/libcudf/meta.yaml | 1 + .../lists/detail/drop_list_duplicates.hpp | 38 +++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 cpp/include/cudf/lists/detail/drop_list_duplicates.hpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index e709824721c..8778a552810 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -132,6 +132,7 @@ test: - test -f $PREFIX/include/cudf/join.hpp - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp + - test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp diff --git a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp new file mode 100644 index 
00000000000..ba3e1d17d7f --- /dev/null +++ b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include + +namespace cudf { +namespace lists { +namespace detail { + +/** + * @copydoc cudf::lists::drop_list_duplicates + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr drop_list_duplicates( + lists_column_view const& lists_column, + null_equality nulls_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +} // namespace detail +} // namespace lists +} // namespace cudf From f522627e655eedcb23031aa29172528f3dc733e5 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 16 Mar 2021 16:40:23 -0600 Subject: [PATCH 09/22] Expose the detail::drop_list_duplicates function to use in other places --- cpp/src/lists/drop_list_duplicates.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu index 1eb105d296d..529b7489c35 100644 --- a/cpp/src/lists/drop_list_duplicates.cu +++ b/cpp/src/lists/drop_list_duplicates.cu @@ -225,6 +225,8 @@ void generate_offsets(size_type num_entries, return offsets[i - prefix_sum_empty_lists[i]]; }); } +} // anonymous namespace + /** * @copydoc 
cudf::lists::drop_list_duplicates * @@ -276,7 +278,6 @@ std::unique_ptr drop_list_duplicates(lists_column_view const& lists_colu cudf::detail::copy_bitmask(lists_column.parent(), stream, mr)); } -} // anonymous namespace } // namespace detail /** From 2af1db789217f7774e07e521d1bcdf89931ccf03 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 16 Mar 2021 16:42:27 -0600 Subject: [PATCH 10/22] Use the detail::drop_list_duplicates function in groupby collect_set --- .../cudf/detail/aggregation/aggregation.hpp | 19 ++++++++++++------- cpp/src/groupby/sort/groupby.cu | 16 ++++++++++------ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index 2b283a84214..8bd737ad787 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -343,22 +343,27 @@ struct collect_list_aggregation final : derived_aggregation /** * @brief Derived aggregation class for specifying COLLECT_SET aggregation */ -struct collect_set_aggregation final : derived_aggregation { - explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE) - : derived_aggregation{COLLECT_SET}, _null_handling{null_handling} +struct collect_set_aggregation final : derived_aggregation { + explicit collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE, + null_equality null_equal = null_equality::EQUAL) + : derived_aggregation{COLLECT_SET}, _null_handling{null_handling}, _null_equal(null_equal) { } null_policy _null_handling; ///< include or exclude nulls + null_equality _null_equal; ///< whether to consider nulls as equal values protected: - friend class derived_aggregation; + friend class derived_aggregation; - bool operator==(nunique_aggregation const& other) const + bool operator==(collect_set_aggregation const& other) const { - return
_null_handling == other._null_handling && _null_equal == other._null_equal; } - size_t hash_impl() const { return std::hash{}(static_cast(_null_handling)); } + size_t hash_impl() const + { + return std::hash{}(static_cast(_null_handling) ^ static_cast(_null_equal)); + } }; /** diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 1c62c516cb7..7ce38308ebe 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include @@ -419,18 +419,22 @@ void store_result_functor::operator()(aggregation con template <> void store_result_functor::operator()(aggregation const& agg) { - auto null_handling = + auto const null_handling = static_cast(agg)._null_handling; CUDF_EXPECTS(null_handling == null_policy::INCLUDE, "null exclusion is not supported on groupby COLLECT_SET aggregation."); if (cache.has_result(col_idx, agg)) { return; } - auto const result = detail::group_collect( + + auto const collect_result = detail::group_collect( get_grouped_values(), helper.group_offsets(), helper.num_groups(), stream, mr); - cache.add_result( - col_idx, agg, lists::drop_list_duplicates(lists_column_view(result->view()), stream, mr)); + auto const nulls_equal = + static_cast(agg)._null_equal; + cache.add_result(col_idx, + agg, + lists::detail::drop_list_duplicates( + lists_column_view(collect_result->view()), nulls_equal, stream, mr)); }; - } // namespace detail // Sort-based groupby From 4099d17fe07748e91eeec1d8cfe326786dde0adb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 16 Mar 2021 16:52:32 -0600 Subject: [PATCH 11/22] Fix default parameters for the make_collect_set_aggregation() function --- cpp/include/cudf/aggregation.hpp | 7 +++++-- cpp/src/aggregation/aggregation.cpp | 5 +++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index bd291346c8d..3c454c85720 
100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -227,10 +227,13 @@ std::unique_ptr make_collect_list_aggregation( * If `null_handling` is set to `EXCLUDE`, null elements are dropped from each * of the list rows. * - * @param null_handling Indicates whether to include/exclude nulls in list elements. + * @param null_handling Indicates whether to include/exclude nulls during collection + * @param nulls_equal Flag to specify whether null entries within each list should be considered + * equal */ std::unique_ptr make_collect_set_aggregation( - null_policy null_handling = null_policy::INCLUDE); + null_policy null_handling = null_policy::INCLUDE, + null_equality null_equal = null_equality::EQUAL); /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset); diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp index 31d9e53cd73..33c19617308 100644 --- a/cpp/src/aggregation/aggregation.cpp +++ b/cpp/src/aggregation/aggregation.cpp @@ -131,9 +131,10 @@ std::unique_ptr make_collect_list_aggregation(null_policy null_hand return std::make_unique(null_handling); } /// Factory to create a COLLECT_SET aggregation -std::unique_ptr make_collect_set_aggregation(null_policy null_handling) +std::unique_ptr make_collect_set_aggregation(null_policy null_handling, + null_equality null_equal) { - return std::make_unique(null_handling); + return std::make_unique(null_handling, null_equal); } /// Factory to create a LAG aggregation std::unique_ptr make_lag_aggregation(size_type offset) From 765b2ef82ce095d533beffe013f6182d33d06e2d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 16 Mar 2021 17:28:49 -0600 Subject: [PATCH 12/22] Remove rolling collect_set test --- cpp/tests/rolling/collect_set_test.cpp | 974 ------------------------- 1 file changed, 974 deletions(-) delete mode 100644 cpp/tests/rolling/collect_set_test.cpp diff --git a/cpp/tests/rolling/collect_set_test.cpp 
b/cpp/tests/rolling/collect_set_test.cpp deleted file mode 100644 index 511795ebe43..00000000000 --- a/cpp/tests/rolling/collect_set_test.cpp +++ /dev/null @@ -1,974 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -//#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -namespace cudf { -namespace test { - -#define COL_V cudf::test::fixed_width_column_wrapper -#define COL_S cudf::test::fixed_width_column_wrapper -#define LCL_V cudf::test::lists_column_wrapper -#define COLLECT_SET cudf::make_collect_list_aggregation() -#define COLLECT_SET_NULLS_EXCLUDED cudf::make_collect_list_aggregation(cudf::null_policy::EXCLUDE) - -void test_equivalent(std::unique_ptr const& lhs, - std::unique_ptr const& rhs) -{ - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(lhs->view(), rhs->view()); -} - -void test_equivalent(cudf::column_view const& lhs, cudf::column_view const& rhs) -{ - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(lhs, rhs); -} - -struct CollectSetTest : public cudf::test::BaseFixture { -}; - -template -struct TypedCollectListTest : public CollectSetTest { -}; - -using TypesForTest = cudf::test::Concat; - -TYPED_TEST_CASE(TypedCollectListTest, TypesForTest); - -TYPED_TEST(TypedCollectListTest, BasicRollingWindow) -{ - auto const input = COL_V{10, 11, 12, 13, 14}; - auto const expected_result = - LCL_V{ - {10, 11}, - 
{10, 11, 12}, - {11, 12, 13}, - {12, 13, 14}, - {13, 14}, - } - .release(); - - // Rolling window with variable window sizes - test_equivalent( - expected_result, - rolling_window(input, COL_S{1, 2, 2, 2, 2}, COL_S{1, 1, 1, 1, 0}, 1, COLLECT_SET)); - - // Rolling window with fixed window size - test_equivalent(expected_result, rolling_window(input, 2, 1, 1, COLLECT_SET)); - - // Rolling window with nulls excluded - test_equivalent(expected_result, rolling_window(input, 2, 1, 1, COLLECT_SET_NULLS_EXCLUDED)); -} - -TYPED_TEST(TypedCollectListTest, EmptyOutputLists) -{ - auto const input = COL_V{10, 11, 12, 13, 14, 15}; - auto const expected_result = - LCL_V{ - {10, 11}, - {10, 11, 12}, - {11, 12, 13}, - {}, - {13, 14, 15}, - {14, 15}, - } - .release(); - auto const prev = COL_S{1, 2, 2, 0, 2, 2}; - auto const next = COL_S{1, 1, 1, 0, 1, 0}; - - // Rolling window with variable window sizes - test_equivalent(expected_result, rolling_window(input, prev, next, 0, COLLECT_SET)); - - // Rolling window with nulls excluded - test_equivalent(expected_result, - rolling_window(input, prev, next, 0, COLLECT_SET_NULLS_EXCLUDED)); -} - -TYPED_TEST(TypedCollectListTest, EmptyOutputListsAtEnds) -{ - auto const input = COL_V{0, 1, 2, 3, 4, 5}; - auto const expected_result = LCL_V{{}, {0, 1, 2}, {1, 2, 3}, {2, 3, 4}, {3, 4, 5}, {}}.release(); - auto const prev = COL_S{0, 2, 2, 2, 2, 0}; - auto const next = COL_S{0, 1, 1, 1, 1, 0}; - - // Rolling window with variable window sizes - test_equivalent(expected_result, rolling_window(input, prev, next, 0, COLLECT_SET)); - - // Rolling window with nulls excluded - test_equivalent(expected_result, - rolling_window(input, prev, next, 0, COLLECT_SET_NULLS_EXCLUDED)); -} - -#if 0 -TEST_F(CollectSetTest, RollingWindowHonoursMinPeriodsOnStrings) -{ - // Test that when the number of observations is fewer than min_periods, - // the result is null. 
- - using namespace cudf; - using namespace cudf::test; - - auto const input = strings_column_wrapper{"0", "1", "2", "3", "4", "5"}; - auto const num_elements = static_cast(input).size(); - - auto preceding = 2; - auto following = 1; - auto min_periods = 3; - auto const result = rolling_window(input, preceding, following, min_periods, COLLECT_SET); - - auto const expected_result = lists_column_wrapper{ - {{}, {"0", "1", "2"}, {"1", "2", "3"}, {"2", "3", "4"}, {"3", "4", "5"}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { - return i != 0 && i != (num_elements - 1); - })}.release(); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - rolling_window(input, preceding, following, min_periods, COLLECT_SET_NULLS_EXCLUDED); - - test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); - - preceding = 2; - following = 2; - min_periods = 4; - - auto result_2 = rolling_window(input, preceding, following, min_periods, COLLECT_SET); - auto expected_result_2 = lists_column_wrapper{ - {{}, {"0", "1", "2", "3"}, {"1", "2", "3", "4"}, {"2", "3", "4", "5"}, {}, {}}, - cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) { - return i != 0 && i < 4; - })}.release(); - - test_equivalent(expected_result_2->view(), result_2->view()); - - auto result_2_with_nulls_excluded = - rolling_window(input, preceding, following, min_periods, COLLECT_SET_NULLS_EXCLUDED); - - test_equivalent(expected_result_2->view(), result_2_with_nulls_excluded->view()); -} - -TEST_F(CollectSetTest, RollingWindowHonoursMinPeriodsWithDecimal) -{ - // Test that when the number of observations is fewer than min_periods, - // the result is null. 
- - using namespace cudf; - using namespace cudf::test; - - auto const input_iter = - cudf::detail::make_counting_transform_iterator(0, thrust::identity{}); - auto const input = - fixed_point_column_wrapper{input_iter, input_iter + 6, numeric::scale_type{0}}; - - { - // One result row at each end should be null. - auto preceding = 2; - auto following = 1; - auto min_periods = 3; - auto const result = rolling_window(input, preceding, following, min_periods, COLLECT_SET); - - auto expected_result_child_values = std::vector{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5}; - auto expected_result_child = - fixed_point_column_wrapper{expected_result_child_values.begin(), - expected_result_child_values.end(), - numeric::scale_type{0}}; - auto expected_offsets = COL_S{0, 0, 3, 6, 9, 12, 12}.release(); - auto expected_num_rows = expected_offsets->size() - 1; - auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, [expected_num_rows](auto i) { return i != 0 && i != (expected_num_rows - 1); }); - - auto expected_result = make_lists_column( - expected_num_rows, - std::move(expected_offsets), - expected_result_child.release(), - 2, - cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows)); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - rolling_window(input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); - } - - { - // First result row, and the last two result rows should be null. 
- auto preceding = 2; - auto following = 2; - auto min_periods = 4; - auto const result = rolling_window(input, preceding, following, min_periods, COLLECT_SET); - - auto expected_result_child_values = std::vector{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5}; - auto expected_result_child = - fixed_point_column_wrapper{expected_result_child_values.begin(), - expected_result_child_values.end(), - numeric::scale_type{0}}; - auto expected_offsets = COL_S{0, 0, 4, 8, 12, 12, 12}.release(); - auto expected_num_rows = expected_offsets->size() - 1; - auto null_mask_iter = cudf::detail::make_counting_transform_iterator( - size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; }); - - auto expected_result = make_lists_column( - expected_num_rows, - std::move(expected_offsets), - expected_result_child.release(), - 3, - cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows)); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - rolling_window(input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); - } -} - -TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindow) -{ - using namespace cudf; - using namespace cudf::test; - - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input = COL_V{10, 11, 12, 13, 14, 20, 21, 22, 23}; - - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 1; - auto const result = grouped_rolling_window(table_view{std::vector{group_column}}, - input, - preceding, - following, - min_periods, - COLLECT_SET); - - auto const expected_result = LCL_V{ - {10, 11}, - {10, 11, 12}, - {11, 12, 13}, - {12, 13, 14}, - {13, 14}, - {20, 21}, - {20, 21, 22}, - {21, 22, 23}, - {22, 23}}.release(); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - 
grouped_rolling_window(table_view{std::vector{group_column}}, - input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); -} - -TYPED_TEST(TypedCollectListTest, BasicGroupedRollingWindowWithNulls) -{ - using namespace cudf; - using namespace cudf::test; - - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input = COL_V{{10, 11, 12, 13, 14, 20, 21, 22, 23}, {1, 0, 1, 1, 1, 1, 0, 1, 1}}; - - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 1; - - { - // Nulls included. - auto const result = grouped_rolling_window(table_view{std::vector{group_column}}, - input, - preceding, - following, - min_periods, - COLLECT_SET); - - auto expected_child = COL_V{ - {10, 11, 10, 11, 12, 11, 12, 13, 12, 13, 14, 13, 14, 20, 21, 20, 21, 22, 21, 22, 23, 22, 23}, - {1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1}}; - - auto expected_offsets = fixed_width_column_wrapper{0, 2, 5, 8, 11, 13, 15, 18, 21, 23}; - - auto expected_result = make_lists_column(static_cast(group_column).size(), - expected_offsets.release(), - expected_child.release(), - 0, - {}); - - test_equivalent(expected_result->view(), result->view()); - } - - { - // Nulls excluded. 
- auto const result = grouped_rolling_window(table_view{std::vector{group_column}}, - input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - auto expected_child = COL_V{10, 10, 12, 12, 13, 12, 13, 14, 13, 14, 20, 20, 22, 22, 23, 22, 23}; - - auto expected_offsets = fixed_width_column_wrapper{0, 1, 3, 5, 8, 10, 11, 13, 15, 17}; - - auto expected_result = make_lists_column(static_cast(group_column).size(), - expected_offsets.release(), - expected_child.release(), - 0, - {}); - - test_equivalent(expected_result->view(), result->view()); - } -} - -TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindow) -{ - using namespace cudf; - using namespace cudf::test; - - auto const time_column = fixed_width_column_wrapper{ - 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input = COL_V{10, 11, 12, 13, 14, 20, 21, 22, 23}; - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 1; - auto const result = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET); - - auto const expected_result = LCL_V{ - {10, 11, 12, 13}, - {10, 11, 12, 13}, - {10, 11, 12, 13, 14}, - {10, 11, 12, 13, 14}, - {10, 11, 12, 13, 14}, - {20}, - {21, 22}, - {21, 22, 23}, - {21, 22, 23}}.release(); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); -} - -TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNulls) -{ - using namespace cudf; - using namespace cudf::test; - - auto const time_column = 
fixed_width_column_wrapper{ - 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input = COL_V{{10, 11, 12, 13, 14, 20, 21, 22, 23}, {1, 0, 1, 1, 1, 1, 0, 1, 1}}; - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 1; - auto const result = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET); - - auto null_at_0 = iterator_with_null_at(0); - auto null_at_1 = iterator_with_null_at(1); - - // In the results, `11` and `21` should be nulls. - auto const expected_result = LCL_V{ - {{10, 11, 12, 13}, null_at_1}, - {{10, 11, 12, 13}, null_at_1}, - {{10, 11, 12, 13, 14}, null_at_1}, - {{10, 11, 12, 13, 14}, null_at_1}, - {{10, 11, 12, 13, 14}, null_at_1}, - {{20}, null_at_1}, - {{21, 22}, null_at_0}, - {{21, 22, 23}, null_at_0}, - {{21, 22, 23}, null_at_0}}.release(); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - // After null exclusion, `11`, `21`, and `null` should not appear. 
- auto const expected_result_with_nulls_excluded = LCL_V{ - {10, 12, 13}, - {10, 12, 13}, - {10, 12, 13, 14}, - {10, 12, 13, 14}, - {10, 12, 13, 14}, - {20}, - {22}, - {22, 23}, - {22, 23}}.release(); - - test_equivalent(expected_result_with_nulls_excluded->view(), result_with_nulls_excluded->view()); -} - -TEST_F(CollectSetTest, BasicGroupedTimeRangeRollingWindowOnStrings) -{ - using namespace cudf; - using namespace cudf::test; - - auto const time_column = fixed_width_column_wrapper{ - 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input = strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 1; - auto const result = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET); - - auto const expected_result = lists_column_wrapper{ - {"10", "11", "12", "13"}, - {"10", "11", "12", "13"}, - {"10", "11", "12", "13", "14"}, - {"10", "11", "12", "13", "14"}, - {"10", "11", "12", "13", "14"}, - {"20"}, - {"21", "22"}, - {"21", "22", "23"}, - {"21", "22", "23"}}.release(); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); -} - -TEST_F(CollectSetTest, GroupedTimeRangeRollingWindowOnStringsWithNulls) -{ - using namespace cudf; - using namespace cudf::test; - - auto const time_column = fixed_width_column_wrapper{ - 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input = 
strings_column_wrapper{{"10", "11", "12", "13", "14", "20", "21", "22", "23"}, - {1, 0, 1, 1, 1, 1, 0, 1, 1}}; - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 1; - auto const result = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET); - - auto null_at_0 = iterator_with_null_at(0); - auto null_at_1 = iterator_with_null_at(1); - - // In the results, `11` and `21` should be nulls. - auto const expected_result = lists_column_wrapper{ - {{"10", "11", "12", "13"}, null_at_1}, - {{"10", "11", "12", "13"}, null_at_1}, - {{"10", "11", "12", "13", "14"}, null_at_1}, - {{"10", "11", "12", "13", "14"}, null_at_1}, - {{"10", "11", "12", "13", "14"}, null_at_1}, - {"20"}, - {{"21", "22"}, null_at_0}, - {{"21", "22", "23"}, null_at_0}, - {{"21", "22", "23"}, - null_at_0}}.release(); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - // After null exclusion, `11`, `21`, and `null` should not appear. 
- auto const expected_result_with_nulls_excluded = lists_column_wrapper{ - {"10", "12", "13"}, - {"10", "12", "13"}, - {"10", "12", "13", "14"}, - {"10", "12", "13", "14"}, - {"10", "12", "13", "14"}, - {"20"}, - {"22"}, - {"22", "23"}, - {"22", "23"}}.release(); - - test_equivalent(expected_result_with_nulls_excluded->view(), result_with_nulls_excluded->view()); -} - -TYPED_TEST(TypedCollectListTest, BasicGroupedTimeRangeRollingWindowOnStructs) -{ - using namespace cudf; - using namespace cudf::test; - - auto const time_column = fixed_width_column_wrapper{ - 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto numeric_member_column = COL_V{10, 11, 12, 13, 14, 20, 21, 22, 23}; - auto string_member_column = - strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; - auto struct_members = std::vector>{}; - struct_members.emplace_back(numeric_member_column.release()); - struct_members.emplace_back(string_member_column.release()); - auto const struct_column = make_structs_column(9, std::move(struct_members), 0, {}); - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 1; - auto const result = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - struct_column->view(), - preceding, - following, - min_periods, - COLLECT_SET); - - auto expected_numeric_column = - COL_V{10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, 14, 10, 11, 12, - 13, 14, 10, 11, 12, 13, 14, 20, 21, 22, 21, 22, 23, 21, 22, 23}; - - auto expected_string_column = strings_column_wrapper{ - "10", "11", "12", "13", "10", "11", "12", "13", "10", "11", "12", "13", "14", "10", "11", "12", - "13", "14", "10", "11", "12", "13", "14", "20", "21", "22", "21", "22", "23", "21", "22", "23"}; - - auto expected_struct_members = std::vector>{}; - expected_struct_members.emplace_back(expected_numeric_column.release()); - 
expected_struct_members.emplace_back(expected_string_column.release()); - - auto expected_structs_column = make_structs_column(32, std::move(expected_struct_members), 0, {}); - auto expected_offsets_column = COL_S{0, 4, 8, 13, 18, 23, 24, 26, 29, 32}.release(); - auto expected_result = make_lists_column( - 9, std::move(expected_offsets_column), std::move(expected_structs_column), 0, {}); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - struct_column->view(), - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); -} - -TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithMinPeriods) -{ - // Test that min_periods is honoured. - // i.e. output row is null when min_periods exceeds number of observations. - using namespace cudf; - using namespace cudf::test; - - auto const time_column = fixed_width_column_wrapper{ - 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input = COL_V{10, 11, 12, 13, 14, 20, 21, 22, 23}; - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 4; - auto const result = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET); - - auto const expected_result = LCL_V{ - {{10, 11, 12, 13}, - {10, 11, 12, 13}, - {10, 11, 12, 13, 14}, - {10, 11, 12, 13, 14}, - {10, 11, 12, 13, 14}, - {}, - {}, - {}, - {}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i < 5; - })}.release(); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - 
grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); -} - -TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowWithNullsAndMinPeriods) -{ - // Test that min_periods is honoured. - // i.e. output row is null when min_periods exceeds number of observations. - using namespace cudf; - using namespace cudf::test; - - auto const time_column = fixed_width_column_wrapper{ - 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input = COL_V{{10, 11, 12, 13, 14, 20, 21, 22, 23}, {1, 0, 1, 1, 1, 1, 0, 1, 1}}; - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 4; - auto const result = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET); - - auto null_at_1 = iterator_with_null_at(1); - - // In the results, `11` and `21` should be nulls. - auto const expected_result = LCL_V{ - {{{10, 11, 12, 13}, null_at_1}, - {{10, 11, 12, 13}, null_at_1}, - {{10, 11, 12, 13, 14}, null_at_1}, - {{10, 11, 12, 13, 14}, null_at_1}, - {{10, 11, 12, 13, 14}, null_at_1}, - {}, - {}, - {}, - {}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i < 5; - })}.release(); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - // After null exclusion, `11`, `21`, and `null` should not appear. 
- auto const expected_result_with_nulls_excluded = LCL_V{ - {{10, 12, 13}, - {10, 12, 13}, - {10, 12, 13, 14}, - {10, 12, 13, 14}, - {10, 12, 13, 14}, - {}, - {}, - {}, - {}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i < 5; })}.release(); - - test_equivalent(expected_result_with_nulls_excluded->view(), result_with_nulls_excluded->view()); -} - -TEST_F(CollectSetTest, GroupedTimeRangeRollingWindowOnStringsWithMinPeriods) -{ - // Test that min_periods is honoured. - // i.e. output row is null when min_periods exceeds number of observations. - using namespace cudf; - using namespace cudf::test; - - auto const time_column = fixed_width_column_wrapper{ - 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input = strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 4; - auto const result = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET); - - auto const expected_result = lists_column_wrapper{ - {{"10", "11", "12", "13"}, - {"10", "11", "12", "13"}, - {"10", "11", "12", "13", "14"}, - {"10", "11", "12", "13", "14"}, - {"10", "11", "12", "13", "14"}, - {}, - {}, - {}, - {}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i < 5; - })}.release(); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); -} - -TEST_F(CollectSetTest, 
GroupedTimeRangeRollingWindowOnStringsWithNullsAndMinPeriods) -{ - // Test that min_periods is honoured. - // i.e. output row is null when min_periods exceeds number of observations. - using namespace cudf; - using namespace cudf::test; - - auto const time_column = fixed_width_column_wrapper{ - 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto const input = strings_column_wrapper{{"10", "11", "12", "13", "14", "20", "21", "22", "23"}, - {1, 0, 1, 1, 1, 1, 0, 1, 1}}; - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 4; - auto const result = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET); - - auto null_at_1 = iterator_with_null_at(1); - - // In the results, `11` and `21` should be nulls. - auto const expected_result = lists_column_wrapper{ - {{{"10", "11", "12", "13"}, null_at_1}, - {{"10", "11", "12", "13"}, null_at_1}, - {{"10", "11", "12", "13", "14"}, null_at_1}, - {{"10", "11", "12", "13", "14"}, null_at_1}, - {{"10", "11", "12", "13", "14"}, null_at_1}, - {}, - {}, - {}, - {}}, - cudf::detail::make_counting_transform_iterator(0, [](auto i) { - return i < 5; - })}.release(); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - input, - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - // After null exclusion, `11`, `21`, and `null` should not appear. 
- auto const expected_result_with_nulls_excluded = lists_column_wrapper{ - {{"10", "12", "13"}, - {"10", "12", "13"}, - {"10", "12", "13", "14"}, - {"10", "12", "13", "14"}, - {"10", "12", "13", "14"}, - {}, - {}, - {}, - {}}, - cudf::detail::make_counting_transform_iterator( - 0, [](auto i) { return i < 5; })}.release(); - - test_equivalent(expected_result_with_nulls_excluded->view(), result_with_nulls_excluded->view()); -} - -TYPED_TEST(TypedCollectListTest, GroupedTimeRangeRollingWindowOnStructsWithMinPeriods) -{ - // Test that min_periods is honoured. - // i.e. output row is null when min_periods exceeds number of observations. - using namespace cudf; - using namespace cudf::test; - - auto const time_column = fixed_width_column_wrapper{ - 1, 1, 2, 2, 3, 1, 4, 5, 6}; - auto const group_column = fixed_width_column_wrapper{1, 1, 1, 1, 1, 2, 2, 2, 2}; - auto numeric_member_column = COL_V{10, 11, 12, 13, 14, 20, 21, 22, 23}; - auto string_member_column = - strings_column_wrapper{"10", "11", "12", "13", "14", "20", "21", "22", "23"}; - auto struct_members = std::vector>{}; - struct_members.emplace_back(numeric_member_column.release()); - struct_members.emplace_back(string_member_column.release()); - auto const struct_column = make_structs_column(9, std::move(struct_members), 0, {}); - auto const preceding = 2; - auto const following = 1; - auto const min_periods = 4; - auto const result = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - struct_column->view(), - preceding, - following, - min_periods, - COLLECT_SET); - - auto expected_numeric_column = COL_V{10, 11, 12, 13, 10, 11, 12, 13, 10, 11, 12, 13, - 14, 10, 11, 12, 13, 14, 10, 11, 12, 13, 14}; - - auto expected_string_column = - strings_column_wrapper{"10", "11", "12", "13", "10", "11", "12", "13", "10", "11", "12", "13", - "14", "10", "11", "12", "13", "14", "10", "11", "12", "13", "14"}; - - auto expected_struct_members = std::vector>{}; - 
expected_struct_members.emplace_back(expected_numeric_column.release()); - expected_struct_members.emplace_back(expected_string_column.release()); - - auto expected_structs_column = make_structs_column(23, std::move(expected_struct_members), 0, {}); - auto expected_offsets_column = COL_S{0, 4, 8, 13, 18, 23, 23, 23, 23, 23}.release(); - auto expected_validity_iter = - cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 5; }); - auto expected_null_mask = - cudf::test::detail::make_null_mask(expected_validity_iter, expected_validity_iter + 9); - auto expected_result = make_lists_column(9, - std::move(expected_offsets_column), - std::move(expected_structs_column), - 4, - std::move(expected_null_mask)); - - test_equivalent(expected_result->view(), result->view()); - - auto const result_with_nulls_excluded = - grouped_time_range_rolling_window(table_view{std::vector{group_column}}, - time_column, - cudf::order::ASCENDING, - struct_column->view(), - preceding, - following, - min_periods, - COLLECT_SET_NULLS_EXCLUDED); - - test_equivalent(expected_result->view(), result_with_nulls_excluded->view()); -} - -#endif -} // namespace test -} // namespace cudf -CUDF_TEST_PROGRAM_MAIN() From 10abd8eb69ae12655c503a4be0d95943cdcac855 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 09:35:47 -0600 Subject: [PATCH 13/22] Implement tests for groupby collect_set --- cpp/tests/groupby/collect_set_test.cpp | 143 +++++++++++++++++++------ 1 file changed, 113 insertions(+), 30 deletions(-) diff --git a/cpp/tests/groupby/collect_set_test.cpp b/cpp/tests/groupby/collect_set_test.cpp index 29f6bc412b5..7460bf9aca6 100644 --- a/cpp/tests/groupby/collect_set_test.cpp +++ b/cpp/tests/groupby/collect_set_test.cpp @@ -27,20 +27,27 @@ namespace test { #define COL_K cudf::test::fixed_width_column_wrapper #define COL_V cudf::test::fixed_width_column_wrapper +#define COL_S cudf::test::strings_column_wrapper #define LCL_V cudf::test::lists_column_wrapper +#define 
LCL_S cudf::test::lists_column_wrapper #define VALIDITY std::initializer_list #define COLLECT_SET cudf::make_collect_set_aggregation() +#define COLLECT_SET_NULL_UNEQUAL \ + cudf::make_collect_set_aggregation(null_policy::INCLUDE, null_equality::UNEQUAL) -template struct CollectSetTest : public cudf::test::BaseFixture { }; +template +struct CollectSetTypedTest : public cudf::test::BaseFixture { +}; + using FixedWidthTypesNotBool = cudf::test::Concat; -TYPED_TEST_CASE(CollectSetTest, FixedWidthTypesNotBool); +TYPED_TEST_CASE(CollectSetTypedTest, FixedWidthTypesNotBool); -TYPED_TEST(CollectSetTest, ExceptionCases) +TYPED_TEST(CollectSetTypedTest, ExceptionCases) { std::vector agg_requests(1); agg_requests[0].values = COL_V{{1, 2, 3, 4, 5, 6}, {true, false, true, false, true, false}}; @@ -51,8 +58,8 @@ TYPED_TEST(CollectSetTest, ExceptionCases) EXPECT_THROW(gby.aggregate(agg_requests), cudf::logic_error); } -// TODO: Fix those cases to handle empty and simple input -TYPED_TEST(CollectSetTest, DISABLED_TrivialCases) +// TODO: Enable these tests after issue#7611 has been fixed +TYPED_TEST(CollectSetTypedTest, DISABLED_TrivialCases) { // Empty input test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET); @@ -60,59 +67,135 @@ TYPED_TEST(CollectSetTest, DISABLED_TrivialCases) // Single key input { COL_K keys{1}; + COL_V vals{10}; COL_K keys_expected{1}; - COL_V vals{100}; - COL_V vals_expected{{100}}; + LCL_V vals_expected{{10}}; test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); } - // Simple input + // Non-repeated keys { - COL_K keys{1, 2}; + COL_K keys{2, 1}; + COL_V vals{20, 10}; COL_K keys_expected{1, 2}; - COL_V vals{100, 200}; - LCL_V vals_expected{{100}, {200}}; + LCL_V vals_expected{{10}, {20}}; test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); } } -TYPED_TEST(CollectSetTest, TypicalCases) +TYPED_TEST(CollectSetTypedTest, TypicalCases) { - // Hard-coded case + // Pre-sorted keys { - COL_K keys{1, 1, 2, 2, 3, 3}; 
+ COL_K keys{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; + COL_V vals{10, 11, 10, 10, 20, 21, 21, 20, 30, 33, 32, 31}; COL_K keys_expected{1, 2, 3}; - LCL_V vals{{1, 2}, {3, 4}, {5, 6, 7}, {}, {9, 10}, {11}}; - LCL_V vals_expected{{{1, 2}, {3, 4}}, {{5, 6, 7}, {}}, {{9, 10}, {11}}}; + LCL_V vals_expected{{10, 11}, {20, 21}, {30, 31, 32, 33}}; test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); } - // Procedurally generated test + // Expect the result keys to be sorted by sort-based groupby { - COL_K keys{1, 1, 2, 2, 3, 3}; - COL_K keys_expected{1, 2, 3}; - LCL_V vals{{1, 2}, {3, 4}, {5, 6, 7}, {}, {9, 10}, {11}}; - LCL_V vals_expected{{{1, 2}, {3, 4}}, {{5, 6, 7}, {}}, {{9, 10}, {11}}}; + COL_K keys{4, 1, 2, 4, 3, 3, 2, 1}; + COL_V vals{40, 10, 20, 40, 30, 30, 20, 11}; + COL_K keys_expected{1, 2, 3, 4}; + LCL_V vals_expected{{10, 11}, {20}, {30}, {40}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } +} + +// Keys and values columns are sliced columns +TYPED_TEST(CollectSetTypedTest, SlicedColumnsCases) +{ + COL_K keys_original{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; + COL_V vals_original{10, 11, 10, 10, 20, 21, 21, 20, 30, 33, 32, 31}; + { + auto const keys = cudf::slice(keys_original, {0, 4})[0]; // { 1, 1, 1, 1 } + auto const vals = cudf::slice(vals_original, {0, 4})[0]; // { 10, 11, 10, 10 } + auto const keys_expected = COL_K{1}; + auto const vals_expected = LCL_V{{10, 11}}; test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); } + { + auto const keys = cudf::slice(keys_original, {2, 10})[0]; // { 1, 1, 2, 2, 2, 2, 3, 3 } + auto const vals = cudf::slice(vals_original, {2, 10})[0]; // { 10, 10, 20, 21, 21, 20, 30, 33 } + auto const keys_expected = COL_K{1, 2, 3}; + auto const vals_expected = LCL_V{{10}, {20, 21}, {30, 33}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + } +} + +TEST_F(CollectSetTest, StringTest) +{ + COL_K keys{1, 2, 3, 3, 2, 1, 2, 1, 2, 1, 1, 1, 1}; + COL_S 
vals{ + "String 1, first", + "String 2, first", + "String 3, first", + "String 3, second", + "String 2, second", + "String 1, second", + "String 2, second", // repeated + "String 1, second", // repeated + "String 2, second", // repeated + "String 1, second", // repeated + "String 1, second", // repeated + "String 1, second", // repeated + "String 1, second" // repeated + }; + COL_K keys_expected{1, 2, 3}; + LCL_S vals_expected{{"String 1, first", "String 1, second"}, + {"String 2, first", "String 2, second"}, + {"String 3, first", "String 3, second"}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); } -TYPED_TEST(CollectSetTest, CollectWithNulls) +TYPED_TEST(CollectSetTypedTest, CollectWithNulls) { - // Hard-coded case + // Just use an arbitrary value to store null entries + // Using this alias variable will make the code look cleaner + constexpr int32_t null = 0; + + // Pre-sorted keys { - COL_K keys{1, 1, 2, 2, 3, 3}; + COL_K keys{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; + COL_V vals{{10, 10, null, null, 20, null, null, null, 30, 31, 30, 31}, + {true, true, false, false, true, false, false, false, true, true, true, true}}; COL_K keys_expected{1, 2, 3}; - COL_V vals{{1, 2, 3, 4, 5, 6}, {true, false, true, false, true, false}}; - LCL_V vals_expected{{{1, 2}, VALIDITY{true, false}}, - {{3, 4}, VALIDITY{true, false}}, - {{5, 6}, VALIDITY{true, false}}}; + + // By default, nulls are consider equals, thus only one null is kept per key + LCL_V vals_expected{{{10, null}, VALIDITY{true, false}}, + {{20, null}, VALIDITY{true, false}}, + {{30, 31}, VALIDITY{true, true}}}; test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + + // All nulls per key are kept (nulls are put at the end of each list) + vals_expected = LCL_V{{{10, null, null}, VALIDITY{true, false, false}}, + {{20, null, null, null}, VALIDITY{true, false, false, false}}, + {{30, 31}, VALIDITY{true, true}}}; + test_single_agg(keys, vals, keys_expected, vals_expected, 
COLLECT_SET_NULL_UNEQUAL); } - // Procedurally generated test + // Expect the result keys to be sorted by sort-based groupby { - // + COL_K keys{4, 1, 2, 4, 3, 3, 3, 3, 2, 1}; + COL_V vals{{40, 10, 20, 40, null, null, null, null, 21, null}, + {true, true, true, true, false, false, false, false, true, false}}; + COL_K keys_expected{1, 2, 3, 4}; + + // By default, nulls are consider equals, thus only one null is kept per key + LCL_V vals_expected{{{10, null}, VALIDITY{true, false}}, + {{20, 21}, VALIDITY{true, true}}, + {{null}, VALIDITY{false}}, + {{40}, VALIDITY{true}}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); + + // All nulls per key are kept (nulls are put at the end of each list) + vals_expected = LCL_V{{{10, null}, VALIDITY{true, false}}, + {{20, 21}, VALIDITY{true, true}}, + {{null, null, null, null}, VALIDITY{false, false, false, false}}, + {{40}, VALIDITY{true}}}; + test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET_NULL_UNEQUAL); } } From 864d87822ed6f55b55084b710ea4a9e43b354114 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 12:37:22 -0600 Subject: [PATCH 14/22] Add groupby/collect_set_test.cpp to CMakeList.txt --- cpp/tests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 40829c74957..eb6ecb2ad77 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -86,6 +86,7 @@ ConfigureTest(ERROR_TEST error/error_handling_test.cu) ################################################################################################### # - groupby tests --------------------------------------------------------------------------------- ConfigureTest(GROUPBY_TEST + groupby/collect_set_test.cpp groupby/groupby_groups_test.cpp groupby/group_argmin_test.cpp groupby/group_argmax_test.cpp From 68878160044e832284d7c1a31ba771ef09bbd875 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 12:45:35 -0600 
Subject: [PATCH 15/22] Update copyright year in header --- cpp/src/groupby/sort/groupby.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/sort/groupby.cu b/cpp/src/groupby/sort/groupby.cu index 7ce38308ebe..6d92c2608fa 100644 --- a/cpp/src/groupby/sort/groupby.cu +++ b/cpp/src/groupby/sort/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 96c0e2740b86c0d62ae1da8246969ed7af272eeb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 12:46:56 -0600 Subject: [PATCH 16/22] Remove CUDF_TEST_PROGRAM_MAIN() from test file --- cpp/tests/groupby/collect_set_test.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/tests/groupby/collect_set_test.cpp b/cpp/tests/groupby/collect_set_test.cpp index 7460bf9aca6..214e677d3ce 100644 --- a/cpp/tests/groupby/collect_set_test.cpp +++ b/cpp/tests/groupby/collect_set_test.cpp @@ -201,5 +201,3 @@ TYPED_TEST(CollectSetTypedTest, CollectWithNulls) } // namespace test } // namespace cudf - -CUDF_TEST_PROGRAM_MAIN() From 483e752638be5aab1e297acbc0f895af5ddefde7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 13:08:15 -0600 Subject: [PATCH 17/22] Fix style check --- cpp/src/rolling/rolling_detail.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/rolling/rolling_detail.cuh b/cpp/src/rolling/rolling_detail.cuh index c476601aa64..42562507fa9 100644 --- a/cpp/src/rolling/rolling_detail.cuh +++ b/cpp/src/rolling/rolling_detail.cuh @@ -1370,7 +1370,8 @@ std::unique_ptr rolling_window(column_view const& input, auto input_col = cudf::is_dictionary(input.type()) ? 
dictionary_column_view(input).get_indices_annotated() : input; - auto output = cudf::type_dispatcher(input_col.type(), + + auto output = cudf::type_dispatcher(input_col.type(), dispatch_rolling{}, input_col, default_outputs, From c0dae8499a6309d67a5a8cbabdc5e976ec216726 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 15:33:27 -0600 Subject: [PATCH 18/22] Update tests for groupby collect_set --- cpp/tests/groupby/collect_set_test.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/tests/groupby/collect_set_test.cpp b/cpp/tests/groupby/collect_set_test.cpp index 214e677d3ce..e2ea80b3dae 100644 --- a/cpp/tests/groupby/collect_set_test.cpp +++ b/cpp/tests/groupby/collect_set_test.cpp @@ -58,18 +58,18 @@ TYPED_TEST(CollectSetTypedTest, ExceptionCases) EXPECT_THROW(gby.aggregate(agg_requests), cudf::logic_error); } -// TODO: Enable these tests after issue#7611 has been fixed -TYPED_TEST(CollectSetTypedTest, DISABLED_TrivialCases) +TYPED_TEST(CollectSetTypedTest, TrivialCases) { // Empty input - test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET); + // TODO: Enable this test after issue#7611 has been fixed + // test_single_agg(COL_K{}, COL_V{}, COL_K{}, COL_V{}, COLLECT_SET); // Single key input { COL_K keys{1}; COL_V vals{10}; COL_K keys_expected{1}; - LCL_V vals_expected{{10}}; + LCL_V vals_expected{LCL_V{10}}; test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); } @@ -78,7 +78,7 @@ TYPED_TEST(CollectSetTypedTest, DISABLED_TrivialCases) COL_K keys{2, 1}; COL_V vals{20, 10}; COL_K keys_expected{1, 2}; - LCL_V vals_expected{{10}, {20}}; + LCL_V vals_expected{LCL_V{10}, LCL_V{20}}; test_single_agg(keys, vals, keys_expected, vals_expected, COLLECT_SET); } } From 43731bac7cb30a3fbf73cfc5c56446b6beb7ba37 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 17 Mar 2021 21:01:26 -0600 Subject: [PATCH 19/22] Fix java binding and python binding for collect_list, and also add a simple python 
interface for collect_set --- java/src/main/native/src/AggregationJni.cpp | 2 +- python/cudf/cudf/_lib/aggregation.pyx | 11 +++++++++-- python/cudf/cudf/_lib/cpp/aggregation.pxd | 7 +++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp index aae7cb493a8..c5184111edf 100644 --- a/java/src/main/native/src/AggregationJni.cpp +++ b/java/src/main/native/src/AggregationJni.cpp @@ -206,7 +206,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createCollectAgg(JNIEnv cudf::jni::auto_set_device(env); cudf::null_policy policy = include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE; - std::unique_ptr ret = cudf::make_collect_aggregation(policy); + std::unique_ptr ret = cudf::make_collect_list_aggregation(policy); return reinterpret_cast(ret.release()); } CATCH_STD(env, 0); diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 5c6801137ae..91717823300 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -49,7 +49,8 @@ class AggregationKind(Enum): ARGMIN = libcudf_aggregation.aggregation.Kind.ARGMIN NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT - COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT + COLLECT_LIST = libcudf_aggregation.aggregation.Kind.COLLECT_LIST + COLLECT_SET = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA @@ -241,7 +242,13 @@ cdef class _AggregationFactory: @classmethod def collect(cls): cdef Aggregation agg = Aggregation.__new__(Aggregation) - agg.c_obj = move(libcudf_aggregation.make_collect_aggregation()) + agg.c_obj = move(libcudf_aggregation.make_collect_list_aggregation()) + return agg + + @classmethod + def collect_set(cls): + cdef Aggregation agg = 
Aggregation.__new__(Aggregation) + agg.c_obj = move(libcudf_aggregation.make_collect_set_aggregation()) return agg @classmethod diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index 660db29f7a9..e1f4ea13b57 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -34,7 +34,8 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: ARGMIN 'cudf::aggregation::ARGMIN' NUNIQUE 'cudf::aggregation::NUNIQUE' NTH_ELEMENT 'cudf::aggregation::NTH_ELEMENT' - COLLECT 'cudf::aggregation::COLLECT' + COLLECT_LIST 'cudf::aggregation::COLLECT_LIST' + COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' Kind kind @@ -83,7 +84,9 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: size_type n ) except + - cdef unique_ptr[aggregation] make_collect_aggregation() except + + cdef unique_ptr[aggregation] make_collect_list_aggregation() except + + + cdef unique_ptr[aggregation] make_collect_set_aggregation() except + cdef unique_ptr[aggregation] make_udf_aggregation( udf_type type, From d91d53c7ecde538da8b437bd20d6da40dec2e51f Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 18 Mar 2021 07:36:12 -0600 Subject: [PATCH 20/22] Reverse name for Kind::COLLECT enum in Python binding --- python/cudf/cudf/_lib/aggregation.pyx | 2 +- python/cudf/cudf/_lib/cpp/aggregation.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 91717823300..609c0a74997 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -49,7 +49,7 @@ class AggregationKind(Enum): ARGMIN = libcudf_aggregation.aggregation.Kind.ARGMIN NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT - COLLECT_LIST = libcudf_aggregation.aggregation.Kind.COLLECT_LIST + 
COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT_LIST COLLECT_SET = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/cpp/aggregation.pxd index e1f4ea13b57..e9836c11361 100644 --- a/python/cudf/cudf/_lib/cpp/aggregation.pxd +++ b/python/cudf/cudf/_lib/cpp/aggregation.pxd @@ -34,7 +34,7 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil: ARGMIN 'cudf::aggregation::ARGMIN' NUNIQUE 'cudf::aggregation::NUNIQUE' NTH_ELEMENT 'cudf::aggregation::NTH_ELEMENT' - COLLECT_LIST 'cudf::aggregation::COLLECT_LIST' + COLLECT 'cudf::aggregation::COLLECT_LIST' COLLECT_SET 'cudf::aggregation::COLLECT_SET' PTX 'cudf::aggregation::PTX' CUDA 'cudf::aggregation::CUDA' From b511cb5696664ad912a3069b60650c3435fc7dca Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 18 Mar 2021 15:12:37 -0600 Subject: [PATCH 21/22] Fix python binding for collect API --- python/cudf/cudf/_lib/aggregation.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 609c0a74997..840f0c98987 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -49,7 +49,7 @@ class AggregationKind(Enum): ARGMIN = libcudf_aggregation.aggregation.Kind.ARGMIN NUNIQUE = libcudf_aggregation.aggregation.Kind.NUNIQUE NTH = libcudf_aggregation.aggregation.Kind.NTH_ELEMENT - COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT_LIST + COLLECT = libcudf_aggregation.aggregation.Kind.COLLECT COLLECT_SET = libcudf_aggregation.aggregation.Kind.COLLECT_SET PTX = libcudf_aggregation.aggregation.Kind.PTX CUDA = libcudf_aggregation.aggregation.Kind.CUDA From 918eeb7a0fff07bb53383a00f10160c32691d276 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 23 Mar 2021 08:24:52 -0600 Subject: [PATCH 22/22] Rename tests 
unit --- cpp/tests/groupby/collect_set_test.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/tests/groupby/collect_set_test.cpp b/cpp/tests/groupby/collect_set_test.cpp index e2ea80b3dae..5303b8f4f61 100644 --- a/cpp/tests/groupby/collect_set_test.cpp +++ b/cpp/tests/groupby/collect_set_test.cpp @@ -47,7 +47,7 @@ using FixedWidthTypesNotBool = cudf::test::Concat; TYPED_TEST_CASE(CollectSetTypedTest, FixedWidthTypesNotBool); -TYPED_TEST(CollectSetTypedTest, ExceptionCases) +TYPED_TEST(CollectSetTypedTest, ExceptionTests) { std::vector agg_requests(1); agg_requests[0].values = COL_V{{1, 2, 3, 4, 5, 6}, {true, false, true, false, true, false}}; @@ -58,7 +58,7 @@ TYPED_TEST(CollectSetTypedTest, ExceptionCases) EXPECT_THROW(gby.aggregate(agg_requests), cudf::logic_error); } -TYPED_TEST(CollectSetTypedTest, TrivialCases) +TYPED_TEST(CollectSetTypedTest, TrivialInput) { // Empty input // TODO: Enable this test after issue#7611 has been fixed @@ -83,7 +83,7 @@ TYPED_TEST(CollectSetTypedTest, TrivialCases) } } -TYPED_TEST(CollectSetTypedTest, TypicalCases) +TYPED_TEST(CollectSetTypedTest, TypicalInput) { // Pre-sorted keys { @@ -105,7 +105,7 @@ TYPED_TEST(CollectSetTypedTest, TypicalCases) } // Keys and values columns are sliced columns -TYPED_TEST(CollectSetTypedTest, SlicedColumnsCases) +TYPED_TEST(CollectSetTypedTest, SlicedColumnsInput) { COL_K keys_original{1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3}; COL_V vals_original{10, 11, 10, 10, 20, 21, 21, 20, 30, 33, 32, 31}; @@ -125,7 +125,7 @@ TYPED_TEST(CollectSetTypedTest, SlicedColumnsCases) } } -TEST_F(CollectSetTest, StringTest) +TEST_F(CollectSetTest, StringInput) { COL_K keys{1, 2, 3, 3, 2, 1, 2, 1, 2, 1, 1, 1, 1}; COL_S vals{