From 5502ce27d7de0d3781e1f50d4c70e2334cf6eefe Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Sat, 17 Oct 2020 14:33:27 -0700 Subject: [PATCH 1/8] Fix time range window queries with null timestamps: 1. Initial commit. Fix for non-grouped time-ranges with null timestamps, in ASC order. 2. Fixed non-grouped time-ranges with null timestampw, in DESC. 3. Fixed grouped time-ranges with null timestamp, in ASC. 4. Fixed grouped time-ranges with null timestamp, in DESC. 5. Fixed grouped time-ranges for ASC (following) 6. Tests for different combinations of timestamp null grouping, timestamp ordering, and key grouping. 7. Refactor: Common code gathered to utility function. 8. Code formatting --- cpp/src/rolling/rolling.cu | 345 +++++++++++++++--- .../grouped_rolling/grouped_rolling_test.cpp | 239 ++++++++++++ 2 files changed, 543 insertions(+), 41 deletions(-) diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index 74faa4a6ed4..398d0422e5b 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -50,6 +50,10 @@ #include #include #include +#include "rmm/thrust_rmm_allocator.h" +#include "thrust/execution_policy.h" +#include "thrust/find.h" +#include "thrust/transform.h" namespace cudf { namespace detail { @@ -1283,22 +1287,80 @@ size_t multiplication_factor(cudf::data_type const& data_type) } } +// Given a single, ungrouped timestamp column, return the indices corresponding +// to the first null timestamp, and (one past) the last null timestamp. +// The input column is sorted, with all null values clustered either +// at the beginning of the column or at the end. +// If no null values are founds, null_begin and null_end are 0. +std::tuple get_null_bounds_for_timestamp_column( + column_view const& timestamp_column) +{ + auto p_timestamps_device_view = column_device_view::create(timestamp_column); + auto timestamp_begin = thrust::make_counting_iterator(0); + auto timestamp_end = thrust::make_counting_iterator(timestamp_column.size()); + + // TODO: Handle case where timestamp_column is not nullable. + + auto nulls_begin = timestamp_column.has_nulls() + ? thrust::find_if(thrust::device, + timestamp_begin, + timestamp_end, + [timestamps = *p_timestamps_device_view] __device__( + auto i) { return timestamps.is_null_nocheck(i); }) + : timestamp_begin; + auto nulls_begin_idx = nulls_begin - timestamp_begin; + + auto nulls_end = timestamp_column.has_nulls() + ? thrust::find_if(thrust::device, + nulls_begin, + timestamp_end, + [timestamps = *p_timestamps_device_view] __device__(auto i) { + return !timestamps.is_null_nocheck(i); + }) + : timestamp_begin; + auto nulls_end_idx = nulls_end - timestamp_begin; + + // Sanity checks: NULL values must be grouped together. + // Either NULLS FIRST or NULLS LAST. + CUDF_EXPECTS(!timestamp_column.has_nulls() || nulls_begin_idx == 0 || + nulls_end_idx == timestamp_column.size(), + "Expected nulls in timestamp column to be grouped together."); + + return std::make_tuple(nulls_begin_idx, nulls_end_idx); +} + // Time-range window computation, with // 1. no grouping keys specified // 2. timetamps in ASCENDING order. // Treat as one single group. -template +template std::unique_ptr time_range_window_ASC(column_view const& input, column_view const& timestamp_column, - TimestampImpl_t preceding_window, - TimestampImpl_t following_window, + TimeT preceding_window, + TimeT following_window, size_type min_periods, std::unique_ptr const& aggr, rmm::mr::device_memory_resource* mr) { - auto preceding_calculator = [d_timestamps = timestamp_column.data(), - preceding_window] __device__(size_type idx) { - auto group_start = 0; + size_type nulls_begin_idx, nulls_end_idx; + std::tie(nulls_begin_idx, nulls_end_idx) = get_null_bounds_for_timestamp_column(timestamp_column); + + auto preceding_calculator = [nulls_begin_idx, + nulls_end_idx, + d_timestamps = timestamp_column.data(), + preceding_window] __device__(size_type idx) -> size_type { + if (idx >= nulls_begin_idx && idx < nulls_end_idx) { + // Current row is in the null group. + // Must consider beginning of null-group as window start. + return idx - nulls_begin_idx + 1; + } + + // timestamp[idx] not null. Binary search the group, excluding null group. + // If nulls_begin_idx == 0, either + // 1. NULLS FIRST ordering: Binary search starts where nulls_end_idx. + // 2. NO NULLS: Binary search starts at 0 (also nulls_end_idx). + // Otherwise, NULLS LAST ordering. Start at 0. + auto group_start = nulls_begin_idx == 0 ? nulls_end_idx : 0; auto lowest_timestamp_in_window = d_timestamps[idx] - preceding_window; return ((d_timestamps + idx) - thrust::lower_bound(thrust::seq, @@ -1308,10 +1370,24 @@ std::unique_ptr time_range_window_ASC(column_view const& input, 1; // Add 1, for `preceding` to account for current row. }; - auto following_calculator = [num_rows = input.size(), - d_timestamps = timestamp_column.data(), - following_window] __device__(size_type idx) { - auto group_end = num_rows; + auto following_calculator = [nulls_begin_idx, + nulls_end_idx, + num_rows = input.size(), + d_timestamps = timestamp_column.data(), + following_window] __device__(size_type idx) -> size_type { + if (idx >= nulls_begin_idx && idx < nulls_end_idx) { + // Current row is in the null group. + // Window ends at the end of the null group. + return nulls_end_idx - idx - 1; + } + + // timestamp[idx] not null. Binary search the group, excluding null group. + // If nulls_begin_idx == 0, either + // 1. NULLS FIRST ordering: Binary search ends at num_rows. + // 2. NO NULLS: Binary search also ends at num_rows. + // Otherwise, NULLS LAST ordering. End at nulls_begin_idx. + + auto group_end = nulls_begin_idx == 0 ? num_rows : nulls_begin_idx; auto highest_timestamp_in_window = d_timestamps[idx] + following_window; return (thrust::upper_bound(thrust::seq, @@ -1334,29 +1410,126 @@ std::unique_ptr time_range_window_ASC(column_view const& input, mr); } +// Given a timestamp column grouped as specified in group_offsets, +// return the following two vectors: +// 1. Vector with one entry per group, indicating the offset in the group +// where the null values begin. +// 2. Vector with one entry per group, indicating the offset in the group +// where the null values end. (i.e. 1 past the last null.) +// Each group in the input timestamp column must be sorted, +// with null values clustered at either the start or the end of each group. +// If there are no nulls for any given group, (nulls_begin, nulls_end) == (0,0). +std::tuple, rmm::device_vector> +get_null_bounds_for_timestamp_column(column_view const& timestamp_column, + rmm::device_vector const& group_offsets) +{ + // For each group, the null values are themselves clustered + // at the beginning or the end of the group. + // These nulls cannot participate, except in their own window. + + // If the input has n groups, group_offsets will have n+1 values. + // null_start and null_end should eventually have 1 entry per group. + auto null_start = rmm::device_vector(group_offsets.begin(), group_offsets.end() - 1); + auto null_end = rmm::device_vector(group_offsets.begin(), group_offsets.end() - 1); + if (timestamp_column.has_nulls()) { + auto p_timestamps_device_view = column_device_view::create(timestamp_column); + size_type num_rows = timestamp_column.size(); + + // For each group_label i, find the offsets in the timestamp column + // for the beginning group of null timestamps. + // If no nulls in the group, point to the beginning of the group. + thrust::transform( + thrust::device, + thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(static_cast(group_offsets.size() - 1)), + null_start.begin(), + [d_timestamps = *p_timestamps_device_view, + d_group_offsets = group_offsets.data().get()] __device__(auto group_label) { + auto group_start = d_group_offsets[group_label]; + auto group_end = d_group_offsets[group_label + 1]; + auto nulls_begin = + thrust::find_if(thrust::seq, + thrust::make_counting_iterator(group_start), + thrust::make_counting_iterator(group_end), + [&d_timestamps](auto i) { return d_timestamps.is_null_nocheck(i); }); + return nulls_begin == thrust::make_counting_iterator(group_end) // i.e. No nulls in group. + ? group_start + : group_start + (nulls_begin - thrust::make_counting_iterator(group_start)); + }); + + // TODO: FIXME: Attempt to do this in one scan. + // Now, for each group_label i, find the end of the null group. + // (i.e. the first non-null following the first null). + // If no nulls in the group, point to the beginning of the group. + thrust::transform( + thrust::device, + thrust::make_counting_iterator(static_cast(0)), + thrust::make_counting_iterator(static_cast(group_offsets.size() - 1)), + null_end.begin(), + [d_timestamps = *p_timestamps_device_view, + d_group_offsets = group_offsets.data().get(), + d_null_start = null_start.data().get()] __device__(auto group_label) { + auto group_start = d_group_offsets[group_label]; + auto nulls_begin = d_null_start[group_label]; + auto group_end = d_group_offsets[group_label + 1]; + auto nulls_end = + thrust::find_if(thrust::seq, + thrust::make_counting_iterator(nulls_begin), + thrust::make_counting_iterator(group_end), + [&d_timestamps](auto i) { return !d_timestamps.is_null_nocheck(i); }); + return group_start + (nulls_end - thrust::make_counting_iterator(group_start)); + }); + } + + return std::make_pair(std::move(null_start), std::move(null_end)); +} + // Time-range window computation, for timestamps in ASCENDING order. -template +template std::unique_ptr time_range_window_ASC( column_view const& input, column_view const& timestamp_column, rmm::device_vector const& group_offsets, rmm::device_vector const& group_labels, - TimestampImpl_t preceding_window, - TimestampImpl_t following_window, + TimeT preceding_window, + TimeT following_window, size_type min_periods, std::unique_ptr const& aggr, rmm::mr::device_memory_resource* mr) { + rmm::device_vector null_start, null_end; + std::tie(null_start, null_end) = + get_null_bounds_for_timestamp_column(timestamp_column, group_offsets); + auto preceding_calculator = [d_group_offsets = group_offsets.data().get(), d_group_labels = group_labels.data().get(), - d_timestamps = timestamp_column.data(), - preceding_window] __device__(size_type idx) { - auto group_label = d_group_labels[idx]; - auto group_start = d_group_offsets[group_label]; + d_timestamps = timestamp_column.data(), + d_nulls_begin = null_start.data().get(), + d_nulls_end = null_end.data().get(), + preceding_window] __device__(size_type idx) -> size_type { + auto group_label = d_group_labels[idx]; + auto group_start = d_group_offsets[group_label]; + auto nulls_begin = d_nulls_begin[group_label]; + auto nulls_end = d_nulls_end[group_label]; + + // If idx lies in the null-range, the window is the null range. + if (idx >= nulls_begin && idx < nulls_end) { + // Current row is in the null group. + // The window starts at the start of the null group. + return idx - nulls_begin + 1; + } + + // timestamp[idx] not null. Search must exclude the null group. + // If nulls_begin == group_start, either of the following is true: + // 1. NULLS FIRST ordering: Search must begin at nulls_end. + // 2. NO NULLS: Search must begin at group_start (which also equals nulls_end.) + // Otherwise, NULLS LAST ordering. Search must start at nulls group_start. + auto search_start = nulls_begin == group_start ? nulls_end : group_start; + auto lowest_timestamp_in_window = d_timestamps[idx] - preceding_window; return ((d_timestamps + idx) - thrust::lower_bound(thrust::seq, - d_timestamps + group_start, + d_timestamps + search_start, d_timestamps + idx, lowest_timestamp_in_window)) + 1; // Add 1, for `preceding` to account for current row. @@ -1364,17 +1537,37 @@ std::unique_ptr time_range_window_ASC( auto following_calculator = [d_group_offsets = group_offsets.data().get(), d_group_labels = group_labels.data().get(), - d_timestamps = timestamp_column.data(), - following_window] __device__(size_type idx) { + d_timestamps = timestamp_column.data(), + d_nulls_begin = null_start.data().get(), + d_nulls_end = null_end.data().get(), + following_window] __device__(size_type idx) -> size_type { auto group_label = d_group_labels[idx]; + auto group_start = d_group_offsets[group_label]; auto group_end = d_group_offsets[group_label + 1]; // Cannot fall off the end, since offsets is capped with `input.size()`. + auto nulls_begin = d_nulls_begin[group_label]; + auto nulls_end = d_nulls_end[group_label]; + + // If idx lies in the null-range, the window is the null range. + if (idx >= nulls_begin && idx < nulls_end) { + // Current row is in the null group. + // The window ends at the end of the null group. + return nulls_end - idx - 1; + } + + // timestamp[idx] not null. Search must exclude the null group. + // If nulls_begin == group_start, either of the following is true: + // 1. NULLS FIRST ordering: Search ends at group_end. + // 2. NO NULLS: Search ends at group_end. + // Otherwise, NULLS LAST ordering. Search ends at nulls_begin. + auto search_end = nulls_begin == group_start ? group_end : nulls_begin; + auto highest_timestamp_in_window = d_timestamps[idx] + following_window; return (thrust::upper_bound(thrust::seq, d_timestamps + idx, - d_timestamps + group_end, + d_timestamps + search_end, highest_timestamp_in_window) - (d_timestamps + idx)) - 1; @@ -1396,18 +1589,34 @@ std::unique_ptr time_range_window_ASC( // 1. no grouping keys specified // 2. timetamps in DESCENDING order. // Treat as one single group. -template +template std::unique_ptr time_range_window_DESC(column_view const& input, column_view const& timestamp_column, - TimestampImpl_t preceding_window, - TimestampImpl_t following_window, + TimeT preceding_window, + TimeT following_window, size_type min_periods, std::unique_ptr const& aggr, rmm::mr::device_memory_resource* mr) { - auto preceding_calculator = [d_timestamps = timestamp_column.data(), - preceding_window] __device__(size_type idx) { - auto group_start = 0; + size_type nulls_begin_idx, nulls_end_idx; + std::tie(nulls_begin_idx, nulls_end_idx) = get_null_bounds_for_timestamp_column(timestamp_column); + + auto preceding_calculator = [nulls_begin_idx, + nulls_end_idx, + d_timestamps = timestamp_column.data(), + preceding_window] __device__(size_type idx) -> size_type { + if (idx >= nulls_begin_idx && idx < nulls_end_idx) { + // Current row is in the null group. + // Must consider beginning of null-group as window start. + return idx - nulls_begin_idx + 1; + } + + // timestamp[idx] not null. Binary search the group, excluding null group. + // If nulls_begin_idx == 0, either + // 1. NULLS FIRST ordering: Binary search starts where nulls_end_idx. + // 2. NO NULLS: Binary search starts at 0 (also nulls_end_idx). + // Otherwise, NULLS LAST ordering. Start at 0. + auto group_start = nulls_begin_idx == 0 ? nulls_end_idx : 0; auto highest_timestamp_in_window = d_timestamps[idx] + preceding_window; return ((d_timestamps + idx) - @@ -1419,11 +1628,24 @@ std::unique_ptr time_range_window_DESC(column_view const& input, 1; // Add 1, for `preceding` to account for current row. }; - auto following_calculator = [num_rows = input.size(), - d_timestamps = timestamp_column.data(), - following_window] __device__(size_type idx) { - auto group_end = - num_rows; // Cannot fall off the end, since offsets is capped with `input.size()`. + auto following_calculator = [nulls_begin_idx, + nulls_end_idx, + num_rows = input.size(), + d_timestamps = timestamp_column.data(), + following_window] __device__(size_type idx) -> size_type { + if (idx >= nulls_begin_idx && idx < nulls_end_idx) { + // Current row is in the null group. + // Window ends at the end of the null group. + return nulls_end_idx - idx - 1; + } + + // timestamp[idx] not null. Search must exclude null group. + // If nulls_begin_idx = 0, either + // 1. NULLS FIRST ordering: Search ends at num_rows. + // 2. NO NULLS: Search also ends at num_rows. + // Otherwise, NULLS LAST ordering: End at nulls_begin_idx. + + auto group_end = nulls_begin_idx == 0 ? num_rows : nulls_begin_idx; auto lowest_timestamp_in_window = d_timestamps[idx] - following_window; return (thrust::upper_bound(thrust::seq, @@ -1460,17 +1682,40 @@ std::unique_ptr time_range_window_DESC( std::unique_ptr const& aggr, rmm::mr::device_memory_resource* mr) { + rmm::device_vector null_start, null_end; + std::tie(null_start, null_end) = + get_null_bounds_for_timestamp_column(timestamp_column, group_offsets); + auto preceding_calculator = [d_group_offsets = group_offsets.data().get(), d_group_labels = group_labels.data().get(), d_timestamps = timestamp_column.data(), - preceding_window] __device__(size_type idx) { - auto group_label = d_group_labels[idx]; - auto group_start = d_group_offsets[group_label]; + d_nulls_begin = null_start.data().get(), + d_nulls_end = null_end.data().get(), + preceding_window] __device__(size_type idx) -> size_type { + auto group_label = d_group_labels[idx]; + auto group_start = d_group_offsets[group_label]; + auto nulls_begin = d_nulls_begin[group_label]; + auto nulls_end = d_nulls_end[group_label]; + + // If idx lies in the null-range, the window is the null range. + if (idx >= nulls_begin && idx < nulls_end) { + // Current row is in the null group. + // The window starts at the start of the null group. + return idx - nulls_begin + 1; + } + + // timestamp[idx] not null. Search must exclude the null group. + // If nulls_begin == group_start, either of the following is true: + // 1. NULLS FIRST ordering: Search must begin at nulls_end. + // 2. NO NULLS: Search must begin at group_start (which also equals nulls_end.) + // Otherwise, NULLS LAST ordering. Search must start at nulls group_start. + auto search_start = nulls_begin == group_start ? nulls_end : group_start; + auto highest_timestamp_in_window = d_timestamps[idx] + preceding_window; return ((d_timestamps + idx) - thrust::lower_bound(thrust::seq, - d_timestamps + group_start, + d_timestamps + search_start, d_timestamps + idx, highest_timestamp_in_window, thrust::greater())) + @@ -1480,16 +1725,34 @@ std::unique_ptr time_range_window_DESC( auto following_calculator = [d_group_offsets = group_offsets.data().get(), d_group_labels = group_labels.data().get(), d_timestamps = timestamp_column.data(), - following_window] __device__(size_type idx) { + d_nulls_begin = null_start.data().get(), + d_nulls_end = null_end.data().get(), + following_window] __device__(size_type idx) -> size_type { auto group_label = d_group_labels[idx]; - auto group_end = - d_group_offsets[group_label + - 1]; // Cannot fall off the end, since offsets is capped with `input.size()`. + auto group_start = d_group_offsets[group_label]; + auto group_end = d_group_offsets[group_label + 1]; + auto nulls_begin = d_nulls_begin[group_label]; + auto nulls_end = d_nulls_end[group_label]; + + // If idx lies in the null-range, the window is the null range. + if (idx >= nulls_begin && idx < nulls_end) { + // Current row is in the null group. + // The window ends at the end of the null group. + return nulls_end - idx - 1; + } + + // timestamp[idx] not null. Search must exclude the null group. + // If nulls_begin == group_start, either of the following is true: + // 1. NULLS FIRST ordering: Search ends at group_end. + // 2. NO NULLS: Search ends at group_end. + // Otherwise, NULLS LAST ordering. Search ends at nulls_begin. + auto search_end = nulls_begin == group_start ? group_end : nulls_begin; + auto lowest_timestamp_in_window = d_timestamps[idx] - following_window; return (thrust::upper_bound(thrust::seq, d_timestamps + idx, - d_timestamps + group_end, + d_timestamps + search_end, lowest_timestamp_in_window, thrust::greater()) - (d_timestamps + idx)) - diff --git a/cpp/tests/grouped_rolling/grouped_rolling_test.cpp b/cpp/tests/grouped_rolling/grouped_rolling_test.cpp index 64ce1ef7930..c9d4c23c5f3 100644 --- a/cpp/tests/grouped_rolling/grouped_rolling_test.cpp +++ b/cpp/tests/grouped_rolling/grouped_rolling_test.cpp @@ -1235,4 +1235,243 @@ TYPED_TEST(GroupedTimeRangeRollingTest, SimplePartitionedStaticWindowsWithNoGrou 1); } +template +struct TypedNullTimestampTestForRangeQueries : public cudf::test::BaseFixture { +}; + +struct NullTimestampTestForRangeQueries : public cudf::test::BaseFixture { +}; + +TYPED_TEST_CASE(TypedNullTimestampTestForRangeQueries, cudf::test::IntegralTypes); + +TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountSingleGroupTimestampASCNullsFirst) +{ + using namespace cudf::test; + using T = TypeParam; + + auto const grp_col = fixed_width_column_wrapper{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + auto const agg_col = + fixed_width_column_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}}; + auto const time_col = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const preceding = 1L; + auto const following = 1L; + auto const min_periods = 1L; + auto const output = cudf::grouped_time_range_rolling_window(grouping_keys, + time_col, + cudf::order::ASCENDING, + agg_col, + preceding, + following, + min_periods, + cudf::make_count_aggregation()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), + fixed_width_column_wrapper{ + {4, 4, 4, 4, 1, 2, 2, 3, 3, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); +} + +TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountSingleGroupTimestampASCNullsLast) +{ + using namespace cudf::test; + using T = TypeParam; + + auto const grp_col = fixed_width_column_wrapper{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + auto const agg_col = + fixed_width_column_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}}; + + auto const time_col = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 1, 0, 0, 0, 0}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const preceding = 1L; + auto const following = 1L; + auto const min_periods = 1L; + auto const output = cudf::grouped_time_range_rolling_window(grouping_keys, + time_col, + cudf::order::ASCENDING, + agg_col, + preceding, + following, + min_periods, + cudf::make_count_aggregation()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), + fixed_width_column_wrapper{ + {2, 3, 3, 3, 2, 1, 4, 4, 4, 4}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); +} + +TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupTimestampASCNullsFirst) +{ + using namespace cudf::test; + using T = TypeParam; + + auto const grp_col = fixed_width_column_wrapper{0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; + auto const agg_col = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto const time_col = fixed_width_column_wrapper{ + {1, 2, 2, 1, 2, 1, 2, 3, 4, 5}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 1}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const preceding = 1L; + auto const following = 1L; + auto const min_periods = 1L; + auto const output = cudf::grouped_time_range_rolling_window(grouping_keys, + time_col, + cudf::order::ASCENDING, + agg_col, + preceding, + following, + min_periods, + cudf::make_count_aggregation()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), + fixed_width_column_wrapper{ + {3, 3, 3, 2, 2, 2, 2, 2, 3, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); +} + +TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupTimestampASCNullsLast) +{ + using namespace cudf::test; + using T = int32_t; + + auto const grp_col = fixed_width_column_wrapper{0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; + auto const agg_col = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto const time_col = fixed_width_column_wrapper{ + {1, 2, 2, 1, 3, 1, 2, 3, 4, 5}, {1, 1, 1, 0, 0, 1, 1, 1, 0, 0}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const preceding = 1L; + auto const following = 1L; + auto const min_periods = 1L; + auto const output = cudf::grouped_time_range_rolling_window(grouping_keys, + time_col, + cudf::order::ASCENDING, + agg_col, + preceding, + following, + min_periods, + cudf::make_count_aggregation()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), + fixed_width_column_wrapper{ + {3, 3, 3, 2, 2, 2, 3, 2, 2, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); +} + +TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountSingleGroupTimestampDESCNullsFirst) +{ + using namespace cudf::test; + using T = TypeParam; + + auto const grp_col = fixed_width_column_wrapper{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + auto const agg_col = + fixed_width_column_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}}; + auto const time_col = fixed_width_column_wrapper{ + {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const preceding = 1L; + auto const following = 1L; + auto const min_periods = 1L; + auto const output = cudf::grouped_time_range_rolling_window(grouping_keys, + time_col, + cudf::order::DESCENDING, + agg_col, + preceding, + following, + min_periods, + cudf::make_count_aggregation()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), + fixed_width_column_wrapper{ + {4, 4, 4, 4, 1, 2, 2, 3, 3, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); +} + +TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountSingleGroupTimestampDESCNullsLast) +{ + using namespace cudf::test; + using T = TypeParam; + + auto const grp_col = fixed_width_column_wrapper{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + auto const agg_col = + fixed_width_column_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}}; + + auto const time_col = fixed_width_column_wrapper{ + {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, {1, 1, 1, 1, 1, 1, 0, 0, 0, 0}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const preceding = 1L; + auto const following = 1L; + auto const min_periods = 1L; + auto const output = cudf::grouped_time_range_rolling_window(grouping_keys, + time_col, + cudf::order::DESCENDING, + agg_col, + preceding, + following, + min_periods, + cudf::make_count_aggregation()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), + fixed_width_column_wrapper{ + {2, 3, 3, 3, 2, 1, 4, 4, 4, 4}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); +} + +TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupTimestampDESCNullsFirst) +{ + using namespace cudf::test; + using T = TypeParam; + + auto const grp_col = fixed_width_column_wrapper{0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; + auto const agg_col = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto const time_col = fixed_width_column_wrapper{ + {4, 3, 2, 1, 0, 9, 8, 7, 6, 5}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 1}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const preceding = 1L; + auto const following = 1L; + auto const min_periods = 1L; + auto const output = cudf::grouped_time_range_rolling_window(grouping_keys, + time_col, + cudf::order::DESCENDING, + agg_col, + preceding, + following, + min_periods, + cudf::make_count_aggregation()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), + fixed_width_column_wrapper{ + {3, 3, 3, 2, 2, 2, 2, 2, 3, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); +} + +TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupTimestampDESCNullsLast) +{ + using namespace cudf::test; + using T = int32_t; + + auto const grp_col = fixed_width_column_wrapper{0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; + auto const agg_col = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + auto const time_col = fixed_width_column_wrapper{ + {4, 3, 2, 1, 0, 9, 8, 7, 6, 5}, {1, 1, 1, 0, 0, 1, 1, 1, 0, 0}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const preceding = 1L; + auto const following = 1L; + auto const min_periods = 1L; + auto const output = cudf::grouped_time_range_rolling_window(grouping_keys, + time_col, + cudf::order::DESCENDING, + agg_col, + preceding, + following, + min_periods, + cudf::make_count_aggregation()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), + fixed_width_column_wrapper{ + {2, 3, 2, 2, 2, 2, 3, 2, 2, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); +} + CUDF_TEST_PROGRAM_MAIN() From 72f20f338fe1f6db8a0422e30195af8f51cbf06b Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 19 Oct 2020 15:50:27 -0700 Subject: [PATCH 2/8] Fix time range window queries with null timestamps Added to CHANGELOG.md. --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 45e85e88b8c..a238203b2fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ - PR #6485 Add File IO to cuIO benchmarks - PR #6504 Update Java bindings version to 0.17-SNAPSHOT - PR #6527 Refactor DeviceColumnViewAccess to avoid JNI returning an array +- PR #6557 Support nullable timestamp columns in time range window functions ## Bug Fixes From 25aac848312f7413ca3590c807c40c11fec0153e Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Wed, 21 Oct 2020 20:13:53 -0700 Subject: [PATCH 3/8] Fix time range window queries with null timestamps 1. Non-groupby case: Use null-count to calculate null bounds 2. Groupby case: Use partition_point() --- cpp/src/rolling/rolling.cu | 114 +++++++++++++------------------------ 1 file changed, 41 insertions(+), 73 deletions(-) diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index 398d0422e5b..42f5e3194a4 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -1295,38 +1295,18 @@ size_t multiplication_factor(cudf::data_type const& data_type) std::tuple get_null_bounds_for_timestamp_column( column_view const& timestamp_column) { - auto p_timestamps_device_view = column_device_view::create(timestamp_column); - auto timestamp_begin = thrust::make_counting_iterator(0); - auto timestamp_end = thrust::make_counting_iterator(timestamp_column.size()); - - // TODO: Handle case where timestamp_column is not nullable. - - auto nulls_begin = timestamp_column.has_nulls() - ? thrust::find_if(thrust::device, - timestamp_begin, - timestamp_end, - [timestamps = *p_timestamps_device_view] __device__( - auto i) { return timestamps.is_null_nocheck(i); }) - : timestamp_begin; - auto nulls_begin_idx = nulls_begin - timestamp_begin; - - auto nulls_end = timestamp_column.has_nulls() - ? thrust::find_if(thrust::device, - nulls_begin, - timestamp_end, - [timestamps = *p_timestamps_device_view] __device__(auto i) { - return !timestamps.is_null_nocheck(i); - }) - : timestamp_begin; - auto nulls_end_idx = nulls_end - timestamp_begin; - - // Sanity checks: NULL values must be grouped together. - // Either NULLS FIRST or NULLS LAST. - CUDF_EXPECTS(!timestamp_column.has_nulls() || nulls_begin_idx == 0 || - nulls_end_idx == timestamp_column.size(), - "Expected nulls in timestamp column to be grouped together."); - - return std::make_tuple(nulls_begin_idx, nulls_end_idx); + auto num_rows = timestamp_column.size(); + auto first_row_is_null = timestamp_column.null_count(0, 1) == 1; + auto last_row_is_null = timestamp_column.null_count(num_rows - 1, num_rows) == 1; + + if (!timestamp_column.nullable() || (!first_row_is_null && !last_row_is_null)) { + // Neither first nor last rows are null. No nulls here. + return std::make_tuple(0, 0); + } else if (first_row_is_null) { + return std::make_tuple(0, timestamp_column.null_count()); + } else { + return std::make_tuple(num_rows - timestamp_column.null_count(), num_rows); + } } // Time-range window computation, with @@ -1431,57 +1411,45 @@ get_null_bounds_for_timestamp_column(column_view const& timestamp_column, // null_start and null_end should eventually have 1 entry per group. auto null_start = rmm::device_vector(group_offsets.begin(), group_offsets.end() - 1); auto null_end = rmm::device_vector(group_offsets.begin(), group_offsets.end() - 1); + if (timestamp_column.has_nulls()) { auto p_timestamps_device_view = column_device_view::create(timestamp_column); - size_type num_rows = timestamp_column.size(); - - // For each group_label i, find the offsets in the timestamp column - // for the beginning group of null timestamps. - // If no nulls in the group, point to the beginning of the group. - thrust::transform( - thrust::device, - thrust::make_counting_iterator(static_cast(0)), - thrust::make_counting_iterator(static_cast(group_offsets.size() - 1)), - null_start.begin(), - [d_timestamps = *p_timestamps_device_view, - d_group_offsets = group_offsets.data().get()] __device__(auto group_label) { - auto group_start = d_group_offsets[group_label]; - auto group_end = d_group_offsets[group_label + 1]; - auto nulls_begin = - thrust::find_if(thrust::seq, - thrust::make_counting_iterator(group_start), - thrust::make_counting_iterator(group_end), - [&d_timestamps](auto i) { return d_timestamps.is_null_nocheck(i); }); - return nulls_begin == thrust::make_counting_iterator(group_end) // i.e. No nulls in group. - ? group_start - : group_start + (nulls_begin - thrust::make_counting_iterator(group_start)); - }); + auto num_groups = group_offsets.size(); - // TODO: FIXME: Attempt to do this in one scan. - // Now, for each group_label i, find the end of the null group. - // (i.e. the first non-null following the first null). - // If no nulls in the group, point to the beginning of the group. - thrust::transform( + // Null timestamps exist. Find null bounds, per group. + thrust::for_each( thrust::device, thrust::make_counting_iterator(static_cast(0)), - thrust::make_counting_iterator(static_cast(group_offsets.size() - 1)), - null_end.begin(), + thrust::make_counting_iterator(static_cast(num_groups)), [d_timestamps = *p_timestamps_device_view, d_group_offsets = group_offsets.data().get(), - d_null_start = null_start.data().get()] __device__(auto group_label) { - auto group_start = d_group_offsets[group_label]; - auto nulls_begin = d_null_start[group_label]; - auto group_end = d_group_offsets[group_label + 1]; - auto nulls_end = - thrust::find_if(thrust::seq, - thrust::make_counting_iterator(nulls_begin), - thrust::make_counting_iterator(group_end), - [&d_timestamps](auto i) { return !d_timestamps.is_null_nocheck(i); }); - return group_start + (nulls_end - thrust::make_counting_iterator(group_start)); + d_null_start = null_start.data(), + d_null_end = null_end.data()] __device__(auto group_label) { + auto group_start = d_group_offsets[group_label]; + auto group_end = d_group_offsets[group_label + 1]; + auto first_element_is_null = d_timestamps.is_null_nocheck(group_start); + auto last_element_is_null = d_timestamps.is_null_nocheck(group_end); + if (first_element_is_null) { + // NULLS FIRST. + d_null_start[group_label] = group_start; + d_null_end[group_label] = *thrust::partition_point( + thrust::seq, + thrust::make_counting_iterator(group_start), + thrust::make_counting_iterator(group_end), + [&d_timestamps] __device__(auto i) { return d_timestamps.is_null_nocheck(i); }); + } else { + // NULLS LAST. + d_null_end[group_label] = group_end; + d_null_start[group_label] = *thrust::partition_point( + thrust::seq, + thrust::make_counting_iterator(group_start), + thrust::make_counting_iterator(group_end), + [&d_timestamps] __device__(auto i) { return d_timestamps.is_valid_nocheck(i); }); + } }); } - return std::make_pair(std::move(null_start), std::move(null_end)); + return std::make_tuple(std::move(null_start), std::move(null_end)); } // Time-range window computation, for timestamps in ASCENDING order. From 7f5a2359a90bfcffe2dbd55e4af8ab3989d304c8 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Fri, 23 Oct 2020 12:59:00 -0700 Subject: [PATCH 4/8] Fix time range window queries with null timestamps 1. Short circuit eval for all-null/no-null cases. 2. Test cases for all-null/no-null. 3. Code formatting --- cpp/src/rolling/rolling.cu | 25 ++++++-- .../grouped_rolling/grouped_rolling_test.cpp | 63 ++++++++++++++++++- 2 files changed, 82 insertions(+), 6 deletions(-) diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index 42f5e3194a4..56aa45800c1 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -1295,7 +1295,14 @@ size_t multiplication_factor(cudf::data_type const& data_type) std::tuple get_null_bounds_for_timestamp_column( column_view const& timestamp_column) { - auto num_rows = timestamp_column.size(); + auto num_rows = timestamp_column.size(); + auto num_nulls = timestamp_column.null_count(); + + if (num_nulls == num_rows) { + // Short-circuit: All nulls. + return std::make_tuple(0, num_rows); + } + auto first_row_is_null = timestamp_column.null_count(0, 1) == 1; auto last_row_is_null = timestamp_column.null_count(num_rows - 1, num_rows) == 1; @@ -1303,9 +1310,9 @@ std::tuple get_null_bounds_for_timestamp_column( // Neither first nor last rows are null. No nulls here. return std::make_tuple(0, 0); } else if (first_row_is_null) { - return std::make_tuple(0, timestamp_column.null_count()); + return std::make_tuple(0, num_nulls); } else { - return std::make_tuple(num_rows - timestamp_column.null_count(), num_rows); + return std::make_tuple(num_rows - num_nulls, num_rows); } } @@ -1428,8 +1435,16 @@ get_null_bounds_for_timestamp_column(column_view const& timestamp_column, auto group_start = d_group_offsets[group_label]; auto group_end = d_group_offsets[group_label + 1]; auto first_element_is_null = d_timestamps.is_null_nocheck(group_start); - auto last_element_is_null = d_timestamps.is_null_nocheck(group_end); - if (first_element_is_null) { + auto last_element_is_null = d_timestamps.is_null_nocheck(group_end - 1); + if (!first_element_is_null && !last_element_is_null) { + // Short circuit: No nulls. + d_null_start[group_label] = group_start; + d_null_end[group_label] = group_start; + } else if (first_element_is_null && last_element_is_null) { + // Short circuit: All nulls. + d_null_start[group_label] = group_start; + d_null_end[group_label] = group_end; + } else if (first_element_is_null) { // NULLS FIRST. d_null_start[group_label] = group_start; d_null_end[group_label] = *thrust::partition_point( diff --git a/cpp/tests/grouped_rolling/grouped_rolling_test.cpp b/cpp/tests/grouped_rolling/grouped_rolling_test.cpp index c9d4c23c5f3..c5ca1ae0de0 100644 --- a/cpp/tests/grouped_rolling/grouped_rolling_test.cpp +++ b/cpp/tests/grouped_rolling/grouped_rolling_test.cpp @@ -1449,7 +1449,7 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupTimestampDESCNu TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupTimestampDESCNullsLast) { using namespace cudf::test; - using T = int32_t; + using T = TypeParam; auto const grp_col = fixed_width_column_wrapper{0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; auto const agg_col = fixed_width_column_wrapper{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; @@ -1474,4 +1474,65 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupTimestampDESCNu {2, 3, 2, 2, 2, 2, 3, 2, 2, 2}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); } +TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountSingleGroupAllNullTimestamps) +{ + using namespace cudf::test; + using T = TypeParam; + + auto const grp_col = fixed_width_column_wrapper{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + auto const agg_col = + fixed_width_column_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}}; + + auto const time_col = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const preceding = 1L; + auto const following = 1L; + auto const min_periods = 1L; + auto const output = cudf::grouped_time_range_rolling_window(grouping_keys, + time_col, + cudf::order::ASCENDING, + agg_col, + preceding, + following, + min_periods, + cudf::make_count_aggregation()); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), + fixed_width_column_wrapper{ + {9, 9, 9, 9, 9, 9, 9, 9, 9, 9}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); +} + +TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupAllNullTimestamps) +{ + using namespace cudf::test; + using T = TypeParam; + + auto const grp_col = fixed_width_column_wrapper{0, 0, 0, 0, 0, 1, 1, 1, 1, 1}; + auto const agg_col = + fixed_width_column_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}}; + + auto const time_col = fixed_width_column_wrapper{ + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}}; + + auto const grouping_keys = cudf::table_view{std::vector{grp_col}}; + auto const preceding = 1L; + auto const following = 1L; + auto const min_periods = 1L; + auto const output = cudf::grouped_time_range_rolling_window(grouping_keys, + time_col, + cudf::order::ASCENDING, + agg_col, + preceding, + following, + min_periods, + cudf::make_count_aggregation()); + + print(output->view()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), + fixed_width_column_wrapper{ + {2, 3, 3, 3, 2, 4, 4, 4, 4, 4}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); +} + CUDF_TEST_PROGRAM_MAIN() From d93fb8feefbd7c416540ebaa5b77dcef21d03a86 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 26 Oct 2020 11:31:00 -0700 Subject: [PATCH 5/8] Fix time range window queries with null timestamps 1. Fixed angle-brackets for header inclusion. 2. Switched to doxygen-style function documentation --- cpp/src/rolling/rolling.cu | 52 +++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index 56aa45800c1..31804ca3103 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -50,10 +50,10 @@ #include #include #include -#include "rmm/thrust_rmm_allocator.h" -#include "thrust/execution_policy.h" -#include "thrust/find.h" -#include "thrust/transform.h" +#include +#include +#include +#include namespace cudf { namespace detail { @@ -1287,11 +1287,11 @@ size_t multiplication_factor(cudf::data_type const& data_type) } } -// Given a single, ungrouped timestamp column, return the indices corresponding -// to the first null timestamp, and (one past) the last null timestamp. -// The input column is sorted, with all null values clustered either -// at the beginning of the column or at the end. -// If no null values are founds, null_begin and null_end are 0. +/// Given a single, ungrouped timestamp column, return the indices corresponding +/// to the first null timestamp, and (one past) the last null timestamp. +/// The input column is sorted, with all null values clustered either +/// at the beginning of the column or at the end. +/// If no null values are founds, null_begin and null_end are 0. std::tuple get_null_bounds_for_timestamp_column( column_view const& timestamp_column) { @@ -1316,10 +1316,10 @@ std::tuple get_null_bounds_for_timestamp_column( } } -// Time-range window computation, with -// 1. no grouping keys specified -// 2. timetamps in ASCENDING order. -// Treat as one single group. +/// Time-range window computation, with +/// 1. no grouping keys specified +/// 2. timetamps in ASCENDING order. +/// Treat as one single group. template std::unique_ptr time_range_window_ASC(column_view const& input, column_view const& timestamp_column, @@ -1397,15 +1397,15 @@ std::unique_ptr time_range_window_ASC(column_view const& input, mr); } -// Given a timestamp column grouped as specified in group_offsets, -// return the following two vectors: -// 1. Vector with one entry per group, indicating the offset in the group -// where the null values begin. -// 2. Vector with one entry per group, indicating the offset in the group -// where the null values end. (i.e. 1 past the last null.) -// Each group in the input timestamp column must be sorted, -// with null values clustered at either the start or the end of each group. -// If there are no nulls for any given group, (nulls_begin, nulls_end) == (0,0). +/// Given a timestamp column grouped as specified in group_offsets, +/// return the following two vectors: +/// 1. Vector with one entry per group, indicating the offset in the group +/// where the null values begin. +/// 2. Vector with one entry per group, indicating the offset in the group +/// where the null values end. (i.e. 1 past the last null.) +/// Each group in the input timestamp column must be sorted, +/// with null values clustered at either the start or the end of each group. +/// If there are no nulls for any given group, (nulls_begin, nulls_end) == (0,0). std::tuple, rmm::device_vector> get_null_bounds_for_timestamp_column(column_view const& timestamp_column, rmm::device_vector const& group_offsets) @@ -1568,10 +1568,10 @@ std::unique_ptr time_range_window_ASC( mr); } -// Time-range window computation, with -// 1. no grouping keys specified -// 2. timetamps in DESCENDING order. -// Treat as one single group. +/// Time-range window computation, with +/// 1. no grouping keys specified +/// 2. timetamps in DESCENDING order. +/// Treat as one single group. template std::unique_ptr time_range_window_DESC(column_view const& input, column_view const& timestamp_column, From f8b3ec5aba1daa90ac546632caafaf733956fd12 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 26 Oct 2020 11:47:06 -0700 Subject: [PATCH 6/8] Fix time range window queries with null timestamps 3. Code formatting --- cpp/src/rolling/rolling.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index 31804ca3103..922dc5daa11 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -43,17 +43,17 @@ #include #include +#include #include +#include +#include #include +#include #include #include #include #include #include -#include -#include -#include -#include namespace cudf { namespace detail { From 9b97c786a574b616484b56335c6b92f3924a52c3 Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 26 Oct 2020 14:22:03 -0700 Subject: [PATCH 7/8] Fix time range window queries with null timestamps 1. Streamline null_bound calculation, with fewer comparisons. --- cpp/src/rolling/rolling.cu | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index 922dc5daa11..79fe93ed550 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -1295,25 +1295,18 @@ size_t multiplication_factor(cudf::data_type const& data_type) std::tuple get_null_bounds_for_timestamp_column( column_view const& timestamp_column) { - auto num_rows = timestamp_column.size(); - auto num_nulls = timestamp_column.null_count(); + auto const num_rows = timestamp_column.size(); + auto const num_nulls = timestamp_column.null_count(); - if (num_nulls == num_rows) { - // Short-circuit: All nulls. + if (num_nulls == num_rows || num_nulls == 0) { + // Short-circuit: All nulls, or no nulls. return std::make_tuple(0, num_rows); } - auto first_row_is_null = timestamp_column.null_count(0, 1) == 1; - auto last_row_is_null = timestamp_column.null_count(num_rows - 1, num_rows) == 1; + auto const first_row_is_null = timestamp_column.null_count(0, 1) == 1; - if (!timestamp_column.nullable() || (!first_row_is_null && !last_row_is_null)) { - // Neither first nor last rows are null. No nulls here. - return std::make_tuple(0, 0); - } else if (first_row_is_null) { - return std::make_tuple(0, num_nulls); - } else { - return std::make_tuple(num_rows - num_nulls, num_rows); - } + return first_row_is_null ? std::make_tuple(0, num_nulls) + : std::make_tuple(num_rows - num_nulls, num_rows); } /// Time-range window computation, with From 5d82e7cbf1b5cc2688b8dc94d90bd0f52e1ffbeb Mon Sep 17 00:00:00 2001 From: Mithun RK Date: Mon, 26 Oct 2020 14:51:08 -0700 Subject: [PATCH 8/8] Fix time range window queries with null timestamps 2. Fix botched null_bound calculation cleanup. 3. Remove errant println in tests. --- cpp/src/rolling/rolling.cu | 2 +- cpp/tests/grouped_rolling/grouped_rolling_test.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu index 79fe93ed550..ff5c348bb99 100644 --- a/cpp/src/rolling/rolling.cu +++ b/cpp/src/rolling/rolling.cu @@ -1300,7 +1300,7 @@ std::tuple get_null_bounds_for_timestamp_column( if (num_nulls == num_rows || num_nulls == 0) { // Short-circuit: All nulls, or no nulls. - return std::make_tuple(0, num_rows); + return std::make_tuple(0, num_nulls); } auto const first_row_is_null = timestamp_column.null_count(0, 1) == 1; diff --git a/cpp/tests/grouped_rolling/grouped_rolling_test.cpp b/cpp/tests/grouped_rolling/grouped_rolling_test.cpp index c5ca1ae0de0..f069f7f7b52 100644 --- a/cpp/tests/grouped_rolling/grouped_rolling_test.cpp +++ b/cpp/tests/grouped_rolling/grouped_rolling_test.cpp @@ -1529,7 +1529,6 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupAllNullTimestam min_periods, cudf::make_count_aggregation()); - print(output->view()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), fixed_width_column_wrapper{ {2, 3, 3, 3, 2, 4, 4, 4, 4, 4}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}});