diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index cf8c3343406..6090477c28d 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -24,6 +24,18 @@ #include #include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include namespace cudf { namespace detail { @@ -39,7 +51,7 @@ namespace detail { * @param source_begin_bits Array of offsets into corresponding @p source masks. * Must be same size as source array * @param source_size_bits Number of bits in each mask in @p source - * @param count Pointer to counter of set bits + * @param count_ptr Pointer to counter of set bits */ template __global__ void offset_bitmask_binop(Binop op, @@ -171,24 +183,31 @@ size_type inplace_bitmask_binop( return d_counter.value(stream); } +/** + * @brief Enum indicating whether to count unset (0) bits or set (1) bits. + */ +enum class count_bits_policy : bool { + UNSET_BITS, /// Count unset (0) bits + SET_BITS /// Count set (1) bits +}; + /** * For each range `[first_bit_indices[i], last_bit_indices[i])` * (where 0 <= i < `num_ranges`), count the number of bits set outside the range - * in the boundary words (i.e. words that include either - * `first_bit_indices[i]'th` bit or `(last_bit_indices[i] - 1)'th` bit) and - * subtract the count from the range's null count. + * in the boundary words (i.e. words that include either the first or last bit) + * and subtract the count from the range's null count. * * Expects `0 <= first_bit_indices[i] <= last_bit_indices[i]`. * * @param[in] bitmask The bitmask whose non-zero bits outside the range in the * boundary words will be counted. - * @param[in] num_ranges The number of ranges - * @param[in] first_bit_indices The indices (inclusive) of the first bit in each - * range - * @param[in] last_bit_indices The indices (exclusive) of the last bit in each - * range - * @param[in,out] null_counts The number of non-zero bits in each range to be - * updated + * @param[in] num_ranges The number of ranges. + * @param[in] first_bit_indices Random-access input iterator to the sequence of indices (inclusive) + * of the first bit in each range. + * @param[in] last_bit_indices Random-access input iterator to the sequence of indices (exclusive) + * of the last bit in each range. + * @param[in,out] null_counts Random-access input/output iterator where the number of non-zero bits + * in each range is updated. */ template __global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bitmask, @@ -199,175 +218,96 @@ __global__ void subtract_set_bits_range_boundaries_kernel(bitmask_type const* bi { constexpr size_type const word_size_in_bits{detail::size_in_bits()}; - cudf::size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; - cudf::size_type range_id = tid; + size_type const tid = threadIdx.x + blockIdx.x * blockDim.x; + size_type range_id = tid; while (range_id < num_ranges) { size_type const first_bit_index = *(first_bit_indices + range_id); size_type const last_bit_index = *(last_bit_indices + range_id); size_type delta = 0; - size_type num_slack_bits = 0; - // compute delta due to the preceding bits in the first word in the range - - num_slack_bits = intra_word_index(first_bit_index); - if (num_slack_bits > 0) { - bitmask_type word = bitmask[word_index(first_bit_index)]; - bitmask_type slack_mask = set_least_significant_bits(num_slack_bits); + // Compute delta due to the preceding bits in the first word in the range. + size_type const first_num_slack_bits = intra_word_index(first_bit_index); + if (first_num_slack_bits > 0) { + bitmask_type const word = bitmask[word_index(first_bit_index)]; + bitmask_type const slack_mask = set_least_significant_bits(first_num_slack_bits); delta -= __popc(word & slack_mask); } - // compute delta due to the following bits in the last word in the range - - num_slack_bits = (last_bit_index % word_size_in_bits) == 0 - ? 0 - : word_size_in_bits - intra_word_index(last_bit_index); - if (num_slack_bits > 0) { - bitmask_type word = bitmask[word_index(last_bit_index)]; - bitmask_type slack_mask = set_most_significant_bits(num_slack_bits); + // Compute delta due to the following bits in the last word in the range. + size_type const last_num_slack_bits = (last_bit_index % word_size_in_bits) == 0 + ? 0 + : word_size_in_bits - intra_word_index(last_bit_index); + if (last_num_slack_bits > 0) { + bitmask_type const word = bitmask[word_index(last_bit_index)]; + bitmask_type const slack_mask = set_most_significant_bits(last_num_slack_bits); delta -= __popc(word & slack_mask); } + // Update the null count with the computed delta. size_type updated_null_count = *(null_counts + range_id) + delta; *(null_counts + range_id) = updated_null_count; - range_id += blockDim.x * gridDim.x; } } -// convert [first_bit_index,last_bit_index) to -// [first_word_index,last_word_index) -struct to_word_index : public thrust::unary_function { - const bool _inclusive = false; - size_type const* const _d_bit_indices = nullptr; - - /** - * @brief Constructor of a functor that converts bit indices to bitmask word - * indices. - * - * @param[in] inclusive Flag that indicates whether bit indices are inclusive - * or exclusive. - * @param[in] d_bit_indices Pointer to an array of bit indices - */ - __host__ to_word_index(bool inclusive, size_type const* d_bit_indices) - : _inclusive(inclusive), _d_bit_indices(d_bit_indices) - { - } - - __device__ size_type operator()(const size_type& i) const - { - auto bit_index = _d_bit_indices[i]; - return word_index(bit_index) + ((_inclusive || intra_word_index(bit_index) == 0) ? 0 : 1); - } -}; - /** - * @brief Functor that returns the number of set bits for a specified word - * of a bitmask array. + * @brief Functor that converts bit segment indices to word segment indices. * + * Converts [first_bit_index, last_bit_index) to [first_word_index, + * last_word_index). The flag `inclusive` indicates whether the indices are inclusive or exclusive. + * the end of a segment, in which case the word index should be incremented for + * bits at the start of a word. */ -struct word_num_set_bits_functor { - word_num_set_bits_functor(bitmask_type const* bitmask_) : bitmask(bitmask_) {} - __device__ size_type operator()(size_type i) const +struct bit_to_word_index { + bit_to_word_index(bool inclusive) : inclusive(inclusive) {} + CUDA_DEVICE_CALLABLE size_type operator()(const size_type& bit_index) const { - return static_cast(__popc(bitmask[i])); + return word_index(bit_index) + ((inclusive || intra_word_index(bit_index) == 0) ? 0 : 1); } - bitmask_type const* bitmask; + bool const inclusive; }; -/** - * @brief Given a bitmask, counts the number of set (1) bits in every range - * `[indices_begin[2*i], indices_begin[(2*i)+1])` (where 0 <= i < std::distance(indices_begin, - * indices_end) / 2). - * - * Returns an empty vector if `bitmask == nullptr`. - * - * @throws cudf::logic_error if `std::distance(indices_begin, indices_end) % 2 != 0` - * @throws cudf::logic_error if `indices_begin[2*i] < 0 or indices_begin[2*i] > - * indices_begin[(2*i)+1]` - * - * @param bitmask Bitmask residing in device memory whose bits will be counted - * @param indices_begin An iterator representing the beginning of the range of indices specifying - * ranges to count the number of set bits within - * @param indices_end An iterator representing the end of the range of indices specifying ranges to - * count the number of set bits within - * @param streaam CUDA stream used for device memory operations and kernel launches - * - * @return A vector storing the number of non-zero bits in the specified ranges - */ -template -std::vector segmented_count_set_bits(bitmask_type const* bitmask, - IndexIterator indices_begin, - IndexIterator indices_end, - rmm::cuda_stream_view stream) -{ - size_t const num_indices = std::distance(indices_begin, indices_end); - - CUDF_EXPECTS(num_indices % 2 == 0, "Array of indices needs to have an even number of elements."); - for (size_t i = 0; i < num_indices / 2; i++) { - auto begin = indices_begin[i * 2]; - auto end = indices_begin[i * 2 + 1]; - CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); - CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); - } +struct popc { + CUDA_DEVICE_CALLABLE size_type operator()(bitmask_type word) const { return __popc(word); } +}; - if (num_indices == 0) { - return std::vector{}; - } else if (bitmask == nullptr) { - std::vector ret(num_indices / 2); - for (size_t i = 0; i < num_indices / 2; i++) { - ret[i] = indices_begin[2 * i + 1] - indices_begin[2 * i]; - } - return ret; - } +// Count set/unset bits in a segmented null mask, using offset iterators accessible by the device. +template +rmm::device_uvector segmented_count_bits(bitmask_type const* bitmask, + OffsetIterator first_bit_indices_begin, + OffsetIterator first_bit_indices_end, + OffsetIterator last_bit_indices_begin, + count_bits_policy count_bits, + rmm::cuda_stream_view stream) +{ + auto const num_ranges = + static_cast(std::distance(first_bit_indices_begin, first_bit_indices_end)); + rmm::device_uvector d_bit_counts(num_ranges, stream); - size_type num_ranges = num_indices / 2; - std::vector h_first_indices(num_ranges); - std::vector h_last_indices(num_ranges); - thrust::stable_partition_copy(thrust::seq, - indices_begin, - indices_end, - thrust::make_counting_iterator(0), - h_first_indices.begin(), - h_last_indices.begin(), - [](auto i) { return (i % 2) == 0; }); - - auto d_first_indices = make_device_uvector_async(h_first_indices, stream); - auto d_last_indices = make_device_uvector_async(h_last_indices, stream); - rmm::device_uvector d_null_counts(num_ranges, stream); - - auto word_num_set_bits = thrust::make_transform_iterator(thrust::make_counting_iterator(0), - word_num_set_bits_functor{bitmask}); - auto first_word_indices = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - // We cannot use lambda as cub::DeviceSegmentedReduce::Sum() requires - // first_word_indices and last_word_indices to have the same type. - to_word_index(true, d_first_indices.data())); - auto last_word_indices = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - // We cannot use lambda as cub::DeviceSegmentedReduce::Sum() requires - // first_word_indices and last_word_indices to have the same type. - to_word_index(false, d_last_indices.data())); - - // first allocate temporary memory + auto num_set_bits_in_word = thrust::make_transform_iterator(bitmask, popc{}); + auto first_word_indices = + thrust::make_transform_iterator(first_bit_indices_begin, bit_to_word_index{true}); + auto last_word_indices = + thrust::make_transform_iterator(last_bit_indices_begin, bit_to_word_index{false}); + // Allocate temporary memory. size_t temp_storage_bytes{0}; CUDA_TRY(cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, - word_num_set_bits, - d_null_counts.begin(), + num_set_bits_in_word, + d_bit_counts.begin(), num_ranges, first_word_indices, last_word_indices, stream.value())); rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); - // second perform segmented reduction - + // Perform segmented reduction. CUDA_TRY(cub::DeviceSegmentedReduce::Sum(d_temp_storage.data(), temp_storage_bytes, - word_num_set_bits, - d_null_counts.begin(), + num_set_bits_in_word, + d_bit_counts.begin(), num_ranges, first_word_indices, last_word_indices, @@ -375,75 +315,190 @@ std::vector segmented_count_set_bits(bitmask_type const* bitmask, CHECK_CUDA(stream.value()); - // third, adjust counts in segment boundaries (if segments are not - // word-aligned) - + // Adjust counts in segment boundaries (if segments are not word-aligned). constexpr size_type block_size{256}; - cudf::detail::grid_1d grid(num_ranges, block_size); - subtract_set_bits_range_boundaries_kernel<<>>( - bitmask, num_ranges, d_first_indices.begin(), d_last_indices.begin(), d_null_counts.begin()); + bitmask, num_ranges, first_bit_indices_begin, last_bit_indices_begin, d_bit_counts.begin()); + + if (count_bits == count_bits_policy::UNSET_BITS) { + // Convert from set bits counts to unset bits by subtracting the number of + // set bits from the length of the segment. + auto segments_begin = + thrust::make_zip_iterator(first_bit_indices_begin, last_bit_indices_begin); + auto segments_size = thrust::transform_iterator(segments_begin, [] __device__(auto segment) { + auto const begin = thrust::get<0>(segment); + auto const end = thrust::get<1>(segment); + return end - begin; + }); + thrust::transform(rmm::exec_policy(stream), + segments_size, + segments_size + num_ranges, + d_bit_counts.data(), + d_bit_counts.data(), + [] __device__(auto segment_size, auto segment_bit_count) { + return segment_size - segment_bit_count; + }); + } CHECK_CUDA(stream.value()); + return d_bit_counts; +} - std::vector ret(num_ranges); - CUDA_TRY(cudaMemcpyAsync(ret.data(), - d_null_counts.data(), - num_ranges * sizeof(size_type), - cudaMemcpyDeviceToHost, - stream.value())); +/** + * @brief Given two iterators, validate that the iterators represent valid ranges of + * indices and return the number of ranges. + * + * @throws cudf::logic_error if `std::distance(indices_begin, indices_end) % 2 != 0` + * @throws cudf::logic_error if `indices_begin[2*i] < 0 or indices_begin[2*i] > + * indices_begin[(2*i)+1]` + * + * @param indices_begin An iterator representing the beginning of the ranges of indices + * @param indices_end An iterator representing the end of the ranges of indices + * + * @return The number of segments specified by the input iterators. + */ +template +size_type validate_segmented_indices(IndexIterator indices_begin, IndexIterator indices_end) +{ + auto const num_indices = static_cast(std::distance(indices_begin, indices_end)); + CUDF_EXPECTS(num_indices % 2 == 0, "Array of indices needs to have an even number of elements."); + size_type const num_segments = num_indices / 2; + for (size_type i = 0; i < num_segments; i++) { + auto begin = indices_begin[2 * i]; + auto end = indices_begin[2 * i + 1]; + CUDF_EXPECTS(begin >= 0, "Starting index cannot be negative."); + CUDF_EXPECTS(end >= begin, "End index cannot be smaller than the starting index."); + } + return num_segments; +} - stream.synchronize(); // now ret is valid. +struct index_alternator { + CUDA_DEVICE_CALLABLE size_type operator()(const size_type& i) const + { + return *(d_indices + 2 * i + (is_end ? 1 : 0)); + } - return ret; -} + bool const is_end = false; + const size_type* d_indices; +}; /** - * @brief Given a bitmask, counts the number of unset (0) bits in every range + * @brief Given a bitmask, counts the number of set (1) or unset (0) bits in every range * `[indices_begin[2*i], indices_begin[(2*i)+1])` (where 0 <= i < std::distance(indices_begin, * indices_end) / 2). * - * Returns an empty vector if `bitmask == nullptr`. + * If `bitmask == nullptr`, this function returns a vector containing the + * segment lengths, or a vector of zeros if counting unset bits. * - * @throws cudf::logic_error if `std::distance(indices_begin, indices_end) % 2 != 0` + * @throws cudf::logic_error if `bitmask == nullptr`. + * @throws cudf::logic_error if `std::distance(indices_begin, indices_end) % 2 != 0`. * @throws cudf::logic_error if `indices_begin[2*i] < 0 or indices_begin[2*i] > - * indices_begin[(2*i)+1]` + * indices_begin[(2*i)+1]`. * - * @param bitmask Bitmask residing in device memory whose bits will be counted + * @param bitmask Bitmask residing in device memory whose bits will be counted. * @param indices_begin An iterator representing the beginning of the range of indices specifying - * ranges to count the number of unset bits within + * ranges to count the number of set/unset bits within. * @param indices_end An iterator representing the end of the range of indices specifying ranges to - * count the number of unset bits within - * @param streaam CUDA stream used for device memory operations and kernel launches + * count the number of set/unset bits within. + * @param count_bits If SET_BITS, count set (1) bits. If UNSET_BITS, count unset (0) bits. + * @param stream CUDA stream used for device memory operations and kernel launches. * * @return A vector storing the number of non-zero bits in the specified ranges */ template +std::vector segmented_count_bits(bitmask_type const* bitmask, + IndexIterator indices_begin, + IndexIterator indices_end, + count_bits_policy count_bits, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(bitmask != nullptr, "Invalid bitmask."); + auto const num_segments = validate_segmented_indices(indices_begin, indices_end); + + // Return an empty vector if there are zero segments. + if (num_segments == 0) { return std::vector{}; } + + // Construct a contiguous host buffer of indices and copy to device. + auto const h_indices = std::vector(indices_begin, indices_end); + auto const d_indices = make_device_uvector_async(h_indices, stream); + + // Compute the bit counts over each segment. + auto first_bit_indices_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), index_alternator{false, d_indices.data()}); + auto const first_bit_indices_end = first_bit_indices_begin + num_segments; + auto last_bit_indices_begin = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), index_alternator{true, d_indices.data()}); + rmm::device_uvector d_bit_counts = + cudf::detail::segmented_count_bits(bitmask, + first_bit_indices_begin, + first_bit_indices_end, + last_bit_indices_begin, + count_bits, + stream); + + // Copy the results back to the host. + return make_std_vector_sync(d_bit_counts, stream); +} + +// Count non-zero bits in the specified ranges. +template +std::vector segmented_count_set_bits(bitmask_type const* bitmask, + IndexIterator indices_begin, + IndexIterator indices_end, + rmm::cuda_stream_view stream) +{ + return detail::segmented_count_bits( + bitmask, indices_begin, indices_end, count_bits_policy::SET_BITS, stream); +} + +// Count zero bits in the specified ranges. +template std::vector segmented_count_unset_bits(bitmask_type const* bitmask, IndexIterator indices_begin, IndexIterator indices_end, rmm::cuda_stream_view stream) { - size_t const num_indices = std::distance(indices_begin, indices_end); + return detail::segmented_count_bits( + bitmask, indices_begin, indices_end, count_bits_policy::UNSET_BITS, stream); +} - if (num_indices == 0) { - return std::vector{}; - } else if (bitmask == nullptr) { - return std::vector(num_indices / 2, 0); +// Count valid elements in the specified ranges of a validity bitmask. +template +std::vector segmented_valid_count(bitmask_type const* bitmask, + IndexIterator indices_begin, + IndexIterator indices_end, + rmm::cuda_stream_view stream) +{ + if (bitmask == nullptr) { + // Return a vector of segment lengths. + auto const num_segments = validate_segmented_indices(indices_begin, indices_end); + auto ret = std::vector(num_segments, 0); + for (size_type i = 0; i < num_segments; i++) { + ret[i] = indices_begin[2 * i + 1] - indices_begin[2 * i]; + } + return ret; } - auto ret = segmented_count_set_bits(bitmask, indices_begin, indices_end, stream); - for (size_t i = 0; i < ret.size(); i++) { - auto begin = indices_begin[i * 2]; - auto end = indices_begin[i * 2 + 1]; - ret[i] = (end - begin) - ret[i]; - } + return detail::segmented_count_set_bits(bitmask, indices_begin, indices_end, stream); +} - return ret; +// Count null elements in the specified ranges of a validity bitmask. +template +std::vector segmented_null_count(bitmask_type const* bitmask, + IndexIterator indices_begin, + IndexIterator indices_end, + rmm::cuda_stream_view stream) +{ + if (bitmask == nullptr) { + // Return a vector of zeros. + auto const num_segments = validate_segmented_indices(indices_begin, indices_end); + return std::vector(num_segments, 0); + } + return detail::segmented_count_unset_bits(bitmask, indices_begin, indices_end, stream); } } // namespace detail diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index d2819e665df..6ee406de5ef 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -48,9 +48,18 @@ void set_null_mask(bitmask_type* bitmask, rmm::cuda_stream_view stream = rmm::cuda_stream_default); /** - * @copydoc cudf::count_set_bits + * @brief Given a bitmask, counts the number of set (1) bits in the range + * `[start, stop)`. * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @throws cudf::logic_error if `bitmask == nullptr` + * @throws cudf::logic_error if `start > stop` + * @throws cudf::logic_error if `start < 0` + * + * @param bitmask Bitmask residing in device memory whose bits will be counted. + * @param start Index of the first bit to count (inclusive). + * @param stop Index of the last bit to count (exclusive). + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return The number of non-zero bits in the specified range. */ cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, @@ -58,9 +67,18 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask, rmm::cuda_stream_view stream); /** - * @copydoc cudf::count_unset_bits + * @brief Given a bitmask, counts the number of unset (0) bits in the range + * `[start, stop)`. * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @throws cudf::logic_error if `bitmask == nullptr` + * @throws cudf::logic_error if `start > stop` + * @throws cudf::logic_error if `start < 0` + * + * @param bitmask Bitmask residing in device memory whose bits will be counted. + * @param start Index of the first bit to count (inclusive). + * @param stop Index of the last bit to count (exclusive). + * @param stream CUDA stream used for device memory operations and kernel launches. + * @return The number of zero bits in the specified range. */ cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, @@ -68,23 +86,122 @@ cudf::size_type count_unset_bits(bitmask_type const* bitmask, rmm::cuda_stream_view stream); /** - * @copydoc cudf::segmented_count_set_bits + * @brief Given a bitmask, counts the number of set (1) bits in every range + * `[indices[2*i], indices[(2*i)+1])` (where 0 <= i < indices.size() / 2). + * + * @throws cudf::logic_error if `bitmask == nullptr` + * @throws cudf::logic_error if `indices.size() % 2 != 0` + * @throws cudf::logic_error if `indices[2*i] < 0 or indices[2*i] > indices[(2*i)+1]` * + * @param[in] bitmask Bitmask residing in device memory whose bits will be counted. + * @param[in] indices A host_span of indices specifying ranges to count the number of set bits. * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @return A vector storing the number of non-zero bits in the specified ranges. */ std::vector segmented_count_set_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream); /** - * @copydoc cudf::segmented_count_unset_bits + * @brief Given a bitmask, counts the number of unset (0) bits in every range + * `[indices[2*i], indices[(2*i)+1])` (where 0 <= i < indices.size() / 2). * + * @throws cudf::logic_error if `bitmask == nullptr` + * @throws cudf::logic_error if `indices.size() % 2 != 0` + * @throws cudf::logic_error if `indices[2*i] < 0 or indices[2*i] > indices[(2*i)+1]` + * + * @param[in] bitmask Bitmask residing in device memory whose bits will be counted. + * @param[in] indices A host_span of indices specifying ranges to count the number of unset bits. * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @return A vector storing the number of zero bits in the specified ranges. */ std::vector segmented_count_unset_bits(bitmask_type const* bitmask, host_span indices, rmm::cuda_stream_view stream); +/** + * @brief Given a validity bitmask, counts the number of valid elements (set bits) + * in the range `[start, stop)`. + * + * If `bitmask == nullptr`, all elements are assumed to be valid and the + * function returns `stop-start`. + * + * @throws cudf::logic_error if `start > stop` + * @throws cudf::logic_error if `start < 0` + * + * @param[in] bitmask Validity bitmask residing in device memory. + * @param[in] start Index of the first bit to count (inclusive). + * @param[in] stop Index of the last bit to count (exclusive). + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @return The number of valid elements in the specified range. + */ +cudf::size_type valid_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream); + +/** + * @brief Given a validity bitmask, counts the number of null elements (unset bits) + * in the range `[start, stop)`. + * + * If `bitmask == nullptr`, all elements are assumed to be valid and the + * function returns ``. + * + * @throws cudf::logic_error if `start > stop` + * @throws cudf::logic_error if `start < 0` + * + * @param[in] bitmask Validity bitmask residing in device memory. + * @param[in] start Index of the first bit to count (inclusive). + * @param[in] stop Index of the last bit to count (exclusive). + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @return The number of null elements in the specified range. + */ +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream); + +/** + * @brief Given a validity bitmask, counts the number of valid elements (set + * bits) in every range `[indices[2*i], indices[(2*i)+1])` (where 0 <= i < + * indices.size() / 2). + * + * If `bitmask == nullptr`, all elements are assumed to be valid and a vector of + * length `indices.size()` containing segment lengths is returned. + * + * @throws cudf::logic_error if `indices.size() % 2 != 0`. + * @throws cudf::logic_error if `indices[2*i] < 0 or indices[2*i] > indices[(2*i)+1]`. + * + * @param[in] bitmask Validity bitmask residing in device memory. + * @param[in] indices A host_span of indices specifying ranges to count the number of valid + * elements. + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @return A vector storing the number of valid elements in each specified range. + */ +std::vector segmented_valid_count(bitmask_type const* bitmask, + host_span indices, + rmm::cuda_stream_view stream); + +/** + * @brief Given a validity bitmask, counts the number of null elements (unset + * bits) in every range `[indices[2*i], indices[(2*i)+1])` (where 0 <= i < + * indices.size() / 2). + * + * If `bitmask == nullptr`, all elements are assumed to be valid and a vector of + * length `indices.size()` containing all zeros is returned. + * + * @throws cudf::logic_error if `indices.size() % 2 != 0` + * @throws cudf::logic_error if `indices[2*i] < 0 or indices[2*i] > indices[(2*i)+1]` + * + * @param[in] bitmask Validity bitmask residing in device memory. + * @param[in] indices A host_span of indices specifying ranges to count the number of null elements. + * @param[in] stream CUDA stream used for device memory operations and kernel launches. + * @return A vector storing the number of null elements in each specified range. + */ +std::vector segmented_null_count(bitmask_type const* bitmask, + host_span indices, + rmm::cuda_stream_view stream); + /** * @copydoc cudf::copy_bitmask(bitmask_type const*, size_type, size_type, *rmm::mr::device_memory_resource*) diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp index c74e077dc32..6585932f151 100644 --- a/cpp/include/cudf/null_mask.hpp +++ b/cpp/include/cudf/null_mask.hpp @@ -100,70 +100,6 @@ rmm::device_buffer create_null_mask( */ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid); -/** - * @brief Given a bitmask, counts the number of set (1) bits in the range - * `[start, stop)` - * - * Returns `0` if `bitmask == nullptr`. - * - * @throws cudf::logic_error if `start > stop` - * @throws cudf::logic_error if `start < 0` - * - * @param bitmask Bitmask residing in device memory whose bits will be counted - * @param start Index of the first bit to count (inclusive) - * @param stop Index of the last bit to count (exclusive) - * @return The number of non-zero bits in the specified range - */ -cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, size_type stop); - -/** - * @brief Given a bitmask, counts the number of unset (0) bits in the range - *`[start, stop)`. - * - * Returns `0` if `bitmask == nullptr`. - * - * @throws cudf::logic_error if `start > stop` - * @throws cudf::logic_error if `start < 0` - * - * @param bitmask Bitmask residing in device memory whose bits will be counted - * @param start Index of the first bit to count (inclusive) - * @param stop Index of the last bit to count (exclusive) - * @return The number of zero bits in the specified range - */ -cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, size_type stop); - -/** - * @brief Given a bitmask, counts the number of set (1) bits in every range - * `[indices[2*i], indices[(2*i)+1])` (where 0 <= i < indices.size() / 2). - * - * Returns an empty vector if `bitmask == nullptr`. - * - * @throws cudf::logic_error if `indices.size() % 2 != 0` - * @throws cudf::logic_error if `indices[2*i] < 0 or indices[2*i] > indices[(2*i)+1]` - * - * @param[in] bitmask Bitmask residing in device memory whose bits will be counted - * @param[in] indices A host_span of indices specifying ranges to count the number of set bits - * @return A vector storing the number of non-zero bits in the specified ranges - */ -std::vector segmented_count_set_bits(bitmask_type const* bitmask, - host_span indices); - -/** - * @brief Given a bitmask, counts the number of unset (0) bits in every range - * `[indices[2*i], indices[(2*i)+1])` (where 0 <= i < indices.size() / 2). - * - * Returns an empty vector if `bitmask == nullptr`. - * - * @throws cudf::logic_error if `indices.size() % 2 != 0` - * @throws cudf::logic_error if `indices[2*i] < 0 or indices[2*i] > indices[(2*i)+1]` - * - * @param[in] bitmask Bitmask residing in device memory whose bits will be counted - * @param[in] indices A host_span of indices specifying ranges to count the number of unset bits - * @return A vector storing the number of zero bits in the specified ranges - */ -std::vector segmented_count_unset_bits(bitmask_type const* bitmask, - host_span indices); - /** * @brief Creates a `device_buffer` from a slice of bitmask defined by a range * of indices `[begin_bit, end_bit)`. diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 1cd3def61ac..ec3776fb6d5 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -30,7 +31,6 @@ #include #include #include -#include #include #include @@ -167,67 +167,9 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit return detail::set_null_mask(bitmask, begin_bit, end_bit, valid); } -namespace { - -/** - * @brief Counts the number of non-zero bits in a bitmask in the range - * `[first_bit_index, last_bit_index]`. - * - * Expects `0 <= first_bit_index <= last_bit_index`. - * - * @param[in] bitmask The bitmask whose non-zero bits will be counted. - * @param[in] first_bit_index The index (inclusive) of the first bit to count - * @param[in] last_bit_index The index (inclusive) of the last bit to count - * @param[out] global_count The number of non-zero bits in the specified range - */ -template -__global__ void count_set_bits_kernel(bitmask_type const* bitmask, - size_type first_bit_index, - size_type last_bit_index, - size_type* global_count) -{ - constexpr auto const word_size{detail::size_in_bits()}; - - auto const first_word_index{word_index(first_bit_index)}; - auto const last_word_index{word_index(last_bit_index)}; - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto thread_word_index = tid + first_word_index; - size_type thread_count{0}; - - // First, just count the bits in all words - while (thread_word_index <= last_word_index) { - thread_count += __popc(bitmask[thread_word_index]); - thread_word_index += blockDim.x * gridDim.x; - } - - // Subtract any slack bits counted from the first and last word - // Two threads handle this -- one for first word, one for last - if (tid < 2) { - bool const first{tid == 0}; - bool const last{not first}; - - size_type bit_index = (first) ? first_bit_index : last_bit_index; - size_type word_index = (first) ? first_word_index : last_word_index; - - size_type num_slack_bits = bit_index % word_size; - if (last) { num_slack_bits = word_size - num_slack_bits - 1; } - - if (num_slack_bits > 0) { - bitmask_type word = bitmask[word_index]; - auto slack_mask = (first) ? set_least_significant_bits(num_slack_bits) - : set_most_significant_bits(num_slack_bits); - - thread_count -= __popc(word & slack_mask); - } - } - - using BlockReduce = cub::BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - size_type block_count{BlockReduce(temp_storage).Sum(thread_count)}; - - if (threadIdx.x == 0) { atomicAdd(global_count, block_count); } -} +namespace detail { +namespace { /** * @brief Copies the bits starting at the specified offset from a source * bitmask into the destination bitmask. @@ -257,8 +199,6 @@ __global__ void copy_offset_bitmask(bitmask_type* __restrict__ destination, } // namespace -namespace detail { - // Create a bitmask from a specific range rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type begin_bit, @@ -299,20 +239,82 @@ rmm::device_buffer copy_bitmask(column_view const& view, return null_mask; } +namespace { +/** + * @brief Counts the number of non-zero bits in a bitmask in the range + * `[first_bit_index, last_bit_index]`. + * + * Expects `0 <= first_bit_index <= last_bit_index`. + * + * @param[in] bitmask The bitmask whose non-zero bits will be counted. + * @param[in] first_bit_index The index (inclusive) of the first bit to count + * @param[in] last_bit_index The index (inclusive) of the last bit to count + * @param[out] global_count The number of non-zero bits in the specified range + */ +template +__global__ void count_set_bits_kernel(bitmask_type const* bitmask, + size_type first_bit_index, + size_type last_bit_index, + size_type* global_count) +{ + constexpr auto const word_size{detail::size_in_bits()}; + + auto const first_word_index{word_index(first_bit_index)}; + auto const last_word_index{word_index(last_bit_index)}; + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto thread_word_index = tid + first_word_index; + size_type thread_count{0}; + + // First, just count the bits in all words + while (thread_word_index <= last_word_index) { + thread_count += __popc(bitmask[thread_word_index]); + thread_word_index += blockDim.x * gridDim.x; + } + + // Subtract any slack bits counted from the first and last word + // Two threads handle this -- one for first word, one for last + if (tid < 2) { + bool const first{tid == 0}; + bool const last{not first}; + + size_type bit_index = (first) ? first_bit_index : last_bit_index; + size_type word_index = (first) ? first_word_index : last_word_index; + + size_type num_slack_bits = bit_index % word_size; + if (last) { num_slack_bits = word_size - num_slack_bits - 1; } + + if (num_slack_bits > 0) { + bitmask_type word = bitmask[word_index]; + auto slack_mask = (first) ? set_least_significant_bits(num_slack_bits) + : set_most_significant_bits(num_slack_bits); + + thread_count -= __popc(word & slack_mask); + } + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + size_type block_count{BlockReduce(temp_storage).Sum(thread_count)}; + + if (threadIdx.x == 0) { atomicAdd(global_count, block_count); } +} + +} // namespace + +// Count non-zero bits in the specified range cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, size_type stop, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + rmm::cuda_stream_view stream) { - if (nullptr == bitmask) { return 0; } - + CUDF_EXPECTS(bitmask != nullptr, "Invalid bitmask."); CUDF_EXPECTS(start >= 0, "Invalid range."); CUDF_EXPECTS(start <= stop, "Invalid bit range."); - std::size_t num_bits_to_count = stop - start; + auto const num_bits_to_count = stop - start; if (num_bits_to_count == 0) { return 0; } - auto num_words = num_bitmask_words(num_bits_to_count); + auto const num_words = num_bitmask_words(num_bits_to_count); constexpr size_type block_size{256}; @@ -327,14 +329,78 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask, return non_zero_count.value(stream); } +// Count zero bits in the specified range cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, size_type stop, - rmm::cuda_stream_view stream = rmm::cuda_stream_default) + rmm::cuda_stream_view stream) +{ + auto const num_set_bits = detail::count_set_bits(bitmask, start, stop, stream); + auto const total_num_bits = (stop - start); + return total_num_bits - num_set_bits; +} + +// Count valid elements in the specified range of a validity bitmask +cudf::size_type valid_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream) +{ + if (bitmask == nullptr) { + CUDF_EXPECTS(start >= 0, "Invalid range."); + CUDF_EXPECTS(start <= stop, "Invalid bit range."); + auto const total_num_bits = (stop - start); + return total_num_bits; + } + + return detail::count_set_bits(bitmask, start, stop, stream); +} + +// Count null elements in the specified range of a validity bitmask +cudf::size_type null_count(bitmask_type const* bitmask, + size_type start, + size_type stop, + rmm::cuda_stream_view stream) +{ + if (bitmask == nullptr) { + CUDF_EXPECTS(start >= 0, "Invalid range."); + CUDF_EXPECTS(start <= stop, "Invalid bit range."); + return 0; + } + + return detail::count_unset_bits(bitmask, start, stop, stream); +} + +// Count non-zero bits in the specified ranges of a bitmask +std::vector segmented_count_set_bits(const bitmask_type* bitmask, + host_span indices, + rmm::cuda_stream_view stream) +{ + return detail::segmented_count_set_bits(bitmask, indices.begin(), indices.end(), stream); +} + +// Count zero bits in the specified ranges of a bitmask +std::vector segmented_count_unset_bits(const bitmask_type* bitmask, + host_span indices, + rmm::cuda_stream_view stream) +{ + return detail::segmented_count_unset_bits(bitmask, indices.begin(), indices.end(), stream); +} + +// Count valid elements in the specified ranges of a validity bitmask +std::vector segmented_valid_count(const bitmask_type* bitmask, + host_span indices, + rmm::cuda_stream_view stream) +{ + return detail::segmented_valid_count(bitmask, indices.begin(), indices.end(), stream); +} + +// Count null elements in the specified ranges of a validity bitmask +std::vector segmented_null_count(const bitmask_type* bitmask, + host_span indices, + rmm::cuda_stream_view stream) { - if (nullptr == bitmask) { return 0; } - auto num_bits = (stop - start); - return (num_bits - detail::count_set_bits(bitmask, start, stop, stream)); + return detail::segmented_null_count(bitmask, indices.begin(), indices.end(), stream); } // Inplace Bitwise AND of the masks @@ -437,60 +503,8 @@ std::pair bitmask_or(table_view const& view, return std::make_pair(std::move(null_mask), 0); } -/** - * @copydoc cudf::segmented_count_set_bits - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ -std::vector segmented_count_set_bits(bitmask_type const* bitmask, - host_span indices, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - return detail::segmented_count_set_bits(bitmask, indices.begin(), indices.end(), stream); -} - -// Count zero bits in the specified ranges -std::vector segmented_count_unset_bits(bitmask_type const* bitmask, - host_span indices, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - return detail::segmented_count_unset_bits(bitmask, indices.begin(), indices.end(), stream); -} - } // namespace detail -// Count non-zero bits in the specified range -cudf::size_type count_set_bits(bitmask_type const* bitmask, size_type start, size_type stop) -{ - CUDF_FUNC_RANGE(); - return detail::count_set_bits(bitmask, start, stop); -} - -// Count zero bits in the specified range -cudf::size_type count_unset_bits(bitmask_type const* bitmask, size_type start, size_type stop) -{ - CUDF_FUNC_RANGE(); - return detail::count_unset_bits(bitmask, start, stop); -} - -// Count non-zero bits in the specified ranges -std::vector segmented_count_set_bits(bitmask_type const* bitmask, - host_span indices) -{ - CUDF_FUNC_RANGE(); - return detail::segmented_count_set_bits(bitmask, indices, rmm::cuda_stream_default); -} - -// Count zero bits in the specified ranges -std::vector segmented_count_unset_bits(bitmask_type const* bitmask, - host_span indices) -{ - CUDF_FUNC_RANGE(); - return detail::segmented_count_unset_bits(bitmask, indices, rmm::cuda_stream_default); -} - // Create a bitmask from a specific range rmm::device_buffer copy_bitmask(bitmask_type const* mask, size_type begin_bit, diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu index 1357bbb10a5..992ff18456a 100644 --- a/cpp/src/column/column.cu +++ b/cpp/src/column/column.cu @@ -116,15 +116,16 @@ mutable_column_view column::mutable_view() child_views.emplace_back(*c); } - // Store the old null count before resetting it. By accessing the value directly instead of - // calling `null_count()`, we can avoid a potential invocation of `count_unset_bits()`. This does - // however mean that calling `null_count()` on the resulting mutable view could still potentially - // invoke `count_unset_bits()`. + // Store the old null count before resetting it. By accessing the value + // directly instead of calling `this->null_count()`, we can avoid a potential + // invocation of `cudf::detail::null_count()`. This does however mean that + // calling `this->null_count()` on the resulting mutable view could still + // potentially invoke `cudf::detail::null_count()`. auto current_null_count = _null_count; // The elements of a column could be changed through a `mutable_column_view`, therefore the // existing `null_count` is no longer valid. Reset it to `UNKNOWN_NULL_COUNT` forcing it to be - // recomputed on the next invocation of `null_count()`. + // recomputed on the next invocation of `this->null_count()`. set_null_count(cudf::UNKNOWN_NULL_COUNT); return mutable_column_view{type(), @@ -141,8 +142,8 @@ size_type column::null_count() const { CUDF_FUNC_RANGE(); if (_null_count <= cudf::UNKNOWN_NULL_COUNT) { - _null_count = - cudf::count_unset_bits(static_cast(_null_mask.data()), 0, size()); + _null_count = cudf::detail::null_count( + static_cast(_null_mask.data()), 0, size(), rmm::cuda_stream_default); } return _null_count; } diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp index 5749cb48c0e..2f7297dbb54 100644 --- a/cpp/src/column/column_view.cpp +++ b/cpp/src/column/column_view.cpp @@ -16,7 +16,7 @@ #include #include -#include +#include #include #include #include @@ -66,7 +66,8 @@ column_view_base::column_view_base(data_type type, size_type column_view_base::null_count() const { if (_null_count <= cudf::UNKNOWN_NULL_COUNT) { - _null_count = cudf::count_unset_bits(null_mask(), offset(), offset() + size()); + _null_count = + cudf::detail::null_count(null_mask(), offset(), offset() + size(), rmm::cuda_stream_default); } return _null_count; } @@ -76,7 +77,8 @@ size_type column_view_base::null_count(size_type begin, size_type end) const CUDF_EXPECTS((begin >= 0) && (end <= size()) && (begin <= end), "Range is out of bounds."); return (null_count() == 0) ? 0 - : cudf::count_unset_bits(null_mask(), offset() + begin, offset() + end); + : cudf::detail::null_count( + null_mask(), offset() + begin, offset() + end, rmm::cuda_stream_default); } // Struct to use custom hash combine and fold expression diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu index 06ef42e4a08..9a364451b3b 100644 --- a/cpp/src/copying/scatter.cu +++ b/cpp/src/copying/scatter.cu @@ -88,7 +88,8 @@ void scatter_scalar_bitmask_inplace(std::reference_wrapper const& bitmask_kernel<<>>( *target_view, scatter_map, num_scatter_rows); - target.set_null_count(count_unset_bits(target.view().null_mask(), 0, target.size(), stream)); + target.set_null_count( + cudf::detail::null_count(target.view().null_mask(), 0, target.size(), stream)); } } diff --git a/cpp/src/copying/slice.cu b/cpp/src/copying/slice.cu index 9a3e349b907..b2f05516e2c 100644 --- a/cpp/src/copying/slice.cu +++ b/cpp/src/copying/slice.cu @@ -40,7 +40,7 @@ std::vector slice(column_view const& input, // to count auto indices_iter = cudf::detail::make_counting_transform_iterator( 0, [offset = input.offset(), &indices](size_type index) { return indices[index] + offset; }); - auto null_counts = cudf::detail::segmented_count_unset_bits( + auto null_counts = cudf::detail::segmented_null_count( input.null_mask(), indices_iter, indices_iter + indices.size(), stream); auto const children = std::vector(input.child_begin(), input.child_end()); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index db02125ce77..a7a767585e6 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -25,9 +25,9 @@ #include #include +#include #include #include -#include #include #include #include @@ -903,7 +903,8 @@ encoded_data encode_columns(orc_table_view const& orc_table, } } for (auto& cnt_in : validity_check_inputs) { - auto const valid_counts = segmented_count_set_bits(cnt_in.second.mask, cnt_in.second.indices); + auto const valid_counts = + cudf::detail::segmented_valid_count(cnt_in.second.mask, cnt_in.second.indices, stream); CUDF_EXPECTS( std::none_of(valid_counts.cbegin(), valid_counts.cend(), diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp index c7ae6e12366..19a0da21cb9 100644 --- a/cpp/tests/bitmask/bitmask_tests.cpp +++ b/cpp/tests/bitmask/bitmask_tests.cpp @@ -15,6 +15,7 @@ */ #include #include +#include #include #include #include @@ -67,14 +68,16 @@ struct CountBitmaskTest : public cudf::test::BaseFixture { TEST_F(CountBitmaskTest, NullMask) { - EXPECT_EQ(0, cudf::count_set_bits(nullptr, 0, 32)); + EXPECT_THROW(cudf::detail::count_set_bits(nullptr, 0, 32, rmm::cuda_stream_default), + cudf::logic_error); + EXPECT_EQ(32, cudf::detail::valid_count(nullptr, 0, 32, rmm::cuda_stream_default)); std::vector indices = {0, 32, 7, 25}; - auto counts = cudf::segmented_count_set_bits(nullptr, indices); - EXPECT_EQ(indices.size(), counts.size() * 2); - for (size_t i = 0; i < counts.size(); i++) { - EXPECT_EQ(indices[2 * i + 1] - indices[2 * i], counts[i]); - } + EXPECT_THROW(cudf::detail::segmented_count_set_bits(nullptr, indices, rmm::cuda_stream_default), + cudf::logic_error); + auto valid_counts = + cudf::detail::segmented_valid_count(nullptr, indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{32, 18})); } // Utility to construct a mask vector. If fill_valid is false (default), it is initialized to all @@ -96,159 +99,243 @@ rmm::device_uvector make_mask(cudf::size_type size, bool fil TEST_F(CountBitmaskTest, NegativeStart) { auto mask = make_mask(1); - EXPECT_THROW(cudf::count_set_bits(mask.data(), -1, 32), cudf::logic_error); + EXPECT_THROW(cudf::detail::count_set_bits(mask.data(), -1, 32, rmm::cuda_stream_default), + cudf::logic_error); + EXPECT_THROW(cudf::detail::valid_count(mask.data(), -1, 32, rmm::cuda_stream_default), + cudf::logic_error); std::vector indices = {0, 16, -1, 32}; - EXPECT_THROW(cudf::segmented_count_set_bits(mask.data(), indices), cudf::logic_error); + EXPECT_THROW( + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default), + cudf::logic_error); + EXPECT_THROW(cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default), + cudf::logic_error); } TEST_F(CountBitmaskTest, StartLargerThanStop) { auto mask = make_mask(1); - EXPECT_THROW(cudf::count_set_bits(mask.data(), 32, 31), cudf::logic_error); + EXPECT_THROW(cudf::detail::count_set_bits(mask.data(), 32, 31, rmm::cuda_stream_default), + cudf::logic_error); + EXPECT_THROW(cudf::detail::valid_count(mask.data(), 32, 31, rmm::cuda_stream_default), + cudf::logic_error); std::vector indices = {0, 16, 31, 30}; - EXPECT_THROW(cudf::segmented_count_set_bits(mask.data(), indices), cudf::logic_error); + EXPECT_THROW( + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default), + cudf::logic_error); + EXPECT_THROW(cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default), + cudf::logic_error); } TEST_F(CountBitmaskTest, EmptyRange) { auto mask = make_mask(1); - EXPECT_EQ(0, cudf::count_set_bits(mask.data(), 17, 17)); + EXPECT_EQ(0, cudf::detail::count_set_bits(mask.data(), 17, 17, rmm::cuda_stream_default)); + EXPECT_EQ(0, cudf::detail::valid_count(mask.data(), 17, 17, rmm::cuda_stream_default)); std::vector indices = {0, 0, 17, 17}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{0, 0})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{0, 0})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{0, 0})); } TEST_F(CountBitmaskTest, SingleWordAllZero) { auto mask = make_mask(1); - EXPECT_EQ(0, cudf::count_set_bits(mask.data(), 0, 32)); + EXPECT_EQ(0, cudf::detail::count_set_bits(mask.data(), 0, 32, rmm::cuda_stream_default)); + EXPECT_EQ(0, cudf::detail::valid_count(mask.data(), 0, 32, rmm::cuda_stream_default)); std::vector indices = {0, 32, 0, 32}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{0, 0})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{0, 0})); + auto valid_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{0, 0})); } TEST_F(CountBitmaskTest, SingleBitAllZero) { auto mask = make_mask(1); - EXPECT_EQ(0, cudf::count_set_bits(mask.data(), 17, 18)); + EXPECT_EQ(0, cudf::detail::count_set_bits(mask.data(), 17, 18, rmm::cuda_stream_default)); + EXPECT_EQ(0, cudf::detail::valid_count(mask.data(), 17, 18, rmm::cuda_stream_default)); std::vector indices = {17, 18, 7, 8}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{0, 0})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{0, 0})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{0, 0})); } TEST_F(CountBitmaskTest, SingleBitAllSet) { auto mask = make_mask(1, true); - EXPECT_EQ(1, cudf::count_set_bits(mask.data(), 13, 14)); + EXPECT_EQ(1, cudf::detail::count_set_bits(mask.data(), 13, 14, rmm::cuda_stream_default)); + EXPECT_EQ(1, cudf::detail::valid_count(mask.data(), 13, 14, rmm::cuda_stream_default)); std::vector indices = {13, 14, 0, 1}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{1, 1})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{1, 1})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{1, 1})); } TEST_F(CountBitmaskTest, SingleWordAllBitsSet) { auto mask = make_mask(1, true); - EXPECT_EQ(32, cudf::count_set_bits(mask.data(), 0, 32)); + EXPECT_EQ(32, cudf::detail::count_set_bits(mask.data(), 0, 32, rmm::cuda_stream_default)); + EXPECT_EQ(32, cudf::detail::valid_count(mask.data(), 0, 32, rmm::cuda_stream_default)); std::vector indices = {0, 32, 0, 32}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{32, 32})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{32, 32})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{32, 32})); } TEST_F(CountBitmaskTest, SingleWordPreSlack) { auto mask = make_mask(1, true); - EXPECT_EQ(25, cudf::count_set_bits(mask.data(), 7, 32)); + EXPECT_EQ(25, cudf::detail::count_set_bits(mask.data(), 7, 32, rmm::cuda_stream_default)); + EXPECT_EQ(25, cudf::detail::valid_count(mask.data(), 7, 32, rmm::cuda_stream_default)); std::vector indices = {7, 32, 8, 32}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{25, 24})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{25, 24})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{25, 24})); } TEST_F(CountBitmaskTest, SingleWordPostSlack) { auto mask = make_mask(1, true); - EXPECT_EQ(17, cudf::count_set_bits(mask.data(), 0, 17)); + EXPECT_EQ(17, cudf::detail::count_set_bits(mask.data(), 0, 17, rmm::cuda_stream_default)); + EXPECT_EQ(17, cudf::detail::valid_count(mask.data(), 0, 17, rmm::cuda_stream_default)); std::vector indices = {0, 17, 0, 18}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{17, 18})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{17, 18})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{17, 18})); } TEST_F(CountBitmaskTest, SingleWordSubset) { auto mask = make_mask(1, true); - EXPECT_EQ(30, cudf::count_set_bits(mask.data(), 1, 31)); + EXPECT_EQ(30, cudf::detail::count_set_bits(mask.data(), 1, 31, rmm::cuda_stream_default)); + EXPECT_EQ(30, cudf::detail::valid_count(mask.data(), 1, 31, rmm::cuda_stream_default)); std::vector indices = {1, 31, 7, 17}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{30, 10})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{30, 10})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{30, 10})); } TEST_F(CountBitmaskTest, SingleWordSubset2) { auto mask = make_mask(1, true); - EXPECT_EQ(28, cudf::count_set_bits(mask.data(), 2, 30)); + EXPECT_EQ(28, cudf::detail::count_set_bits(mask.data(), 2, 30, rmm::cuda_stream_default)); + EXPECT_EQ(28, cudf::detail::valid_count(mask.data(), 2, 30, rmm::cuda_stream_default)); std::vector indices = {4, 16, 2, 30}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{12, 28})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{12, 28})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{12, 28})); } TEST_F(CountBitmaskTest, MultipleWordsAllBits) { auto mask = make_mask(10, true); - EXPECT_EQ(320, cudf::count_set_bits(mask.data(), 0, 320)); + EXPECT_EQ(320, cudf::detail::count_set_bits(mask.data(), 0, 320, rmm::cuda_stream_default)); + EXPECT_EQ(320, cudf::detail::valid_count(mask.data(), 0, 320, rmm::cuda_stream_default)); std::vector indices = {0, 320, 0, 320}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{320, 320})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{320, 320})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{320, 320})); } TEST_F(CountBitmaskTest, MultipleWordsSubsetWordBoundary) { auto mask = make_mask(10, true); - EXPECT_EQ(256, cudf::count_set_bits(mask.data(), 32, 288)); + EXPECT_EQ(256, cudf::detail::count_set_bits(mask.data(), 32, 288, rmm::cuda_stream_default)); + EXPECT_EQ(256, cudf::detail::valid_count(mask.data(), 32, 288, rmm::cuda_stream_default)); std::vector indices = {32, 192, 32, 288}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{160, 256})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{160, 256})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{160, 256})); } TEST_F(CountBitmaskTest, MultipleWordsSplitWordBoundary) { auto mask = make_mask(10, true); - EXPECT_EQ(2, cudf::count_set_bits(mask.data(), 31, 33)); + EXPECT_EQ(2, cudf::detail::count_set_bits(mask.data(), 31, 33, rmm::cuda_stream_default)); + EXPECT_EQ(2, cudf::detail::valid_count(mask.data(), 31, 33, rmm::cuda_stream_default)); std::vector indices = {31, 33, 60, 67}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{2, 7})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{2, 7})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{2, 7})); } TEST_F(CountBitmaskTest, MultipleWordsSubset) { auto mask = make_mask(10, true); - EXPECT_EQ(226, cudf::count_set_bits(mask.data(), 67, 293)); + EXPECT_EQ(226, cudf::detail::count_set_bits(mask.data(), 67, 293, rmm::cuda_stream_default)); + EXPECT_EQ(226, cudf::detail::valid_count(mask.data(), 67, 293, rmm::cuda_stream_default)); std::vector indices = {67, 293, 37, 319}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{226, 282})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{226, 282})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{226, 282})); } TEST_F(CountBitmaskTest, MultipleWordsSingleBit) { auto mask = make_mask(10, true); - EXPECT_EQ(1, cudf::count_set_bits(mask.data(), 67, 68)); + EXPECT_EQ(1, cudf::detail::count_set_bits(mask.data(), 67, 68, rmm::cuda_stream_default)); + EXPECT_EQ(1, cudf::detail::valid_count(mask.data(), 67, 68, rmm::cuda_stream_default)); std::vector indices = {67, 68, 31, 32, 192, 193}; - auto counts = cudf::segmented_count_set_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{1, 1, 1})); + auto set_counts = + cudf::detail::segmented_count_set_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(set_counts, ::testing::ElementsAreArray(std::vector{1, 1, 1})); + auto valid_counts = + cudf::detail::segmented_valid_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(valid_counts, ::testing::ElementsAreArray(std::vector{1, 1, 1})); } using CountUnsetBitsTest = CountBitmaskTest; @@ -256,123 +343,179 @@ using CountUnsetBitsTest = CountBitmaskTest; TEST_F(CountUnsetBitsTest, SingleBitAllSet) { auto mask = make_mask(1, true); - EXPECT_EQ(0, cudf::count_unset_bits(mask.data(), 13, 14)); + EXPECT_EQ(0, cudf::detail::count_unset_bits(mask.data(), 13, 14, rmm::cuda_stream_default)); + EXPECT_EQ(0, cudf::detail::null_count(mask.data(), 13, 14, rmm::cuda_stream_default)); std::vector indices = {13, 14, 31, 32}; - auto counts = cudf::segmented_count_unset_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{0, 0})); + auto unset_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(unset_counts, ::testing::ElementsAreArray(std::vector{0, 0})); + auto null_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{0, 0})); } TEST_F(CountUnsetBitsTest, NullMask) { - EXPECT_EQ(0, cudf::count_unset_bits(nullptr, 0, 32)); + EXPECT_THROW(cudf::detail::count_unset_bits(nullptr, 0, 32, rmm::cuda_stream_default), + cudf::logic_error); + EXPECT_EQ(0, cudf::detail::null_count(nullptr, 0, 32, rmm::cuda_stream_default)); std::vector indices = {0, 32, 7, 25}; - auto counts = cudf::segmented_count_unset_bits(nullptr, indices); - EXPECT_EQ(indices.size(), counts.size() * 2); - for (size_t i = 0; i < counts.size(); i++) { - EXPECT_EQ(0, counts[i]); - } + EXPECT_THROW(cudf::detail::segmented_count_unset_bits(nullptr, indices, rmm::cuda_stream_default), + cudf::logic_error); + auto null_counts = cudf::detail::segmented_null_count(nullptr, indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{0, 0})); } TEST_F(CountUnsetBitsTest, SingleWordAllBits) { auto mask = make_mask(1); - EXPECT_EQ(32, cudf::count_unset_bits(mask.data(), 0, 32)); + EXPECT_EQ(32, cudf::detail::count_unset_bits(mask.data(), 0, 32, rmm::cuda_stream_default)); + EXPECT_EQ(32, cudf::detail::null_count(mask.data(), 0, 32, rmm::cuda_stream_default)); std::vector indices = {0, 32, 0, 32}; - auto counts = cudf::segmented_count_unset_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{32, 32})); + auto unset_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(unset_counts, ::testing::ElementsAreArray(std::vector{32, 32})); + auto null_counts = + cudf::detail::segmented_null_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{32, 32})); } TEST_F(CountUnsetBitsTest, SingleWordPreSlack) { auto mask = make_mask(1); - EXPECT_EQ(25, cudf::count_unset_bits(mask.data(), 7, 32)); + EXPECT_EQ(25, cudf::detail::count_unset_bits(mask.data(), 7, 32, rmm::cuda_stream_default)); + EXPECT_EQ(25, cudf::detail::null_count(mask.data(), 7, 32, rmm::cuda_stream_default)); std::vector indices = {7, 32, 8, 32}; - auto counts = cudf::segmented_count_unset_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{25, 24})); + auto unset_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(unset_counts, ::testing::ElementsAreArray(std::vector{25, 24})); + auto null_counts = + cudf::detail::segmented_null_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{25, 24})); } TEST_F(CountUnsetBitsTest, SingleWordPostSlack) { auto mask = make_mask(1); - EXPECT_EQ(17, cudf::count_unset_bits(mask.data(), 0, 17)); + EXPECT_EQ(17, cudf::detail::count_unset_bits(mask.data(), 0, 17, rmm::cuda_stream_default)); + EXPECT_EQ(17, cudf::detail::null_count(mask.data(), 0, 17, rmm::cuda_stream_default)); std::vector indices = {0, 17, 0, 18}; - auto counts = cudf::segmented_count_unset_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{17, 18})); + auto unset_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(unset_counts, ::testing::ElementsAreArray(std::vector{17, 18})); + auto null_counts = + cudf::detail::segmented_null_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{17, 18})); } TEST_F(CountUnsetBitsTest, SingleWordSubset) { auto mask = make_mask(1); - EXPECT_EQ(30, cudf::count_unset_bits(mask.data(), 1, 31)); + EXPECT_EQ(30, cudf::detail::count_unset_bits(mask.data(), 1, 31, rmm::cuda_stream_default)); + EXPECT_EQ(30, cudf::detail::null_count(mask.data(), 1, 31, rmm::cuda_stream_default)); std::vector indices = {1, 31, 7, 17}; - auto counts = cudf::segmented_count_unset_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{30, 10})); + auto unset_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(unset_counts, ::testing::ElementsAreArray(std::vector{30, 10})); + auto null_counts = + cudf::detail::segmented_null_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{30, 10})); } TEST_F(CountUnsetBitsTest, SingleWordSubset2) { auto mask = make_mask(1); - EXPECT_EQ(28, cudf::count_unset_bits(mask.data(), 2, 30)); + EXPECT_EQ(28, cudf::detail::count_unset_bits(mask.data(), 2, 30, rmm::cuda_stream_default)); + EXPECT_EQ(28, cudf::detail::null_count(mask.data(), 2, 30, rmm::cuda_stream_default)); std::vector indices = {4, 16, 2, 30}; - auto counts = cudf::segmented_count_unset_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{12, 28})); + auto unset_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(unset_counts, ::testing::ElementsAreArray(std::vector{12, 28})); + auto null_counts = + cudf::detail::segmented_null_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{12, 28})); } TEST_F(CountUnsetBitsTest, MultipleWordsAllBits) { auto mask = make_mask(10); - EXPECT_EQ(320, cudf::count_unset_bits(mask.data(), 0, 320)); + EXPECT_EQ(320, cudf::detail::count_unset_bits(mask.data(), 0, 320, rmm::cuda_stream_default)); + EXPECT_EQ(320, cudf::detail::null_count(mask.data(), 0, 320, rmm::cuda_stream_default)); std::vector indices = {0, 320, 0, 320}; - auto counts = cudf::segmented_count_unset_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{320, 320})); + auto unset_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(unset_counts, ::testing::ElementsAreArray(std::vector{320, 320})); + auto null_counts = + cudf::detail::segmented_null_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{320, 320})); } TEST_F(CountUnsetBitsTest, MultipleWordsSubsetWordBoundary) { auto mask = make_mask(10); - EXPECT_EQ(256, cudf::count_unset_bits(mask.data(), 32, 288)); + EXPECT_EQ(256, cudf::detail::count_unset_bits(mask.data(), 32, 288, rmm::cuda_stream_default)); + EXPECT_EQ(256, cudf::detail::null_count(mask.data(), 32, 288, rmm::cuda_stream_default)); std::vector indices = {32, 192, 32, 288}; - auto counts = cudf::segmented_count_unset_bits(mask.data(), indices); - EXPECT_THAT(counts, testing::ContainerEq(std::vector{160, 256})); + auto unset_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(unset_counts, ::testing::ElementsAreArray(std::vector{160, 256})); + auto null_counts = + cudf::detail::segmented_null_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{160, 256})); } TEST_F(CountUnsetBitsTest, MultipleWordsSplitWordBoundary) { auto mask = make_mask(10); - EXPECT_EQ(2, cudf::count_unset_bits(mask.data(), 31, 33)); + EXPECT_EQ(2, cudf::detail::count_unset_bits(mask.data(), 31, 33, rmm::cuda_stream_default)); + EXPECT_EQ(2, cudf::detail::null_count(mask.data(), 31, 33, rmm::cuda_stream_default)); std::vector indices = {31, 33, 60, 67}; - auto counts = cudf::segmented_count_unset_bits(mask.data(), indices); - EXPECT_THAT(counts, ::testing::ContainerEq(std::vector{2, 7})); + auto unset_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(unset_counts, ::testing::ElementsAreArray(std::vector{2, 7})); + auto null_counts = + cudf::detail::segmented_null_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{2, 7})); } TEST_F(CountUnsetBitsTest, MultipleWordsSubset) { auto mask = make_mask(10); - EXPECT_EQ(226, cudf::count_unset_bits(mask.data(), 67, 293)); + EXPECT_EQ(226, cudf::detail::count_unset_bits(mask.data(), 67, 293, rmm::cuda_stream_default)); + EXPECT_EQ(226, cudf::detail::null_count(mask.data(), 67, 293, rmm::cuda_stream_default)); std::vector indices = {67, 293, 37, 319}; - auto counts = cudf::segmented_count_unset_bits(mask.data(), indices); - EXPECT_THAT(counts, ::testing::ContainerEq(std::vector{226, 282})); + auto unset_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(unset_counts, ::testing::ElementsAreArray(std::vector{226, 282})); + auto null_counts = + cudf::detail::segmented_null_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{226, 282})); } TEST_F(CountUnsetBitsTest, MultipleWordsSingleBit) { auto mask = make_mask(10); - EXPECT_EQ(1, cudf::count_unset_bits(mask.data(), 67, 68)); + EXPECT_EQ(1, cudf::detail::count_unset_bits(mask.data(), 67, 68, rmm::cuda_stream_default)); + EXPECT_EQ(1, cudf::detail::null_count(mask.data(), 67, 68, rmm::cuda_stream_default)); std::vector indices = {67, 68, 31, 32, 192, 193}; - auto counts = cudf::segmented_count_unset_bits(mask.data(), indices); - EXPECT_THAT(counts, ::testing::ContainerEq(std::vector{1, 1, 1})); + auto unset_counts = + cudf::detail::segmented_count_unset_bits(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(unset_counts, ::testing::ElementsAreArray(std::vector{1, 1, 1})); + auto null_counts = + cudf::detail::segmented_null_count(mask.data(), indices, rmm::cuda_stream_default); + EXPECT_THAT(null_counts, ::testing::ElementsAreArray(std::vector{1, 1, 1})); } struct CopyBitmaskTest : public cudf::test::BaseFixture, cudf::test::UniformRandomGenerator { diff --git a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp index c80a8fba55c..b78c3b9417f 100644 --- a/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp +++ b/cpp/tests/stream_compaction/apply_boolean_mask_tests.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include #include @@ -264,9 +265,10 @@ TEST_F(ApplyBooleanMask, CorrectNullCount) cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 277) == 0; }); cudf::test::fixed_width_column_wrapper boolean_mask(seq3, seq3 + inputRows); - auto got = cudf::apply_boolean_mask(input, boolean_mask); - auto out_col = got->get_column(0).view(); - auto expected_null_count = cudf::count_unset_bits(out_col.null_mask(), 0, out_col.size()); + auto got = cudf::apply_boolean_mask(input, boolean_mask); + auto out_col = got->get_column(0).view(); + auto expected_null_count = + cudf::detail::null_count(out_col.null_mask(), 0, out_col.size(), rmm::cuda_stream_default); ASSERT_EQ(out_col.null_count(), expected_null_count); }