From 67deda086f011a4280a6964639250462aa734242 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 10 Jul 2023 11:34:51 -0700 Subject: [PATCH] Clean up cudf device atomic with `cuda::atomic_ref` (#13583) Contributes to #13575 Depends on #13574, #13578 This PR cleans up custom atomic implementations in libcudf by using `cuda::atomic_ref` when possible. It removes atomic bitwise operations like `and`, `or` and `xor` since libcudac++ already provides proper replacements. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/13583 --- cpp/include/cudf/detail/copy_if.cuh | 14 +- .../cudf/detail/utilities/device_atomics.cuh | 124 +----------------- .../detail/utilities/device_operators.cuh | 33 ----- cpp/src/groupby/hash/multi_pass_kernels.cuh | 8 +- cpp/src/hash/concurrent_unordered_map.cuh | 40 +++--- cpp/src/hash/unordered_multiset.cuh | 49 +++---- cpp/src/io/json/json_column.cu | 8 +- cpp/src/join/conditional_join_kernels.cuh | 7 +- cpp/src/join/join_common_utils.hpp | 3 +- cpp/src/join/mixed_join_size_kernel.cuh | 5 +- cpp/src/join/mixed_join_size_kernels_semi.cu | 5 +- cpp/src/reductions/all.cu | 8 +- cpp/src/reductions/any.cu | 8 +- cpp/src/strings/case.cu | 8 +- cpp/src/strings/search/find.cu | 7 +- cpp/src/strings/split/split.cuh | 7 +- cpp/src/text/minhash.cu | 7 +- .../device_atomics/device_atomics_test.cu | 91 ------------- cpp/tests/hash_map/map_test.cu | 5 +- 19 files changed, 122 insertions(+), 315 deletions(-) diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index d611e94a4d7..1dd91dcd865 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -44,6 +43,8 @@ #include +#include + #include namespace cudf { @@ -181,7 +182,9 @@ __launch_bounds__(block_size) __global__ if (wid > 0 && wid < last_warp) output_valid[valid_index] = valid_warp; else { - atomicOr(&output_valid[valid_index], valid_warp); + cuda::atomic_ref ref{ + output_valid[valid_index]}; + ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); } } @@ -190,7 +193,9 @@ __launch_bounds__(block_size) __global__ uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]); if (lane == 0 && valid_warp != 0) { tmp_warp_valid_counts += __popc(valid_warp); - atomicOr(&output_valid[valid_index + num_warps], valid_warp); + cuda::atomic_ref ref{ + output_valid[valid_index + num_warps]}; + ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); } } } @@ -206,7 +211,8 @@ __launch_bounds__(block_size) __global__ cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); if (threadIdx.x == 0) { // one thread computes and adds to null count - atomicAdd(output_null_count, block_sum - block_valid_count); + cuda::atomic_ref ref{*output_null_count}; + ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed); } } diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index c1fc96d6f43..c56e88f07a8 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,6 @@ * cudf::duration_us, cudf::duration_ns and bool * where CUDA atomic operations are, `atomicAdd`, `atomicMin`, `atomicMax`, * `atomicCAS`. - * `atomicAnd`, `atomicOr`, `atomicXor` are also supported for integer data types. * Also provides `cudf::genericAtomicOperation` which performs atomic operation * with the given binary operator. */ @@ -161,7 +160,6 @@ struct genericAtomicOperationImpl { // specialized functions for operators // `atomicAdd` supports int32, float, double (signed int64 is not supported.) // `atomicMin`, `atomicMax` support int32_t, int64_t -// `atomicAnd`, `atomicOr`, `atomicXor` support int32_t, int64_t template <> struct genericAtomicOperationImpl { using T = float; @@ -252,63 +250,6 @@ struct genericAtomicOperationImpl { return ret; } }; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) - { - return atomicAnd(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicAnd(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) - { - return atomicOr(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicOr(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) - { - return atomicXor(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicXor(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; // ----------------------------------------------------------------------- // the implementation of `typesAtomicCASImpl` template @@ -598,66 +539,3 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { return cudf::detail::typesAtomicCASImpl()(address, compare, val); } - -/** - * @brief Overloads for `atomicAnd` - * reads the `old` located at the `address` in global or shared memory, - * computes (old & val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicAnd` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. 
- * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicAnd(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceAnd{}); -} - -/** - * @brief Overloads for `atomicOr` - * reads the `old` located at the `address` in global or shared memory, - * computes (old | val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicOr` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. - * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicOr(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceOr{}); -} - -/** - * @brief Overloads for `atomicXor` - * reads the `old` located at the `address` in global or shared memory, - * computes (old ^ val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicXor` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. - * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicXor(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceXor{}); -} diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 8490d3ee73b..46f424e051b 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -230,39 +230,6 @@ struct DeviceProduct { } }; -/** - * @brief binary `and` operator - */ -struct DeviceAnd { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(lhs & rhs) - { - return (lhs & rhs); - } -}; - -/** - * @brief binary `or` operator - */ -struct DeviceOr { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(lhs | rhs) - { - return (lhs | rhs); - } -}; - -/** - * @brief binary `xor` operator - */ -struct DeviceXor { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(lhs ^ rhs) - { - return (lhs ^ rhs); - } -}; - /** * @brief Operator for calculating Lead/Lag window function. */ diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/multi_pass_kernels.cuh index 15a38029bc4..4bc73631732 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/multi_pass_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,11 +20,12 @@ #include #include #include -#include #include #include #include +#include + #include namespace cudf { @@ -86,7 +87,8 @@ struct var_hash_functor { auto x = static_cast(source.element(source_index)); auto mean = static_cast(sum.element(target_index)) / group_size; Target result = (x - mean) * (x - mean) / (group_size - ddof); - atomicAdd(&target.element(target_index), result); + cuda::atomic_ref ref{target.element(target_index)}; + ref.fetch_add(result, cuda::std::memory_order_relaxed); // STD sqrt is applied in finalize() if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index 2b9f2a1b0fa..5acfba0a8bf 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -35,6 +34,8 @@ #include #include +#include + namespace { template struct packed { @@ -91,8 +92,8 @@ union pair_packer; template union pair_packer()>> { using packed_type = packed_t; - packed_type const packed; - pair_type const pair; + packed_type packed; + pair_type pair; __device__ pair_packer(pair_type _pair) : pair{_pair} {} @@ -268,16 +269,21 @@ class concurrent_unordered_map { __device__ std::enable_if_t(), insert_result> attempt_insert( value_type* const __restrict__ insert_location, value_type const& insert_pair) { - pair_packer const unused{thrust::make_pair(m_unused_key, m_unused_element)}; - pair_packer const new_pair{insert_pair}; - pair_packer const old{ - atomicCAS(reinterpret_cast::packed_type*>(insert_location), - unused.packed, - new_pair.packed)}; + pair_packer expected{thrust::make_pair(m_unused_key, m_unused_element)}; + pair_packer desired{insert_pair}; + + using packed_type = typename pair_packer::packed_type; - if (old.packed == unused.packed) { return insert_result::SUCCESS; } + auto* insert_ptr = reinterpret_cast(insert_location); + cuda::atomic_ref ref{*insert_ptr}; + auto const success = + ref.compare_exchange_strong(expected.packed, desired.packed, cuda::std::memory_order_relaxed); - if (m_equal(old.pair.first, insert_pair.first)) { return insert_result::DUPLICATE; } + if (success) { + return insert_result::SUCCESS; + } else if (m_equal(expected.pair.first, insert_pair.first)) { + return insert_result::DUPLICATE; + } return insert_result::CONTINUE; } @@ -292,16 +298,20 @@ class concurrent_unordered_map { __device__ std::enable_if_t(), insert_result> attempt_insert( value_type* const __restrict__ insert_location, value_type const& insert_pair) { - key_type const old_key{atomicCAS(&(insert_location->first), m_unused_key, insert_pair.first)}; + auto expected = m_unused_key; + cuda::atomic_ref ref{insert_location->first}; + auto const key_success = + ref.compare_exchange_strong(expected, insert_pair.first, cuda::std::memory_order_relaxed); // Hash bucket empty - if (m_unused_key == old_key) { + if (key_success) { insert_location->second = insert_pair.second; return insert_result::SUCCESS; } - // Key already exists - if (m_equal(old_key, insert_pair.first)) { return insert_result::DUPLICATE; } + else if (m_equal(expected, insert_pair.first)) { + return insert_result::DUPLICATE; + } return insert_result::CONTINUE; } diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh index 6c807eca4c3..96e6728df81 100644 --- a/cpp/src/hash/unordered_multiset.cuh +++ b/cpp/src/hash/unordered_multiset.cuh @@ -18,7 +18,6 @@ 
#include -#include #include #include #include @@ -32,6 +31,8 @@ #include #include +#include + namespace cudf { namespace detail { /* @@ -95,16 +96,18 @@ class unordered_multiset { size_type* d_hash_bins_end = hash_bins_end.data(); Element* d_hash_data = hash_data.data(); - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_start, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - atomicAdd(d_hash_bins_start + tmp, size_type{1}); - } - }); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(col.size()), + [d_hash_bins_start, d_col, hasher] __device__(size_t idx) { + if (!d_col.is_null(idx)) { + Element e = d_col.element(idx); + size_type tmp = hasher(e) % (2 * d_col.size()); + cuda::atomic_ref ref{*(d_hash_bins_start + tmp)}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); + } + }); thrust::exclusive_scan(rmm::exec_policy(stream), hash_bins_start.begin(), @@ -116,17 +119,19 @@ class unordered_multiset { hash_bins_end.end(), hash_bins_start.begin()); - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - size_type offset = atomicAdd(d_hash_bins_end + tmp, size_type{1}); - d_hash_data[offset] = e; - } - }); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(col.size()), + [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) { + if (!d_col.is_null(idx)) { + Element e = d_col.element(idx); + size_type tmp = hasher(e) % (2 * d_col.size()); + cuda::atomic_ref ref{*(d_hash_bins_end + tmp)}; + size_type offset = ref.fetch_add(1, cuda::std::memory_order_relaxed); + d_hash_data[offset] = e; + } + }); return unordered_multiset(d_col.size(), std::move(hash_bins_start), std::move(hash_data)); } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 29c7f3e105e..3a79d832d06 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,8 @@ #include #include +#include + #include #include @@ -232,8 +233,9 @@ reduce_to_column_tree(tree_meta_t& tree, auto parent_col_id = parent_col_ids[col_id]; if (parent_col_id != parent_node_sentinel and column_categories[parent_col_id] == node_t::NC_LIST) { - atomicMax(list_parents_children_max_row_offsets + parent_col_id, - max_row_offsets[col_id]); + cuda::atomic_ref ref{ + *(list_parents_children_max_row_offsets + parent_col_id)}; + ref.fetch_max(max_row_offsets[col_id], cuda::std::memory_order_relaxed); } }); thrust::gather_if( diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index 30650f6769f..dc455ad9cef 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -106,7 +106,10 @@ __global__ void compute_conditional_join_output_size( std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); // Add block counter to global counter - if (threadIdx.x == 0) atomicAdd(output_size, block_counter); + if (threadIdx.x == 0) { + cuda::atomic_ref ref{*output_size}; + ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); + } } /** diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index b78b55bdacc..cbccd78049a 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include #include @@ -30,6 +29,8 @@ #include #include +#include + #include namespace cudf { diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 0b7596dbe6b..ef377dadc4b 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -95,7 +95,10 @@ __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); // Add block counter to global counter - if (threadIdx.x == 0) atomicAdd(output_size, block_counter); + if (threadIdx.x == 0) { + cuda::atomic_ref ref{*output_size}; + ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); + } } } // namespace detail diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu index 009669252fb..fd7bf0234e9 100644 --- a/cpp/src/join/mixed_join_size_kernels_semi.cu +++ b/cpp/src/join/mixed_join_size_kernels_semi.cu @@ -83,7 +83,10 @@ __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size_sem std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); // Add block counter to global counter - if (threadIdx.x == 0) atomicAdd(output_size, block_counter); + if (threadIdx.x == 0) { + cuda::atomic_ref ref{*output_size}; + ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); + } } template __global__ void compute_mixed_join_output_size_semi( diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu index c2f278532ac..4717c0673e3 100644 --- a/cpp/src/reductions/all.cu +++ b/cpp/src/reductions/all.cu @@ -16,7 +16,6 @@ #include "simple.cuh" -#include #include #include @@ -25,6 +24,8 @@ #include #include +#include + namespace cudf { namespace reduction { namespace detail { @@ -43,7 +44,10 @@ struct all_fn { struct all_true_fn { __device__ void operator()(size_type idx) { - if (*d_result && (iter[idx] != *d_result)) atomicAnd(d_result, false); + if (*d_result && (iter[idx] != *d_result)) { + cuda::atomic_ref ref{*d_result}; + ref.fetch_and(0, cuda::std::memory_order_relaxed); + } } Iterator iter; int32_t* d_result; diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu index d1b66d5c254..f3093df5ac7 100644 --- a/cpp/src/reductions/any.cu +++ b/cpp/src/reductions/any.cu @@ -16,7 +16,6 @@ #include "simple.cuh" -#include #include #include @@ -25,6 +24,8 @@ #include #include +#include + namespace cudf { namespace reduction { namespace detail { @@ -43,7 +44,10 @@ struct any_fn { struct any_true_fn { __device__ void operator()(size_type idx) { - if (!*d_result && (iter[idx] != *d_result)) atomicOr(d_result, true); + if (!*d_result && (iter[idx] != *d_result)) { + cuda::atomic_ref ref{*d_result}; + ref.fetch_or(1, cuda::std::memory_order_relaxed); + } } Iterator iter; int32_t* d_result; diff --git a/cpp/src/strings/case.cu 
b/cpp/src/strings/case.cu index 02660c46c63..c5fe7a19f53 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -33,6 +32,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -167,7 +168,10 @@ struct count_bytes_fn { size += converter.process_character(u8); } // this is every so slightly faster than using the cub::warp_reduce - if (size > 0) atomicAdd(d_offsets + str_idx, size); + if (size > 0) { + cuda::atomic_ref ref{*(d_offsets + str_idx)}; + ref.fetch_add(size, cuda::std::memory_order_relaxed); + } } }; diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index f71c4b6b49e..e5ce88e7583 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -37,6 +36,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -154,7 +155,9 @@ __global__ void finder_warp_parallel_fn(column_device_view const d_strings, // find stores the minimum position while rfind stores the maximum position // note that this was slightly faster than using cub::WarpReduce - forward ? atomicMin(d_results + str_idx, position) : atomicMax(d_results + str_idx, position); + cuda::atomic_ref ref{*(d_results + str_idx)}; + forward ? ref.fetch_min(position, cuda::std::memory_order_relaxed) + : ref.fetch_max(position, cuda::std::memory_order_relaxed); __syncwarp(); if (lane_idx == 0) { diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 31257a441a1..e76d8ac1c60 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -37,6 +36,8 @@ #include #include +#include + namespace cudf::strings::detail { /** @@ -356,7 +357,9 @@ std::pair, rmm::device_uvector> split delimiter_count, [d_string_indices, d_delimiter_offsets] __device__(size_type idx) { auto const str_idx = d_string_indices[idx] - 1; - atomicAdd(d_delimiter_offsets + str_idx, 1); + cuda::atomic_ref ref{ + *(d_delimiter_offsets + str_idx)}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); }); // finally, convert the delimiter counts into offsets thrust::exclusive_scan(rmm::exec_policy(stream), diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index d2cc90bb971..6658d574dcc 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -41,6 +40,8 @@ #include +#include + namespace nvtext { namespace detail { namespace { @@ -92,7 +93,9 @@ struct minhash_fn { for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { auto const hasher = cudf::detail::MurmurHash3_32{seeds[seed_idx]}; auto const hvalue = hasher(hash_str); - atomicMin(d_output + seed_idx, hvalue); + cuda::atomic_ref ref{ + *(d_output + seed_idx)}; + ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); } } } diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index d50db649354..24195362d92 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -256,95 +256,4 @@ TYPED_TEST(AtomicsTest, atomicCASRandom) this->atomic_test(input_array, is_cas_test, block_size, grid_size); } -template -__global__ void gpu_atomic_bitwiseOp_test(T* result, T* 
data, size_t size) -{ - size_t id = blockIdx.x * blockDim.x + threadIdx.x; - size_t step = blockDim.x * gridDim.x; - - for (; id < size; id += step) { - atomicAnd(&result[0], data[id]); - atomicOr(&result[1], data[id]); - atomicXor(&result[2], data[id]); - cudf::genericAtomicOperation(&result[3], data[id], cudf::DeviceAnd{}); - cudf::genericAtomicOperation(&result[4], data[id], cudf::DeviceOr{}); - cudf::genericAtomicOperation(&result[5], data[id], cudf::DeviceXor{}); - } -} - -template -struct AtomicsBitwiseOpTest : public cudf::test::BaseFixture { - void atomic_test(std::vector const& v_input, int block_size = 0, int grid_size = 1) - { - size_t vec_size = v_input.size(); - std::vector v(vec_size); - std::transform(v_input.begin(), v_input.end(), v.begin(), [](int x) { - T t(x); - return t; - }); - - thrust::host_vector identity(9, T{0}); // +3 elements padding for int8 tests - identity[0] = T(~0ull); - identity[3] = T(~0ull); - - T exact[3]; - exact[0] = std::accumulate( - v.begin(), v.end(), identity[0], [](T acc, uint64_t i) { return acc & T(i); }); - exact[1] = std::accumulate( - v.begin(), v.end(), identity[1], [](T acc, uint64_t i) { return acc | T(i); }); - exact[2] = std::accumulate( - v.begin(), v.end(), identity[2], [](T acc, uint64_t i) { return acc ^ T(i); }); - - auto dev_result = cudf::detail::make_device_uvector_sync( - identity, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - auto dev_data = cudf::detail::make_device_uvector_sync( - v, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - - if (block_size == 0) { block_size = vec_size; } - - gpu_atomic_bitwiseOp_test<<>>( - reinterpret_cast(dev_result.data()), reinterpret_cast(dev_data.data()), vec_size); - - auto host_result = cudf::detail::make_host_vector_sync(dev_result, cudf::get_default_stream()); - - CUDF_CHECK_CUDA(cudf::get_default_stream().value()); - - // print_exact(exact, "exact"); - // print_exact(host_result.data(), "result"); - - EXPECT_EQ(host_result[0], exact[0]) << "atomicAnd test failed"; - EXPECT_EQ(host_result[1], exact[1]) << "atomicOr test failed"; - EXPECT_EQ(host_result[2], exact[2]) << "atomicXor test failed"; - EXPECT_EQ(host_result[3], exact[0]) << "atomicAnd test(2) failed"; - EXPECT_EQ(host_result[4], exact[1]) << "atomicOr test(2) failed"; - EXPECT_EQ(host_result[5], exact[2]) << "atomicXor test(2) failed"; - } - - [[maybe_unused]] void print_exact(T const* v, char const* msg) - { - std::cout << std::hex << std::showbase; - std::cout << "The " << msg << " = {" << +v[0] << ", " << +v[1] << ", " << +v[2] << "}" - << std::endl; - } -}; - -using BitwiseOpTestingTypes = - cudf::test::Types; - -TYPED_TEST_SUITE(AtomicsBitwiseOpTest, BitwiseOpTestingTypes); - -TYPED_TEST(AtomicsBitwiseOpTest, atomicBitwiseOps) -{ - { // test for AND, XOR - std::vector input_array( - {0xfcfc'fcfc'fcfc'fc7f, 0x7f'7f7f'7f7f'7ffc, 0xfffd'dffd'dffd'dfdf, 0x7f'7f7f'7f7f'7ffc}); - this->atomic_test(input_array); - } - { // test for OR, XOR - std::vector input_array( - {0x01, 0xfc02, 0x1d'ff03, 0x1100'a0b0'801d'0003, 0x8000'0000'0000'0000, 0x1d'ff03}); - this->atomic_test(input_array); - } -} - CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index f38c5b3f58f..8d71c512c79 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,10 +68,7 @@ struct InsertTest : public cudf::test::BaseFixture { using TestTypes = ::testing::Types, key_value_types, - key_value_types, key_value_types, - key_value_types, - key_value_types, key_value_types, key_value_types>;
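
The change that repeats throughout this patch is swapping a raw `atomicAdd` on a global counter for a `cuda::atomic_ref` plus `fetch_add`. Below is a minimal, self-contained sketch of that shape, with hypothetical kernel and variable names, assuming device scope and relaxed ordering as in the hunks above; it is not code from the patch itself.

```cpp
#include <cuda/atomic>
#include <cstddef>

// Hypothetical kernel: count positive elements into a device-scope counter.
// Each thread accumulates locally and issues one relaxed fetch_add. The join
// size kernels above use the same shape, except the per-block sum comes from
// cub::BlockReduce and only threadIdx.x == 0 touches the global counter.
__global__ void count_positive(int const* data, std::size_t n, std::size_t* d_count)
{
  std::size_t local = 0;
  for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += std::size_t{blockDim.x} * gridDim.x) {
    local += (data[i] > 0);
  }
  cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*d_count};
  ref.fetch_add(local, cuda::std::memory_order_relaxed);
}
```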
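
In `copy_if.cuh` the warp-level validity words are now OR'd into the output bitmask through an `atomic_ref` instead of `atomicOr`. A sketch of that pattern follows; the kernel is hypothetical, assumes `blockDim.x` is a multiple of 32 and a zero-initialized mask, and uses `uint32_t` where cudf uses `bitmask_type`.

```cpp
#include <cuda/atomic>
#include <cstdint>

// Hypothetical kernel: build a validity bitmask from a boolean array.
// Each warp ballots its 32 validity bits; lane 0 ORs the word into the
// output mask with a relaxed fetch_or, mirroring the copy_if hunks.
__global__ void build_mask(bool const* valid, int n, std::uint32_t* mask)
{
  int const tid = blockIdx.x * blockDim.x + threadIdx.x;
  // No early return: __ballot_sync with a full mask needs the whole warp active.
  bool const is_valid = (tid < n) && valid[tid];
  std::uint32_t const word = __ballot_sync(0xffff'ffffu, is_valid);
  if ((tid < n) && (threadIdx.x % 32 == 0) && (word != 0)) {
    cuda::atomic_ref<std::uint32_t, cuda::thread_scope_device> ref{mask[tid / 32]};
    ref.fetch_or(word, cuda::std::memory_order_relaxed);  // was atomicOr
  }
}
```

The same one-line shape covers the `any`/`all` reductions above, where a 32-bit result flag is combined with `fetch_or(1, ...)` or `fetch_and(0, ...)`.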
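
`concurrent_unordered_map::attempt_insert` now claims a hash bucket with `compare_exchange_strong` instead of `atomicCAS`. The detail worth noting is that on failure the current bucket contents are written back into `expected`, which is exactly the information the old `atomicCAS` return value carried, so the duplicate-key check still works. Below is a reduced sketch with a hypothetical key type and free function; the real code additionally packs key and value into a single word when they fit (`pair_packer`) and compares keys with the map's equality functor.

```cpp
#include <cuda/atomic>
#include <cstdint>

enum class insert_result { CONTINUE, SUCCESS, DUPLICATE };

// Hypothetical device helper: try to claim one bucket addressed by `slot_key`.
__device__ insert_result try_claim_slot(std::uint32_t* slot_key,
                                        std::uint32_t unused_key,
                                        std::uint32_t new_key)
{
  auto expected = unused_key;
  cuda::atomic_ref<std::uint32_t, cuda::thread_scope_device> ref{*slot_key};
  // On failure, compare_exchange_strong stores the key currently in the slot
  // into `expected` (the value atomicCAS used to return).
  if (ref.compare_exchange_strong(expected, new_key, cuda::std::memory_order_relaxed)) {
    return insert_result::SUCCESS;  // bucket was empty and is now ours
  }
  if (expected == new_key) { return insert_result::DUPLICATE; }  // key already present
  return insert_result::CONTINUE;   // bucket holds a different key; keep probing
}
```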
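
Finally, the `atomicMin`/`atomicMax` calls in `find.cu`, `minhash.cu` and `json_column.cu` map onto `fetch_min`/`fetch_max`, which are libcudacxx extensions beyond the standard `std::atomic_ref` interface. A tiny sketch, again with hypothetical names; it assumes the caller pre-initialized the result to the numeric maximum.

```cpp
#include <cuda/atomic>
#include <cstdint>

// Hypothetical kernel: fold an array into a single minimum.
// *d_min must be initialized to 0xFFFFFFFF before launch.
__global__ void array_min(std::uint32_t const* values, int n, std::uint32_t* d_min)
{
  int const tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= n) { return; }
  cuda::atomic_ref<std::uint32_t, cuda::thread_scope_device> ref{*d_min};
  ref.fetch_min(values[tid], cuda::std::memory_order_relaxed);  // was atomicMin
}
```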