From 67deda086f011a4280a6964639250462aa734242 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 10 Jul 2023 11:34:51 -0700 Subject: [PATCH] Clean up cudf device atomic with `cuda::atomic_ref` (#13583) Contributes to #13575 Depends on #13574, #13578 This PR cleans up custom atomic implementations in libcudf by using `cuda::atomic_ref` when possible. It removes atomic bitwise operations like `and`, `or` and `xor` since libcudac++ already provides proper replacements. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/13583 --- cpp/include/cudf/detail/copy_if.cuh | 14 +- .../cudf/detail/utilities/device_atomics.cuh | 124 +----------------- .../detail/utilities/device_operators.cuh | 33 ----- cpp/src/groupby/hash/multi_pass_kernels.cuh | 8 +- cpp/src/hash/concurrent_unordered_map.cuh | 40 +++--- cpp/src/hash/unordered_multiset.cuh | 49 +++---- cpp/src/io/json/json_column.cu | 8 +- cpp/src/join/conditional_join_kernels.cuh | 7 +- cpp/src/join/join_common_utils.hpp | 3 +- cpp/src/join/mixed_join_size_kernel.cuh | 5 +- cpp/src/join/mixed_join_size_kernels_semi.cu | 5 +- cpp/src/reductions/all.cu | 8 +- cpp/src/reductions/any.cu | 8 +- cpp/src/strings/case.cu | 8 +- cpp/src/strings/search/find.cu | 7 +- cpp/src/strings/split/split.cuh | 7 +- cpp/src/text/minhash.cu | 7 +- .../device_atomics/device_atomics_test.cu | 91 ------------- cpp/tests/hash_map/map_test.cu | 5 +- 19 files changed, 122 insertions(+), 315 deletions(-) diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh index d611e94a4d7..1dd91dcd865 100644 --- a/cpp/include/cudf/detail/copy_if.cuh +++ b/cpp/include/cudf/detail/copy_if.cuh @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -44,6 +43,8 @@ #include +#include + #include namespace cudf { @@ -181,7 +182,9 @@ __launch_bounds__(block_size) __global__ if (wid > 0 && wid < last_warp) output_valid[valid_index] = valid_warp; else { - atomicOr(&output_valid[valid_index], valid_warp); + cuda::atomic_ref ref{ + output_valid[valid_index]}; + ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); } } @@ -190,7 +193,9 @@ __launch_bounds__(block_size) __global__ uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]); if (lane == 0 && valid_warp != 0) { tmp_warp_valid_counts += __popc(valid_warp); - atomicOr(&output_valid[valid_index + num_warps], valid_warp); + cuda::atomic_ref ref{ + output_valid[valid_index + num_warps]}; + ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed); } } } @@ -206,7 +211,8 @@ __launch_bounds__(block_size) __global__ cudf::detail::single_lane_block_sum_reduce(warp_valid_counts); if (threadIdx.x == 0) { // one thread computes and adds to null count - atomicAdd(output_null_count, block_sum - block_valid_count); + cuda::atomic_ref ref{*output_null_count}; + ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed); } } diff --git a/cpp/include/cudf/detail/utilities/device_atomics.cuh b/cpp/include/cudf/detail/utilities/device_atomics.cuh index c1fc96d6f43..c56e88f07a8 100644 --- a/cpp/include/cudf/detail/utilities/device_atomics.cuh +++ b/cpp/include/cudf/detail/utilities/device_atomics.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,7 +27,6 @@ * cudf::duration_us, cudf::duration_ns and bool * where CUDA atomic operations are, `atomicAdd`, `atomicMin`, `atomicMax`, * `atomicCAS`. - * `atomicAnd`, `atomicOr`, `atomicXor` are also supported for integer data types. * Also provides `cudf::genericAtomicOperation` which performs atomic operation * with the given binary operator. */ @@ -161,7 +160,6 @@ struct genericAtomicOperationImpl { // specialized functions for operators // `atomicAdd` supports int32, float, double (signed int64 is not supported.) // `atomicMin`, `atomicMax` support int32_t, int64_t -// `atomicAnd`, `atomicOr`, `atomicXor` support int32_t, int64_t template <> struct genericAtomicOperationImpl { using T = float; @@ -252,63 +250,6 @@ struct genericAtomicOperationImpl { return ret; } }; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) - { - return atomicAnd(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceAnd op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicAnd(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) - { - return atomicOr(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceOr op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicOr(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) - { - return atomicXor(addr, update_value); - } -}; - -template -struct genericAtomicOperationImpl { - __forceinline__ __device__ T operator()(T* addr, T const& update_value, DeviceXor op) - { - using T_int = long long int; - static_assert(sizeof(T) == sizeof(T_int)); - T ret = atomicXor(reinterpret_cast(addr), type_reinterpret(update_value)); - return ret; - } -}; // ----------------------------------------------------------------------- // the implementation of `typesAtomicCASImpl` template @@ -598,66 +539,3 @@ __forceinline__ __device__ T atomicCAS(T* address, T compare, T val) { return cudf::detail::typesAtomicCASImpl()(address, compare, val); } - -/** - * @brief Overloads for `atomicAnd` - * reads the `old` located at the `address` in global or shared memory, - * computes (old & val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicAnd` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. 
- * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicAnd(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceAnd{}); -} - -/** - * @brief Overloads for `atomicOr` - * reads the `old` located at the `address` in global or shared memory, - * computes (old | val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicOr` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. - * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicOr(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceOr{}); -} - -/** - * @brief Overloads for `atomicXor` - * reads the `old` located at the `address` in global or shared memory, - * computes (old ^ val), and stores the result back to memory at the same - * address. These three operations are performed in one atomic transaction. - * - * The supported types for `atomicXor` are: - * singed/unsigned integer 8/16/32/64 bits - * Cuda natively supports `sint32`, `uint32`, `sint64`, `uint64`. - * - * @param[in] address The address of old value in global or shared memory - * @param[in] val The value to be computed - * - * @returns The old value at `address` - */ -template , T>* = nullptr> -__forceinline__ __device__ T atomicXor(T* address, T val) -{ - return cudf::genericAtomicOperation(address, val, cudf::DeviceXor{}); -} diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh index 8490d3ee73b..46f424e051b 100644 --- a/cpp/include/cudf/detail/utilities/device_operators.cuh +++ b/cpp/include/cudf/detail/utilities/device_operators.cuh @@ -230,39 +230,6 @@ struct DeviceProduct { } }; -/** - * @brief binary `and` operator - */ -struct DeviceAnd { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(lhs & rhs) - { - return (lhs & rhs); - } -}; - -/** - * @brief binary `or` operator - */ -struct DeviceOr { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(lhs | rhs) - { - return (lhs | rhs); - } -}; - -/** - * @brief binary `xor` operator - */ -struct DeviceXor { - template >* = nullptr> - CUDF_HOST_DEVICE inline auto operator()(T const& lhs, T const& rhs) -> decltype(lhs ^ rhs) - { - return (lhs ^ rhs); - } -}; - /** * @brief Operator for calculating Lead/Lag window function. */ diff --git a/cpp/src/groupby/hash/multi_pass_kernels.cuh b/cpp/src/groupby/hash/multi_pass_kernels.cuh index 15a38029bc4..4bc73631732 100644 --- a/cpp/src/groupby/hash/multi_pass_kernels.cuh +++ b/cpp/src/groupby/hash/multi_pass_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,11 +20,12 @@ #include #include #include -#include #include #include #include +#include + #include namespace cudf { @@ -86,7 +87,8 @@ struct var_hash_functor { auto x = static_cast(source.element(source_index)); auto mean = static_cast(sum.element(target_index)) / group_size; Target result = (x - mean) * (x - mean) / (group_size - ddof); - atomicAdd(&target.element(target_index), result); + cuda::atomic_ref ref{target.element(target_index)}; + ref.fetch_add(result, cuda::std::memory_order_relaxed); // STD sqrt is applied in finalize() if (target_has_nulls and target.is_null(target_index)) { target.set_valid(target_index); } diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index 2b9f2a1b0fa..5acfba0a8bf 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -35,6 +34,8 @@ #include #include +#include + namespace { template struct packed { @@ -91,8 +92,8 @@ union pair_packer; template union pair_packer()>> { using packed_type = packed_t; - packed_type const packed; - pair_type const pair; + packed_type packed; + pair_type pair; __device__ pair_packer(pair_type _pair) : pair{_pair} {} @@ -268,16 +269,21 @@ class concurrent_unordered_map { __device__ std::enable_if_t(), insert_result> attempt_insert( value_type* const __restrict__ insert_location, value_type const& insert_pair) { - pair_packer const unused{thrust::make_pair(m_unused_key, m_unused_element)}; - pair_packer const new_pair{insert_pair}; - pair_packer const old{ - atomicCAS(reinterpret_cast::packed_type*>(insert_location), - unused.packed, - new_pair.packed)}; + pair_packer expected{thrust::make_pair(m_unused_key, m_unused_element)}; + pair_packer desired{insert_pair}; + + using packed_type = typename pair_packer::packed_type; - if (old.packed == unused.packed) { return insert_result::SUCCESS; } + auto* insert_ptr = reinterpret_cast(insert_location); + cuda::atomic_ref ref{*insert_ptr}; + auto const success = + ref.compare_exchange_strong(expected.packed, desired.packed, cuda::std::memory_order_relaxed); - if (m_equal(old.pair.first, insert_pair.first)) { return insert_result::DUPLICATE; } + if (success) { + return insert_result::SUCCESS; + } else if (m_equal(expected.pair.first, insert_pair.first)) { + return insert_result::DUPLICATE; + } return insert_result::CONTINUE; } @@ -292,16 +298,20 @@ class concurrent_unordered_map { __device__ std::enable_if_t(), insert_result> attempt_insert( value_type* const __restrict__ insert_location, value_type const& insert_pair) { - key_type const old_key{atomicCAS(&(insert_location->first), m_unused_key, insert_pair.first)}; + auto expected = m_unused_key; + cuda::atomic_ref ref{insert_location->first}; + auto const key_success = + ref.compare_exchange_strong(expected, insert_pair.first, cuda::std::memory_order_relaxed); // Hash bucket empty - if (m_unused_key == old_key) { + if (key_success) { insert_location->second = insert_pair.second; return insert_result::SUCCESS; } - // Key already exists - if (m_equal(old_key, insert_pair.first)) { return insert_result::DUPLICATE; } + else if (m_equal(expected, insert_pair.first)) { + return insert_result::DUPLICATE; + } return insert_result::CONTINUE; } diff --git a/cpp/src/hash/unordered_multiset.cuh b/cpp/src/hash/unordered_multiset.cuh index 6c807eca4c3..96e6728df81 100644 --- a/cpp/src/hash/unordered_multiset.cuh +++ b/cpp/src/hash/unordered_multiset.cuh @@ -18,7 +18,6 @@ 
#include -#include #include #include #include @@ -32,6 +31,8 @@ #include #include +#include + namespace cudf { namespace detail { /* @@ -95,16 +96,18 @@ class unordered_multiset { size_type* d_hash_bins_end = hash_bins_end.data(); Element* d_hash_data = hash_data.data(); - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_start, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - atomicAdd(d_hash_bins_start + tmp, size_type{1}); - } - }); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(col.size()), + [d_hash_bins_start, d_col, hasher] __device__(size_t idx) { + if (!d_col.is_null(idx)) { + Element e = d_col.element(idx); + size_type tmp = hasher(e) % (2 * d_col.size()); + cuda::atomic_ref ref{*(d_hash_bins_start + tmp)}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); + } + }); thrust::exclusive_scan(rmm::exec_policy(stream), hash_bins_start.begin(), @@ -116,17 +119,19 @@ class unordered_multiset { hash_bins_end.end(), hash_bins_start.begin()); - thrust::for_each(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(col.size()), - [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) { - if (!d_col.is_null(idx)) { - Element e = d_col.element(idx); - size_type tmp = hasher(e) % (2 * d_col.size()); - size_type offset = atomicAdd(d_hash_bins_end + tmp, size_type{1}); - d_hash_data[offset] = e; - } - }); + thrust::for_each( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(col.size()), + [d_hash_bins_end, d_hash_data, d_col, hasher] __device__(size_t idx) { + if (!d_col.is_null(idx)) { + Element e = d_col.element(idx); + size_type tmp = hasher(e) % (2 * d_col.size()); + cuda::atomic_ref ref{*(d_hash_bins_end + tmp)}; + size_type offset = ref.fetch_add(1, cuda::std::memory_order_relaxed); + d_hash_data[offset] = e; + } + }); return unordered_multiset(d_col.size(), std::move(hash_bins_start), std::move(hash_data)); } diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 29c7f3e105e..3a79d832d06 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,8 @@ #include #include +#include + #include #include @@ -232,8 +233,9 @@ reduce_to_column_tree(tree_meta_t& tree, auto parent_col_id = parent_col_ids[col_id]; if (parent_col_id != parent_node_sentinel and column_categories[parent_col_id] == node_t::NC_LIST) { - atomicMax(list_parents_children_max_row_offsets + parent_col_id, - max_row_offsets[col_id]); + cuda::atomic_ref ref{ + *(list_parents_children_max_row_offsets + parent_col_id)}; + ref.fetch_max(max_row_offsets[col_id], cuda::std::memory_order_relaxed); } }); thrust::gather_if( diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh index 30650f6769f..dc455ad9cef 100644 --- a/cpp/src/join/conditional_join_kernels.cuh +++ b/cpp/src/join/conditional_join_kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -106,7 +106,10 @@ __global__ void compute_conditional_join_output_size( std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); // Add block counter to global counter - if (threadIdx.x == 0) atomicAdd(output_size, block_counter); + if (threadIdx.x == 0) { + cuda::atomic_ref ref{*output_size}; + ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); + } } /** diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp index b78b55bdacc..cbccd78049a 100644 --- a/cpp/src/join/join_common_utils.hpp +++ b/cpp/src/join/join_common_utils.hpp @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include #include @@ -30,6 +29,8 @@ #include #include +#include + #include namespace cudf { diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 0b7596dbe6b..ef377dadc4b 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -95,7 +95,10 @@ __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size( std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); // Add block counter to global counter - if (threadIdx.x == 0) atomicAdd(output_size, block_counter); + if (threadIdx.x == 0) { + cuda::atomic_ref ref{*output_size}; + ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); + } } } // namespace detail diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu index 009669252fb..fd7bf0234e9 100644 --- a/cpp/src/join/mixed_join_size_kernels_semi.cu +++ b/cpp/src/join/mixed_join_size_kernels_semi.cu @@ -83,7 +83,10 @@ __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size_sem std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter); // Add block counter to global counter - if (threadIdx.x == 0) atomicAdd(output_size, block_counter); + if (threadIdx.x == 0) { + cuda::atomic_ref ref{*output_size}; + ref.fetch_add(block_counter, cuda::std::memory_order_relaxed); + } } template __global__ void compute_mixed_join_output_size_semi( diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu index c2f278532ac..4717c0673e3 100644 --- a/cpp/src/reductions/all.cu +++ b/cpp/src/reductions/all.cu @@ -16,7 +16,6 @@ #include "simple.cuh" -#include #include #include @@ -25,6 +24,8 @@ #include #include +#include + namespace cudf { namespace reduction { namespace detail { @@ -43,7 +44,10 @@ struct all_fn { struct all_true_fn { __device__ void operator()(size_type idx) { - if (*d_result && (iter[idx] != *d_result)) atomicAnd(d_result, false); + if (*d_result && (iter[idx] != *d_result)) { + cuda::atomic_ref ref{*d_result}; + ref.fetch_and(0, cuda::std::memory_order_relaxed); + } } Iterator iter; int32_t* d_result; diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu index d1b66d5c254..f3093df5ac7 100644 --- a/cpp/src/reductions/any.cu +++ b/cpp/src/reductions/any.cu @@ -16,7 +16,6 @@ #include "simple.cuh" -#include #include #include @@ -25,6 +24,8 @@ #include #include +#include + namespace cudf { namespace reduction { namespace detail { @@ -43,7 +44,10 @@ struct any_fn { struct any_true_fn { __device__ void operator()(size_type idx) { - if (!*d_result && (iter[idx] != *d_result)) atomicOr(d_result, true); + if (!*d_result && (iter[idx] != *d_result)) { + cuda::atomic_ref ref{*d_result}; + ref.fetch_or(1, cuda::std::memory_order_relaxed); + } } Iterator iter; int32_t* d_result; diff --git a/cpp/src/strings/case.cu 
b/cpp/src/strings/case.cu index 02660c46c63..c5fe7a19f53 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -33,6 +32,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -167,7 +168,10 @@ struct count_bytes_fn { size += converter.process_character(u8); } // this is every so slightly faster than using the cub::warp_reduce - if (size > 0) atomicAdd(d_offsets + str_idx, size); + if (size > 0) { + cuda::atomic_ref ref{*(d_offsets + str_idx)}; + ref.fetch_add(size, cuda::std::memory_order_relaxed); + } } }; diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu index f71c4b6b49e..e5ce88e7583 100644 --- a/cpp/src/strings/search/find.cu +++ b/cpp/src/strings/search/find.cu @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -37,6 +36,8 @@ #include #include +#include + namespace cudf { namespace strings { namespace detail { @@ -154,7 +155,9 @@ __global__ void finder_warp_parallel_fn(column_device_view const d_strings, // find stores the minimum position while rfind stores the maximum position // note that this was slightly faster than using cub::WarpReduce - forward ? atomicMin(d_results + str_idx, position) : atomicMax(d_results + str_idx, position); + cuda::atomic_ref ref{*(d_results + str_idx)}; + forward ? ref.fetch_min(position, cuda::std::memory_order_relaxed) + : ref.fetch_max(position, cuda::std::memory_order_relaxed); __syncwarp(); if (lane_idx == 0) { diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh index 31257a441a1..e76d8ac1c60 100644 --- a/cpp/src/strings/split/split.cuh +++ b/cpp/src/strings/split/split.cuh @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -37,6 +36,8 @@ #include #include +#include + namespace cudf::strings::detail { /** @@ -356,7 +357,9 @@ std::pair, rmm::device_uvector> split delimiter_count, [d_string_indices, d_delimiter_offsets] __device__(size_type idx) { auto const str_idx = d_string_indices[idx] - 1; - atomicAdd(d_delimiter_offsets + str_idx, 1); + cuda::atomic_ref ref{ + *(d_delimiter_offsets + str_idx)}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); }); // finally, convert the delimiter counts into offsets thrust::exclusive_scan(rmm::exec_policy(stream), diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index d2cc90bb971..6658d574dcc 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -41,6 +40,8 @@ #include +#include + namespace nvtext { namespace detail { namespace { @@ -92,7 +93,9 @@ struct minhash_fn { for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) { auto const hasher = cudf::detail::MurmurHash3_32{seeds[seed_idx]}; auto const hvalue = hasher(hash_str); - atomicMin(d_output + seed_idx, hvalue); + cuda::atomic_ref ref{ + *(d_output + seed_idx)}; + ref.fetch_min(hvalue, cuda::std::memory_order_relaxed); } } } diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu index d50db649354..24195362d92 100644 --- a/cpp/tests/device_atomics/device_atomics_test.cu +++ b/cpp/tests/device_atomics/device_atomics_test.cu @@ -256,95 +256,4 @@ TYPED_TEST(AtomicsTest, atomicCASRandom) this->atomic_test(input_array, is_cas_test, block_size, grid_size); } -template -__global__ void gpu_atomic_bitwiseOp_test(T* result, T* 
data, size_t size) -{ - size_t id = blockIdx.x * blockDim.x + threadIdx.x; - size_t step = blockDim.x * gridDim.x; - - for (; id < size; id += step) { - atomicAnd(&result[0], data[id]); - atomicOr(&result[1], data[id]); - atomicXor(&result[2], data[id]); - cudf::genericAtomicOperation(&result[3], data[id], cudf::DeviceAnd{}); - cudf::genericAtomicOperation(&result[4], data[id], cudf::DeviceOr{}); - cudf::genericAtomicOperation(&result[5], data[id], cudf::DeviceXor{}); - } -} - -template -struct AtomicsBitwiseOpTest : public cudf::test::BaseFixture { - void atomic_test(std::vector const& v_input, int block_size = 0, int grid_size = 1) - { - size_t vec_size = v_input.size(); - std::vector v(vec_size); - std::transform(v_input.begin(), v_input.end(), v.begin(), [](int x) { - T t(x); - return t; - }); - - thrust::host_vector identity(9, T{0}); // +3 elements padding for int8 tests - identity[0] = T(~0ull); - identity[3] = T(~0ull); - - T exact[3]; - exact[0] = std::accumulate( - v.begin(), v.end(), identity[0], [](T acc, uint64_t i) { return acc & T(i); }); - exact[1] = std::accumulate( - v.begin(), v.end(), identity[1], [](T acc, uint64_t i) { return acc | T(i); }); - exact[2] = std::accumulate( - v.begin(), v.end(), identity[2], [](T acc, uint64_t i) { return acc ^ T(i); }); - - auto dev_result = cudf::detail::make_device_uvector_sync( - identity, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - auto dev_data = cudf::detail::make_device_uvector_sync( - v, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); - - if (block_size == 0) { block_size = vec_size; } - - gpu_atomic_bitwiseOp_test<<>>( - reinterpret_cast(dev_result.data()), reinterpret_cast(dev_data.data()), vec_size); - - auto host_result = cudf::detail::make_host_vector_sync(dev_result, cudf::get_default_stream()); - - CUDF_CHECK_CUDA(cudf::get_default_stream().value()); - - // print_exact(exact, "exact"); - // print_exact(host_result.data(), "result"); - - EXPECT_EQ(host_result[0], exact[0]) << "atomicAnd test failed"; - EXPECT_EQ(host_result[1], exact[1]) << "atomicOr test failed"; - EXPECT_EQ(host_result[2], exact[2]) << "atomicXor test failed"; - EXPECT_EQ(host_result[3], exact[0]) << "atomicAnd test(2) failed"; - EXPECT_EQ(host_result[4], exact[1]) << "atomicOr test(2) failed"; - EXPECT_EQ(host_result[5], exact[2]) << "atomicXor test(2) failed"; - } - - [[maybe_unused]] void print_exact(T const* v, char const* msg) - { - std::cout << std::hex << std::showbase; - std::cout << "The " << msg << " = {" << +v[0] << ", " << +v[1] << ", " << +v[2] << "}" - << std::endl; - } -}; - -using BitwiseOpTestingTypes = - cudf::test::Types; - -TYPED_TEST_SUITE(AtomicsBitwiseOpTest, BitwiseOpTestingTypes); - -TYPED_TEST(AtomicsBitwiseOpTest, atomicBitwiseOps) -{ - { // test for AND, XOR - std::vector input_array( - {0xfcfc'fcfc'fcfc'fc7f, 0x7f'7f7f'7f7f'7ffc, 0xfffd'dffd'dffd'dfdf, 0x7f'7f7f'7f7f'7ffc}); - this->atomic_test(input_array); - } - { // test for OR, XOR - std::vector input_array( - {0x01, 0xfc02, 0x1d'ff03, 0x1100'a0b0'801d'0003, 0x8000'0000'0000'0000, 0x1d'ff03}); - this->atomic_test(input_array); - } -} - CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu index f38c5b3f58f..8d71c512c79 100644 --- a/cpp/tests/hash_map/map_test.cu +++ b/cpp/tests/hash_map/map_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -68,10 +68,7 @@ struct InsertTest : public cudf::test::BaseFixture { using TestTypes = ::testing::Types, key_value_types, - key_value_types, key_value_types, - key_value_types, - key_value_types, key_value_types, key_value_types>;
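
The change that repeats throughout this patch is swapping a raw `atomicAdd` on a global counter for a `cuda::atomic_ref` plus `fetch_add`. Below is a minimal, self-contained sketch of that shape, with hypothetical kernel and variable names, assuming device scope and relaxed ordering as in the hunks above; it is not code from the patch itself.

```cpp
#include <cuda/atomic>
#include <cstddef>

// Hypothetical kernel: count positive elements into a device-scope counter.
// Each thread accumulates locally and issues one relaxed fetch_add. The join
// size kernels above use the same shape, except the per-block sum comes from
// cub::BlockReduce and only threadIdx.x == 0 touches the global counter.
__global__ void count_positive(int const* data, std::size_t n, std::size_t* d_count)
{
  std::size_t local = 0;
  for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += std::size_t{blockDim.x} * gridDim.x) {
    local += (data[i] > 0);
  }
  cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*d_count};
  ref.fetch_add(local, cuda::std::memory_order_relaxed);
}
```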
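
In `copy_if.cuh` the warp-level validity words are now OR'd into the output bitmask through an `atomic_ref` instead of `atomicOr`. A sketch of that pattern follows; the kernel is hypothetical, assumes `blockDim.x` is a multiple of 32 and a zero-initialized mask, and uses `uint32_t` where cudf uses `bitmask_type`.

```cpp
#include <cuda/atomic>
#include <cstdint>

// Hypothetical kernel: build a validity bitmask from a boolean array.
// Each warp ballots its 32 validity bits; lane 0 ORs the word into the
// output mask with a relaxed fetch_or, mirroring the copy_if hunks.
__global__ void build_mask(bool const* valid, int n, std::uint32_t* mask)
{
  int const tid = blockIdx.x * blockDim.x + threadIdx.x;
  // No early return: __ballot_sync with a full mask needs the whole warp active.
  bool const is_valid = (tid < n) && valid[tid];
  std::uint32_t const word = __ballot_sync(0xffff'ffffu, is_valid);
  if ((tid < n) && (threadIdx.x % 32 == 0) && (word != 0)) {
    cuda::atomic_ref<std::uint32_t, cuda::thread_scope_device> ref{mask[tid / 32]};
    ref.fetch_or(word, cuda::std::memory_order_relaxed);  // was atomicOr
  }
}
```

The same one-line shape covers the `any`/`all` reductions above, where a 32-bit result flag is combined with `fetch_or(1, ...)` or `fetch_and(0, ...)`.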
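
`concurrent_unordered_map::attempt_insert` now claims a hash bucket with `compare_exchange_strong` instead of `atomicCAS`. The detail worth noting is that on failure the current bucket contents are written back into `expected`, which is exactly the information the old `atomicCAS` return value carried, so the duplicate-key check still works. Below is a reduced sketch with a hypothetical key type and free function; the real code additionally packs key and value into a single word when they fit (`pair_packer`) and compares keys with the map's equality functor.

```cpp
#include <cuda/atomic>
#include <cstdint>

enum class insert_result { CONTINUE, SUCCESS, DUPLICATE };

// Hypothetical device helper: try to claim one bucket addressed by `slot_key`.
__device__ insert_result try_claim_slot(std::uint32_t* slot_key,
                                        std::uint32_t unused_key,
                                        std::uint32_t new_key)
{
  auto expected = unused_key;
  cuda::atomic_ref<std::uint32_t, cuda::thread_scope_device> ref{*slot_key};
  // On failure, compare_exchange_strong stores the key currently in the slot
  // into `expected` (the value atomicCAS used to return).
  if (ref.compare_exchange_strong(expected, new_key, cuda::std::memory_order_relaxed)) {
    return insert_result::SUCCESS;  // bucket was empty and is now ours
  }
  if (expected == new_key) { return insert_result::DUPLICATE; }  // key already present
  return insert_result::CONTINUE;   // bucket holds a different key; keep probing
}
```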
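
Finally, the `atomicMin`/`atomicMax` calls in `find.cu`, `minhash.cu` and `json_column.cu` map onto `fetch_min`/`fetch_max`, which are libcudacxx extensions beyond the standard `std::atomic_ref` interface. A tiny sketch, again with hypothetical names; it assumes the caller pre-initialized the result to the numeric maximum.

```cpp
#include <cuda/atomic>
#include <cstdint>

// Hypothetical kernel: fold an array into a single minimum.
// *d_min must be initialized to 0xFFFFFFFF before launch.
__global__ void array_min(std::uint32_t const* values, int n, std::uint32_t* d_min)
{
  int const tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= n) { return; }
  cuda::atomic_ref<std::uint32_t, cuda::thread_scope_device> ref{*d_min};
  ref.fetch_min(values[tid], cuda::std::memory_order_relaxed);  // was atomicMin
}
```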