Mark all cuco kernels as static so they have hidden visibility #422

Merged · 4 commits · Jan 19, 2024
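This PR replaces the explicit `__global__` qualifier on every cuco kernel with a `CUCO_KERNEL` macro provided by `cuco/detail/utility/cuda.cuh`, and adds a `CUCO_SUPPRESS_KERNEL_WARNINGS` line to each kernel header. The macro definitions themselves are not part of this diff; going by the PR title, a minimal sketch of what `CUCO_KERNEL` could expand to is shown below (the header contents here are an assumption for illustration, not the actual cuco code):

```cuda
// Hypothetical sketch of the macro this PR introduces -- not the real
// cuco/detail/utility/cuda.cuh. Internal linkage means each translation unit
// that includes a kernel header owns a private, hidden copy of every kernel it
// instantiates, so no kernel symbol is exported from shared libraries that
// embed cuco.
#define CUCO_KERNEL static __global__
```

With the default external linkage, two shared libraries that each embed cuco and are loaded into the same process can have their header-instantiated kernels resolve to a single weak symbol, even if the libraries were built with different compiler options; internal linkage keeps each library's kernels private and avoids that collision.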
123 changes: 63 additions & 60 deletions include/cuco/detail/dynamic_map_kernels.cuh
@@ -14,6 +14,7 @@
* limitations under the License.
*/
#pragma once
#include <cuco/detail/utility/cuda.cuh>

#include <cub/block/block_reduce.cuh>

@@ -25,6 +26,8 @@ namespace cuco {
namespace detail {
namespace cg = cooperative_groups;

CUCO_SUPPRESS_KERNEL_WARNINGS

/**
* @brief Inserts all key/value pairs in the range `[first, last)`.
*
@@ -62,15 +65,15 @@ template <uint32_t block_size,
typename atomicT,
typename Hash,
typename KeyEqual>
__global__ void insert(InputIt first,
InputIt last,
viewT* submap_views,
mutableViewT* submap_mutable_views,
atomicT* num_successes,
uint32_t insert_idx,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void insert(InputIt first,
InputIt last,
viewT* submap_views,
mutableViewT* submap_mutable_views,
atomicT* num_successes,
uint32_t insert_idx,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
__shared__ typename BlockReduce::TempStorage temp_storage;
@@ -147,15 +150,15 @@ template <uint32_t block_size,
typename atomicT,
typename Hash,
typename KeyEqual>
__global__ void insert(InputIt first,
InputIt last,
viewT* submap_views,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t insert_idx,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void insert(InputIt first,
InputIt last,
viewT* submap_views,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t insert_idx,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
__shared__ typename BlockReduce::TempStorage temp_storage;
@@ -225,13 +228,13 @@ template <uint32_t block_size,
typename atomicT,
typename Hash,
typename KeyEqual>
__global__ void erase(InputIt first,
InputIt last,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void erase(InputIt first,
InputIt last,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
extern __shared__ unsigned long long submap_block_num_successes[];

@@ -296,13 +299,13 @@ template <uint32_t block_size,
typename atomicT,
typename Hash,
typename KeyEqual>
__global__ void erase(InputIt first,
InputIt last,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void erase(InputIt first,
InputIt last,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
extern __shared__ unsigned long long submap_block_num_successes[];

@@ -368,13 +371,13 @@ template <uint32_t block_size,
typename viewT,
typename Hash,
typename KeyEqual>
__global__ void find(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void find(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel();
@@ -443,13 +446,13 @@ template <uint32_t block_size,
typename viewT,
typename Hash,
typename KeyEqual>
__global__ void find(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void find(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
@@ -514,13 +517,13 @@ template <uint32_t block_size,
typename viewT,
typename Hash,
typename KeyEqual>
__global__ void contains(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void contains(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
__shared__ bool writeBuffer[block_size];
@@ -582,13 +585,13 @@ template <uint32_t block_size,
typename viewT,
typename Hash,
typename KeyEqual>
__global__ void contains(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void contains(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
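Each modified header also gains a `CUCO_SUPPRESS_KERNEL_WARNINGS` statement at namespace scope, presumably to silence the diagnostic that NVCC emits when a `__global__` function is declared `static`. Its definition is likewise outside this diff; one possible sketch, with the pragma spelling and diagnostic number as placeholder assumptions, is:

```cuda
// Hypothetical sketch -- not the actual cuco definition. The diagnostic number
// below is a placeholder; the real header would name whatever warning the
// static __global__ declarations trigger on the supported compilers.
#if defined(__CUDACC__)
#define CUCO_SUPPRESS_KERNEL_WARNINGS _Pragma("nv_diag_suppress 1407")
#else
#define CUCO_SUPPRESS_KERNEL_WARNINGS
#endif
```

Placing the macro once near the top of each header keeps the suppression next to the kernels it covers instead of annotating every declaration.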
37 changes: 19 additions & 18 deletions include/cuco/detail/open_addressing/kernels.cuh
@@ -27,6 +27,7 @@

namespace cuco {
namespace detail {
CUCO_SUPPRESS_KERNEL_WARNINGS

/**
* @brief Inserts all elements in the range `[first, first + n)` and returns the number of
@@ -61,12 +62,12 @@ template <int32_t CGSize,
typename Predicate,
typename AtomicT,
typename Ref>
__global__ void insert_if_n(InputIt first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
AtomicT* num_successes,
Ref ref)
CUCO_KERNEL void insert_if_n(InputIt first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
AtomicT* num_successes,
Ref ref)
{
using BlockReduce = cub::BlockReduce<typename Ref::size_type, BlockSize>;
__shared__ typename BlockReduce::TempStorage temp_storage;
@@ -127,7 +128,7 @@ template <int32_t CGSize,
typename StencilIt,
typename Predicate,
typename Ref>
__global__ void insert_if_n(
CUCO_KERNEL void insert_if_n(
InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
{
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
@@ -162,7 +163,7 @@ __global__ void insert_if_n(
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Ref>
__global__ void erase(InputIt first, cuco::detail::index_type n, Ref ref)
CUCO_KERNEL void erase(InputIt first, cuco::detail::index_type n, Ref ref)
{
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;
@@ -212,12 +213,12 @@ template <int32_t CGSize,
typename Predicate,
typename OutputIt,
typename Ref>
__global__ void contains_if_n(InputIt first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
OutputIt output_begin,
Ref ref)
CUCO_KERNEL void contains_if_n(InputIt first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
OutputIt output_begin,
Ref ref)
{
namespace cg = cooperative_groups;

@@ -267,7 +268,7 @@ __global__ void contains_if_n(InputIt first,
* @param count Number of filled slots
*/
template <int32_t BlockSize, typename StorageRef, typename Predicate, typename AtomicT>
__global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
CUCO_KERNEL void size(StorageRef storage, Predicate is_filled, AtomicT* count)
{
using size_type = typename StorageRef::size_type;

@@ -293,9 +294,9 @@ __global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
}

template <int32_t BlockSize, typename ContainerRef, typename Predicate>
__global__ void rehash(typename ContainerRef::storage_ref_type storage_ref,
ContainerRef container_ref,
Predicate is_filled)
CUCO_KERNEL void rehash(typename ContainerRef::storage_ref_type storage_ref,
ContainerRef container_ref,
Predicate is_filled)
{
namespace cg = cooperative_groups;

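The change in `open_addressing/kernels.cuh` follows the same pattern as the dynamic-map kernels: function templates defined in a header, now with internal linkage. As a standalone illustration of what that buys (this toy header is not part of cuco), a header-defined kernel written in the same style would look like:

```cuda
// toy_kernels.cuh -- illustrative only, not a cuco header.
// Without `static`, each instantiation of a header-defined kernel template is
// a weak symbol with default visibility and can be merged across shared
// library boundaries at load time. With `static` (the CUCO_KERNEL pattern in
// this PR), every including translation unit keeps its own hidden copy and no
// kernel symbol is exported.
#pragma once
#include <cstddef>

template <typename T>
static __global__ void fill(T* data, T value, std::size_t n)
{
  std::size_t const idx = std::size_t{blockIdx.x} * blockDim.x + threadIdx.x;
  if (idx < n) { data[idx] = value; }
}
```

Since cuco is header-only, call sites are unaffected: each translation unit that includes these headers launches its own copy of the kernels it instantiates.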
5 changes: 3 additions & 2 deletions include/cuco/detail/static_map/kernels.cuh
@@ -28,6 +28,7 @@
namespace cuco {
namespace static_map_ns {
namespace detail {
CUCO_SUPPRESS_KERNEL_WARNINGS

/**
* @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
@@ -48,7 +49,7 @@ namespace detail {
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Ref>
__global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref ref)
CUCO_KERNEL void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref ref)
{
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;
@@ -87,7 +88,7 @@ __global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref
* @param ref Non-owning map device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename OutputIt, typename Ref>
__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
{
namespace cg = cooperative_groups;
