diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 566576e1e..32b2b16b1 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 #pragma once
+#include <cuco/detail/utility/cuda.cuh>
 
 #include <cub/block/block_reduce.cuh>
 
@@ -25,6 +26,8 @@ namespace cuco {
 namespace detail {
 namespace cg = cooperative_groups;
 
+CUCO_SUPPRESS_KERNEL_WARNINGS
+
 /**
  * @brief Inserts all key/value pairs in the range `[first, last)`.
  *
@@ -62,15 +65,15 @@ template <uint32_t block_size,
           typename atomicT,
           typename Hash,
           typename KeyEqual>
-__global__ void insert(InputIt first,
-                       InputIt last,
-                       viewT* submap_views,
-                       mutableViewT* submap_mutable_views,
-                       atomicT* num_successes,
-                       uint32_t insert_idx,
-                       uint32_t num_submaps,
-                       Hash hash,
-                       KeyEqual key_equal)
+CUCO_KERNEL void insert(InputIt first,
+                        InputIt last,
+                        viewT* submap_views,
+                        mutableViewT* submap_mutable_views,
+                        atomicT* num_successes,
+                        uint32_t insert_idx,
+                        uint32_t num_submaps,
+                        Hash hash,
+                        KeyEqual key_equal)
 {
   using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
@@ -147,15 +150,15 @@ template <uint32_t block_size,
           typename atomicT,
           typename Hash,
           typename KeyEqual>
-__global__ void insert(InputIt first,
-                       InputIt last,
-                       viewT* submap_views,
-                       mutableViewT* submap_mutable_views,
-                       atomicT** submap_num_successes,
-                       uint32_t insert_idx,
-                       uint32_t num_submaps,
-                       Hash hash,
-                       KeyEqual key_equal)
+CUCO_KERNEL void insert(InputIt first,
+                        InputIt last,
+                        viewT* submap_views,
+                        mutableViewT* submap_mutable_views,
+                        atomicT** submap_num_successes,
+                        uint32_t insert_idx,
+                        uint32_t num_submaps,
+                        Hash hash,
+                        KeyEqual key_equal)
 {
   using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
@@ -225,13 +228,13 @@ template <uint32_t block_size,
           typename atomicT,
           typename Hash,
           typename KeyEqual>
-__global__ void erase(InputIt first,
-                      InputIt last,
-                      mutableViewT* submap_mutable_views,
-                      atomicT** submap_num_successes,
-                      uint32_t num_submaps,
-                      Hash hash,
-                      KeyEqual key_equal)
+CUCO_KERNEL void erase(InputIt first,
+                       InputIt last,
+                       mutableViewT* submap_mutable_views,
+                       atomicT** submap_num_successes,
+                       uint32_t num_submaps,
+                       Hash hash,
+                       KeyEqual key_equal)
 {
   extern __shared__ unsigned long long submap_block_num_successes[];
 
@@ -296,13 +299,13 @@ template <uint32_t block_size,
           typename atomicT,
           typename Hash,
           typename KeyEqual>
-__global__ void erase(InputIt first,
-                      InputIt last,
-                      mutableViewT* submap_mutable_views,
-                      atomicT** submap_num_successes,
-                      uint32_t num_submaps,
-                      Hash hash,
-                      KeyEqual key_equal)
+CUCO_KERNEL void erase(InputIt first,
+                       InputIt last,
+                       mutableViewT* submap_mutable_views,
+                       atomicT** submap_num_successes,
+                       uint32_t num_submaps,
+                       Hash hash,
+                       KeyEqual key_equal)
 {
   extern __shared__ unsigned long long submap_block_num_successes[];
 
@@ -368,13 +371,13 @@ template <uint32_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void find(InputIt first,
-                     InputIt last,
-                     OutputIt output_begin,
-                     viewT* submap_views,
-                     uint32_t num_submaps,
-                     Hash hash,
-                     KeyEqual key_equal)
+CUCO_KERNEL void find(InputIt first,
+                      InputIt last,
+                      OutputIt output_begin,
+                      viewT* submap_views,
+                      uint32_t num_submaps,
+                      Hash hash,
+                      KeyEqual key_equal)
 {
   auto tid                  = blockDim.x * blockIdx.x + threadIdx.x;
   auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel();
@@ -443,13 +446,13 @@ template <uint32_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void find(InputIt first,
-                     InputIt last,
-                     OutputIt output_begin,
-                     viewT* submap_views,
-                     uint32_t num_submaps,
-                     Hash hash,
-                     KeyEqual key_equal)
+CUCO_KERNEL void find(InputIt first,
+                      InputIt last,
+                      OutputIt output_begin,
+                      viewT* submap_views,
+                      uint32_t num_submaps,
+                      Hash hash,
+                      KeyEqual key_equal)
 {
   auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
   auto tid                  = blockDim.x * blockIdx.x + threadIdx.x;
@@ -514,13 +517,13 @@ template <uint32_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void contains(InputIt first,
-                         InputIt last,
-                         OutputIt output_begin,
-                         viewT* submap_views,
-                         uint32_t num_submaps,
-                         Hash hash,
-                         KeyEqual key_equal)
+CUCO_KERNEL void contains(InputIt first,
+                          InputIt last,
+                          OutputIt output_begin,
+                          viewT* submap_views,
+                          uint32_t num_submaps,
+                          Hash hash,
+                          KeyEqual key_equal)
 {
   auto tid = blockDim.x * blockIdx.x + threadIdx.x;
   __shared__ bool writeBuffer[block_size];
@@ -582,13 +585,13 @@ template <uint32_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void contains(InputIt first,
-                         InputIt last,
-                         OutputIt output_begin,
-                         viewT* submap_views,
-                         uint32_t num_submaps,
-                         Hash hash,
-                         KeyEqual key_equal)
+CUCO_KERNEL void contains(InputIt first,
+                          InputIt last,
+                          OutputIt output_begin,
+                          viewT* submap_views,
+                          uint32_t num_submaps,
+                          Hash hash,
+                          KeyEqual key_equal)
 {
   auto tile    = cg::tiled_partition<tile_size>(cg::this_thread_block());
   auto tid     = blockDim.x * blockIdx.x + threadIdx.x;
diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh
index 6772014df..7aff8a1c2 100644
--- a/include/cuco/detail/open_addressing/kernels.cuh
+++ b/include/cuco/detail/open_addressing/kernels.cuh
@@ -27,6 +27,7 @@
 
 namespace cuco {
 namespace detail {
+CUCO_SUPPRESS_KERNEL_WARNINGS
 
 /**
  * @brief Inserts all elements in the range `[first, first + n)` and returns the number of
@@ -61,12 +62,12 @@ template <int32_t CGSize,
           typename Predicate,
           typename AtomicT,
           typename Ref>
-__global__ void insert_if_n(InputIt first,
-                            cuco::detail::index_type n,
-                            StencilIt stencil,
-                            Predicate pred,
-                            AtomicT* num_successes,
-                            Ref ref)
+CUCO_KERNEL void insert_if_n(InputIt first,
+                             cuco::detail::index_type n,
+                             StencilIt stencil,
+                             Predicate pred,
+                             AtomicT* num_successes,
+                             Ref ref)
 {
   using BlockReduce = cub::BlockReduce<typename Ref::size_type, BlockSize>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
@@ -127,7 +128,7 @@ template <int32_t CGSize,
           typename StencilIt,
           typename Predicate,
           typename Ref>
-__global__ void insert_if_n(
+CUCO_KERNEL void insert_if_n(
   InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
 {
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
@@ -162,7 +163,7 @@ __global__ void insert_if_n(
  * @param ref Non-owning container device ref used to access the slot storage
  */
 template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Ref>
-__global__ void erase(InputIt first, cuco::detail::index_type n, Ref ref)
+CUCO_KERNEL void erase(InputIt first, cuco::detail::index_type n, Ref ref)
 {
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
   auto idx               = cuco::detail::global_thread_id() / CGSize;
@@ -212,12 +213,12 @@ template <int32_t CGSize,
           typename Predicate,
           typename OutputIt,
           typename Ref>
-__global__ void contains_if_n(InputIt first,
-                              cuco::detail::index_type n,
-                              StencilIt stencil,
-                              Predicate pred,
-                              OutputIt output_begin,
-                              Ref ref)
+CUCO_KERNEL void contains_if_n(InputIt first,
+                               cuco::detail::index_type n,
+                               StencilIt stencil,
+                               Predicate pred,
+                               OutputIt output_begin,
+                               Ref ref)
 {
   namespace cg = cooperative_groups;
 
@@ -267,7 +268,7 @@ __global__ void contains_if_n(InputIt first,
  * @param count Number of filled slots
  */
 template <int32_t BlockSize, typename StorageRef, typename Predicate, typename AtomicT>
-__global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
+CUCO_KERNEL void size(StorageRef storage, Predicate is_filled, AtomicT* count)
 {
   using size_type = typename StorageRef::size_type;
 
@@ -293,9 +294,9 @@ __global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
 }
 
 template <int32_t BlockSize, typename ContainerRef, typename Predicate>
-__global__ void rehash(typename ContainerRef::storage_ref_type storage_ref,
-                       ContainerRef container_ref,
-                       Predicate is_filled)
+CUCO_KERNEL void rehash(typename ContainerRef::storage_ref_type storage_ref,
+                        ContainerRef container_ref,
+                        Predicate is_filled)
 {
   namespace cg = cooperative_groups;
 
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index 76923f858..6e034567b 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -28,6 +28,7 @@
 namespace cuco {
 namespace static_map_ns {
 namespace detail {
+CUCO_SUPPRESS_KERNEL_WARNINGS
 
 /**
  * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
@@ -48,7 +49,7 @@ namespace detail {
  * @param ref Non-owning container device ref used to access the slot storage
  */
 template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Ref>
-__global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref ref)
+CUCO_KERNEL void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref ref)
 {
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
   auto idx               = cuco::detail::global_thread_id() / CGSize;
@@ -87,7 +88,7 @@ __global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref
  * @param ref Non-owning map device ref used to access the slot storage
  */
 template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename OutputIt, typename Ref>
-__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
+CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
 {
   namespace cg = cooperative_groups;
 
diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh
index aa4f56daa..3d2a9cfd3 100644
--- a/include/cuco/detail/static_map_kernels.cuh
+++ b/include/cuco/detail/static_map_kernels.cuh
@@ -24,6 +24,7 @@
 namespace cuco::legacy::detail {
 namespace cg = cooperative_groups;
 
+CUCO_SUPPRESS_KERNEL_WARNINGS
 /**
  * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`.
  *
@@ -47,7 +48,7 @@ template <std::size_t block_size,
           typename Key,
           typename Value,
           typename pair_atomic_type>
-__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size)
+CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size)
 {
   int64_t const loop_stride = gridDim.x * block_size;
   int64_t idx               = block_size * blockIdx.x + threadIdx.x;
@@ -85,7 +86,7 @@ template <std::size_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void insert(
+CUCO_KERNEL void insert(
   InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal)
 {
   typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
@@ -140,7 +141,7 @@ template <std::size_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void insert(
+CUCO_KERNEL void insert(
   InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal)
 {
   typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
@@ -194,7 +195,7 @@ template <std::size_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void erase(
+CUCO_KERNEL void erase(
   InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal)
 {
   using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
@@ -247,7 +248,7 @@ template <std::size_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void erase(
+CUCO_KERNEL void erase(
   InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal)
 {
   typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
@@ -311,14 +312,14 @@ template <std::size_t block_size,
           typename Predicate,
           typename Hash,
           typename KeyEqual>
-__global__ void insert_if_n(InputIt first,
-                            int64_t n,
-                            atomicT* num_successes,
-                            viewT view,
-                            StencilIt stencil,
-                            Predicate pred,
-                            Hash hash,
-                            KeyEqual key_equal)
+CUCO_KERNEL void insert_if_n(InputIt first,
+                             int64_t n,
+                             atomicT* num_successes,
+                             viewT view,
+                             StencilIt stencil,
+                             Predicate pred,
+                             Hash hash,
+                             KeyEqual key_equal)
 {
   typedef cub::BlockReduce<std::size_t, block_size> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
@@ -375,7 +376,7 @@ template <std::size_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void find(
+CUCO_KERNEL void find(
   InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal)
 {
   int64_t const loop_stride = gridDim.x * block_size;
@@ -437,7 +438,7 @@ template <std::size_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void find(
+CUCO_KERNEL void find(
   InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal)
 {
   auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
@@ -494,7 +495,7 @@ template <std::size_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void contains(
+CUCO_KERNEL void contains(
   InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal)
 {
   int64_t const loop_stride = gridDim.x * block_size;
@@ -551,7 +552,7 @@ template <std::size_t block_size,
           typename viewT,
           typename Hash,
           typename KeyEqual>
-__global__ void contains(
+CUCO_KERNEL void contains(
   InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal)
 {
   auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh
index 67fb36045..5036c3f7f 100644
--- a/include/cuco/detail/static_multimap/kernels.cuh
+++ b/include/cuco/detail/static_multimap/kernels.cuh
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <cuco/detail/utility/cuda.cuh>
 #include <cuco/pair.cuh>
 
 #include <thrust/type_traits/is_contiguous_iterator.h>
@@ -29,6 +30,7 @@ namespace cuco {
 namespace detail {
 namespace cg = cooperative_groups;
 
+CUCO_SUPPRESS_KERNEL_WARNINGS
 /**
  * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`.
  *
@@ -51,7 +53,7 @@ template <typename atomic_key_type,
           typename Key,
           typename Value,
           typename pair_atomic_type>
-__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size)
+CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size)
 {
   int64_t const loop_stride = gridDim.x * blockDim.x;
   int64_t idx               = threadIdx.x + blockIdx.x * blockDim.x;
@@ -82,7 +84,7 @@ __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_
  * @param view Mutable device view used to access the hash map's slot storage
  */
 template <uint32_t block_size, uint32_t tile_size, typename InputIt, typename viewT>
-__global__ void insert(InputIt first, int64_t n, viewT view)
+CUCO_KERNEL void insert(InputIt first, int64_t n, viewT view)
 {
   auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
@@ -130,7 +132,7 @@ template <uint32_t block_size,
           typename StencilIt,
           typename viewT,
           typename Predicate>
-__global__ void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred)
+CUCO_KERNEL void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred)
 {
   auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
@@ -177,7 +179,7 @@ template <bool is_pair_contains,
           typename OutputIt,
           typename viewT,
           typename Equal>
-__global__ void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal)
+CUCO_KERNEL void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal)
 {
   auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
@@ -235,7 +237,7 @@ template <uint32_t block_size,
           typename atomicT,
           typename viewT,
           typename KeyEqual>
-__global__ void count(
+CUCO_KERNEL void count(
   InputIt first, int64_t n, atomicT* num_matches, viewT view, KeyEqual key_equal)
 {
   auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
@@ -294,7 +296,7 @@ template <uint32_t block_size,
           typename atomicT,
           typename viewT,
           typename PairEqual>
-__global__ void pair_count(
+CUCO_KERNEL void pair_count(
   InputIt first, int64_t n, atomicT* num_matches, viewT view, PairEqual pair_equal)
 {
   auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
@@ -363,12 +365,12 @@ template <uint32_t block_size,
           typename atomicT,
           typename viewT,
           typename KeyEqual>
-__global__ void retrieve(InputIt first,
-                         int64_t n,
-                         OutputIt output_begin,
-                         atomicT* num_matches,
-                         viewT view,
-                         KeyEqual key_equal)
+CUCO_KERNEL void retrieve(InputIt first,
+                          int64_t n,
+                          OutputIt output_begin,
+                          atomicT* num_matches,
+                          viewT view,
+                          KeyEqual key_equal)
 {
   using pair_type = typename viewT::value_type;
 
@@ -476,13 +478,13 @@ template <uint32_t block_size,
           typename atomicT,
           typename viewT,
           typename PairEqual>
-__global__ void pair_retrieve(InputIt first,
-                              int64_t n,
-                              OutputIt1 probe_output_begin,
-                              OutputIt2 contained_output_begin,
-                              atomicT* num_matches,
-                              viewT view,
-                              PairEqual pair_equal)
+CUCO_KERNEL void pair_retrieve(InputIt first,
+                               int64_t n,
+                               OutputIt1 probe_output_begin,
+                               OutputIt2 contained_output_begin,
+                               atomicT* num_matches,
+                               viewT view,
+                               PairEqual pair_equal)
 {
   using pair_type = typename viewT::value_type;
 
@@ -549,6 +551,5 @@ __global__ void pair_retrieve(InputIt first,
                              contained_output_begin);
   }
 }
-
 }  // namespace detail
 }  // namespace cuco
diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh
index dce4dc8ce..cce11b1b5 100644
--- a/include/cuco/detail/static_set/kernels.cuh
+++ b/include/cuco/detail/static_set/kernels.cuh
@@ -30,6 +30,7 @@ namespace cuco {
 namespace static_set_ns {
 namespace detail {
 
+CUCO_SUPPRESS_KERNEL_WARNINGS
 /**
  * @brief Finds the equivalent set elements of all keys in the range `[first, last)`.
  *
@@ -50,7 +51,7 @@ namespace detail {
  * @param ref Non-owning set device ref used to access the slot storage
  */
 template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename OutputIt, typename Ref>
-__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
+CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
 {
   namespace cg = cooperative_groups;
 
diff --git a/include/cuco/detail/storage/kernels.cuh b/include/cuco/detail/storage/kernels.cuh
index 6b3b10260..55e73bb6f 100644
--- a/include/cuco/detail/storage/kernels.cuh
+++ b/include/cuco/detail/storage/kernels.cuh
@@ -22,6 +22,8 @@
 namespace cuco {
 namespace detail {
 
+CUCO_SUPPRESS_KERNEL_WARNINGS
+
 /**
  * @brief Initializes each slot in the window storage to contain `value`.
  *
@@ -32,9 +34,9 @@ namespace detail {
  * @param value Value to which all values in `slots` are initialized
  */
 template <typename WindowT>
-__global__ void initialize(WindowT* windows,
-                           cuco::detail::index_type n,
-                           typename WindowT::value_type value)
+CUCO_KERNEL void initialize(WindowT* windows,
+                            cuco::detail::index_type n,
+                            typename WindowT::value_type value)
 {
   auto const loop_stride = cuco::detail::grid_stride();
   auto idx               = cuco::detail::global_thread_id();
diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh
index c92ab60b2..c08041693 100644
--- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh
+++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh
@@ -26,6 +26,7 @@ namespace cuco {
 namespace experimental {
 namespace detail {
 
+CUCO_SUPPRESS_KERNEL_WARNINGS
 /*
  * @brief Test bits for a range of keys
  *
@@ -41,10 +42,10 @@ namespace detail {
  * @param num_keys Number of input keys
  */
 template <typename BitsetRef, typename KeyIt, typename OutputIt>
-__global__ void bitset_test_kernel(BitsetRef ref,
-                                   KeyIt keys,
-                                   OutputIt outputs,
-                                   cuco::detail::index_type num_keys)
+CUCO_KERNEL void bitset_test_kernel(BitsetRef ref,
+                                    KeyIt keys,
+                                    OutputIt outputs,
+                                    cuco::detail::index_type num_keys)
 {
   auto key_id       = cuco::detail::global_thread_id();
   auto const stride = cuco::detail::grid_stride();
@@ -70,10 +71,10 @@ __global__ void bitset_test_kernel(BitsetRef ref,
  * @param num_keys Number of input keys
  */
 template <typename BitsetRef, typename KeyIt, typename OutputIt>
-__global__ void bitset_rank_kernel(BitsetRef ref,
-                                   KeyIt keys,
-                                   OutputIt outputs,
-                                   cuco::detail::index_type num_keys)
+CUCO_KERNEL void bitset_rank_kernel(BitsetRef ref,
+                                    KeyIt keys,
+                                    OutputIt outputs,
+                                    cuco::detail::index_type num_keys)
 {
   auto key_id       = cuco::detail::global_thread_id();
   auto const stride = cuco::detail::grid_stride();
@@ -99,10 +100,10 @@ __global__ void bitset_rank_kernel(BitsetRef ref,
  * @param num_keys Number of input keys
  */
 template <typename BitsetRef, typename KeyIt, typename OutputIt>
-__global__ void bitset_select_kernel(BitsetRef ref,
-                                     KeyIt keys,
-                                     OutputIt outputs,
-                                     cuco::detail::index_type num_keys)
+CUCO_KERNEL void bitset_select_kernel(BitsetRef ref,
+                                      KeyIt keys,
+                                      OutputIt outputs,
+                                      cuco::detail::index_type num_keys)
 {
   auto key_id       = cuco::detail::global_thread_id();
   auto const stride = cuco::detail::grid_stride();
@@ -125,10 +126,10 @@ __global__ void bitset_select_kernel(BitsetRef ref,
  * @param flip_bits Boolean to request negation of words before counting bits
  */
 template <typename WordType, typename SizeType>
-__global__ void bit_counts_kernel(WordType const* words,
-                                  SizeType* bit_counts,
-                                  cuco::detail::index_type num_words,
-                                  bool flip_bits)
+CUCO_KERNEL void bit_counts_kernel(WordType const* words,
+                                   SizeType* bit_counts,
+                                   cuco::detail::index_type num_words,
+                                   bool flip_bits)
 {
   auto word_id      = cuco::detail::global_thread_id();
   auto const stride = cuco::detail::grid_stride();
@@ -157,11 +158,11 @@ __global__ void bit_counts_kernel(WordType const* words,
  * @param words_per_block Number of words in each block
  */
 template <typename SizeType>
-__global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts,
-                                                    rank* ranks,
-                                                    SizeType num_words,
-                                                    SizeType num_blocks,
-                                                    SizeType words_per_block)
+CUCO_KERNEL void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts,
+                                                     rank* ranks,
+                                                     SizeType num_words,
+                                                     SizeType num_blocks,
+                                                     SizeType words_per_block)
 {
   auto rank_id      = cuco::detail::global_thread_id();
   auto const stride = cuco::detail::grid_stride();
@@ -200,11 +201,11 @@ __global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_c
  * @param bits_per_block Number of bits in each block
  */
 template <typename SizeType>
-__global__ void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts,
-                                                SizeType* select_markers,
-                                                SizeType num_blocks,
-                                                SizeType words_per_block,
-                                                SizeType bits_per_block)
+CUCO_KERNEL void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts,
+                                                 SizeType* select_markers,
+                                                 SizeType num_blocks,
+                                                 SizeType words_per_block,
+                                                 SizeType bits_per_block)
 {
   auto block_id     = cuco::detail::global_thread_id();
   auto const stride = cuco::detail::grid_stride();
diff --git a/include/cuco/detail/utility/cuda.cuh b/include/cuco/detail/utility/cuda.cuh
index 6e5f13ff7..d4838d8da 100644
--- a/include/cuco/detail/utility/cuda.cuh
+++ b/include/cuco/detail/utility/cuda.cuh
@@ -17,6 +17,22 @@
 
 #include <cuco/detail/utility/cuda.hpp>
 
+#if defined(CUCO_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION)
+#define CUCO_SUPPRESS_KERNEL_WARNINGS
+#elif defined(__NVCC__) && (defined(__GNUC__) || defined(__clang__))
+// handle when nvcc is the CUDA compiler and gcc or clang is host
+#define CUCO_SUPPRESS_KERNEL_WARNINGS _Pragma("nv_diag_suppress 1407")
+_Pragma("GCC diagnostic ignored \"-Wattributes\"")
+#elif defined(__clang__)
+// handle when clang is the CUDA compiler
+#define CUCO_SUPPRESS_KERNEL_WARNINGS _Pragma("clang diagnostic ignored \"-Wattributes\"")
+#elif defined(__NVCOMPILER)
+#define CUCO_SUPPRESS_KERNEL_WARNINGS #pragma diag_suppress attribute_requires_external_linkage
+#endif
+
+#ifndef CUCO_KERNEL
+#define CUCO_KERNEL __attribute__((visibility("hidden"))) __global__
+#endif
 namespace cuco {
 namespace detail {