diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 566576e1e..32b2b16b1 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -14,6 +14,7 @@ * limitations under the License. */ #pragma once +#include #include @@ -25,6 +26,8 @@ namespace cuco { namespace detail { namespace cg = cooperative_groups; +CUCO_SUPPRESS_KERNEL_WARNINGS + /** * @brief Inserts all key/value pairs in the range `[first, last)`. * @@ -62,15 +65,15 @@ template -__global__ void insert(InputIt first, - InputIt last, - viewT* submap_views, - mutableViewT* submap_mutable_views, - atomicT* num_successes, - uint32_t insert_idx, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) +CUCO_KERNEL void insert(InputIt first, + InputIt last, + viewT* submap_views, + mutableViewT* submap_mutable_views, + atomicT* num_successes, + uint32_t insert_idx, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) { using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -147,15 +150,15 @@ template -__global__ void insert(InputIt first, - InputIt last, - viewT* submap_views, - mutableViewT* submap_mutable_views, - atomicT** submap_num_successes, - uint32_t insert_idx, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) +CUCO_KERNEL void insert(InputIt first, + InputIt last, + viewT* submap_views, + mutableViewT* submap_mutable_views, + atomicT** submap_num_successes, + uint32_t insert_idx, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) { using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -225,13 +228,13 @@ template -__global__ void erase(InputIt first, - InputIt last, - mutableViewT* submap_mutable_views, - atomicT** submap_num_successes, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) +CUCO_KERNEL void erase(InputIt first, + InputIt last, + mutableViewT* submap_mutable_views, + atomicT** submap_num_successes, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) { extern __shared__ unsigned long long submap_block_num_successes[]; @@ -296,13 +299,13 @@ template -__global__ void erase(InputIt first, - InputIt last, - mutableViewT* submap_mutable_views, - atomicT** submap_num_successes, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) +CUCO_KERNEL void erase(InputIt first, + InputIt last, + mutableViewT* submap_mutable_views, + atomicT** submap_num_successes, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) { extern __shared__ unsigned long long submap_block_num_successes[]; @@ -368,13 +371,13 @@ template -__global__ void find(InputIt first, - InputIt last, - OutputIt output_begin, - viewT* submap_views, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) +CUCO_KERNEL void find(InputIt first, + InputIt last, + OutputIt output_begin, + viewT* submap_views, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) { auto tid = blockDim.x * blockIdx.x + threadIdx.x; auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel(); @@ -443,13 +446,13 @@ template -__global__ void find(InputIt first, - InputIt last, - OutputIt output_begin, - viewT* submap_views, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) +CUCO_KERNEL void find(InputIt first, + InputIt last, + OutputIt output_begin, + viewT* submap_views, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) { auto tile = cg::tiled_partition(cg::this_thread_block()); auto tid = blockDim.x * blockIdx.x + threadIdx.x; @@ -514,13 +517,13 @@ template -__global__ void contains(InputIt first, - InputIt last, - OutputIt output_begin, - viewT* submap_views, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) +CUCO_KERNEL void contains(InputIt first, + InputIt last, + OutputIt output_begin, + viewT* submap_views, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) { auto tid = blockDim.x * blockIdx.x + threadIdx.x; __shared__ bool writeBuffer[block_size]; @@ -582,13 +585,13 @@ template -__global__ void contains(InputIt first, - InputIt last, - OutputIt output_begin, - viewT* submap_views, - uint32_t num_submaps, - Hash hash, - KeyEqual key_equal) +CUCO_KERNEL void contains(InputIt first, + InputIt last, + OutputIt output_begin, + viewT* submap_views, + uint32_t num_submaps, + Hash hash, + KeyEqual key_equal) { auto tile = cg::tiled_partition(cg::this_thread_block()); auto tid = blockDim.x * blockIdx.x + threadIdx.x; diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index 6772014df..7aff8a1c2 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -27,6 +27,7 @@ namespace cuco { namespace detail { +CUCO_SUPPRESS_KERNEL_WARNINGS /** * @brief Inserts all elements in the range `[first, first + n)` and returns the number of @@ -61,12 +62,12 @@ template -__global__ void insert_if_n(InputIt first, - cuco::detail::index_type n, - StencilIt stencil, - Predicate pred, - AtomicT* num_successes, - Ref ref) +CUCO_KERNEL void insert_if_n(InputIt first, + cuco::detail::index_type n, + StencilIt stencil, + Predicate pred, + AtomicT* num_successes, + Ref ref) { using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -127,7 +128,7 @@ template -__global__ void insert_if_n( +CUCO_KERNEL void insert_if_n( InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref) { auto const loop_stride = cuco::detail::grid_stride() / CGSize; @@ -162,7 +163,7 @@ __global__ void insert_if_n( * @param ref Non-owning container device ref used to access the slot storage */ template -__global__ void erase(InputIt first, cuco::detail::index_type n, Ref ref) +CUCO_KERNEL void erase(InputIt first, cuco::detail::index_type n, Ref ref) { auto const loop_stride = cuco::detail::grid_stride() / CGSize; auto idx = cuco::detail::global_thread_id() / CGSize; @@ -212,12 +213,12 @@ template -__global__ void contains_if_n(InputIt first, - cuco::detail::index_type n, - StencilIt stencil, - Predicate pred, - OutputIt output_begin, - Ref ref) +CUCO_KERNEL void contains_if_n(InputIt first, + cuco::detail::index_type n, + StencilIt stencil, + Predicate pred, + OutputIt output_begin, + Ref ref) { namespace cg = cooperative_groups; @@ -267,7 +268,7 @@ __global__ void contains_if_n(InputIt first, * @param count Number of filled slots */ template -__global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count) +CUCO_KERNEL void size(StorageRef storage, Predicate is_filled, AtomicT* count) { using size_type = typename StorageRef::size_type; @@ -293,9 +294,9 @@ __global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count) } template -__global__ void rehash(typename ContainerRef::storage_ref_type storage_ref, - ContainerRef container_ref, - Predicate is_filled) +CUCO_KERNEL void rehash(typename ContainerRef::storage_ref_type storage_ref, + ContainerRef container_ref, + Predicate is_filled) { namespace cg = cooperative_groups; diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index 76923f858..6e034567b 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -28,6 +28,7 @@ namespace cuco { namespace static_map_ns { namespace detail { +CUCO_SUPPRESS_KERNEL_WARNINGS /** * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to @@ -48,7 +49,7 @@ namespace detail { * @param ref Non-owning container device ref used to access the slot storage */ template -__global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref ref) +CUCO_KERNEL void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref ref) { auto const loop_stride = cuco::detail::grid_stride() / CGSize; auto idx = cuco::detail::global_thread_id() / CGSize; @@ -87,7 +88,7 @@ __global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref * @param ref Non-owning map device ref used to access the slot storage */ template -__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) { namespace cg = cooperative_groups; diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index aa4f56daa..3d2a9cfd3 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -24,6 +24,7 @@ namespace cuco::legacy::detail { namespace cg = cooperative_groups; +CUCO_SUPPRESS_KERNEL_WARNINGS /** * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`. * @@ -47,7 +48,7 @@ template -__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) +CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) { int64_t const loop_stride = gridDim.x * block_size; int64_t idx = block_size * blockIdx.x + threadIdx.x; @@ -85,7 +86,7 @@ template -__global__ void insert( +CUCO_KERNEL void insert( InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; @@ -140,7 +141,7 @@ template -__global__ void insert( +CUCO_KERNEL void insert( InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; @@ -194,7 +195,7 @@ template -__global__ void erase( +CUCO_KERNEL void erase( InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { using BlockReduce = cub::BlockReduce; @@ -247,7 +248,7 @@ template -__global__ void erase( +CUCO_KERNEL void erase( InputIt first, int64_t n, atomicT* num_successes, viewT view, Hash hash, KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; @@ -311,14 +312,14 @@ template -__global__ void insert_if_n(InputIt first, - int64_t n, - atomicT* num_successes, - viewT view, - StencilIt stencil, - Predicate pred, - Hash hash, - KeyEqual key_equal) +CUCO_KERNEL void insert_if_n(InputIt first, + int64_t n, + atomicT* num_successes, + viewT view, + StencilIt stencil, + Predicate pred, + Hash hash, + KeyEqual key_equal) { typedef cub::BlockReduce BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; @@ -375,7 +376,7 @@ template -__global__ void find( +CUCO_KERNEL void find( InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { int64_t const loop_stride = gridDim.x * block_size; @@ -437,7 +438,7 @@ template -__global__ void find( +CUCO_KERNEL void find( InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { auto tile = cg::tiled_partition(cg::this_thread_block()); @@ -494,7 +495,7 @@ template -__global__ void contains( +CUCO_KERNEL void contains( InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { int64_t const loop_stride = gridDim.x * block_size; @@ -551,7 +552,7 @@ template -__global__ void contains( +CUCO_KERNEL void contains( InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal) { auto tile = cg::tiled_partition(cg::this_thread_block()); diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index 67fb36045..5036c3f7f 100644 --- a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -29,6 +30,7 @@ namespace cuco { namespace detail { namespace cg = cooperative_groups; +CUCO_SUPPRESS_KERNEL_WARNINGS /** * @brief Initializes each slot in the flat `slots` storage to contain `k` and `v`. * @@ -51,7 +53,7 @@ template -__global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) +CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64_t size) { int64_t const loop_stride = gridDim.x * blockDim.x; int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; @@ -82,7 +84,7 @@ __global__ void initialize(pair_atomic_type* const slots, Key k, Value v, int64_ * @param view Mutable device view used to access the hash map's slot storage */ template -__global__ void insert(InputIt first, int64_t n, viewT view) +CUCO_KERNEL void insert(InputIt first, int64_t n, viewT view) { auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / tile_size; @@ -130,7 +132,7 @@ template -__global__ void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred) +CUCO_KERNEL void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred) { auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / tile_size; @@ -177,7 +179,7 @@ template -__global__ void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal) +CUCO_KERNEL void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal) { auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / tile_size; @@ -235,7 +237,7 @@ template -__global__ void count( +CUCO_KERNEL void count( InputIt first, int64_t n, atomicT* num_matches, viewT view, KeyEqual key_equal) { auto tile = cg::tiled_partition(cg::this_thread_block()); @@ -294,7 +296,7 @@ template -__global__ void pair_count( +CUCO_KERNEL void pair_count( InputIt first, int64_t n, atomicT* num_matches, viewT view, PairEqual pair_equal) { auto tile = cg::tiled_partition(cg::this_thread_block()); @@ -363,12 +365,12 @@ template -__global__ void retrieve(InputIt first, - int64_t n, - OutputIt output_begin, - atomicT* num_matches, - viewT view, - KeyEqual key_equal) +CUCO_KERNEL void retrieve(InputIt first, + int64_t n, + OutputIt output_begin, + atomicT* num_matches, + viewT view, + KeyEqual key_equal) { using pair_type = typename viewT::value_type; @@ -476,13 +478,13 @@ template -__global__ void pair_retrieve(InputIt first, - int64_t n, - OutputIt1 probe_output_begin, - OutputIt2 contained_output_begin, - atomicT* num_matches, - viewT view, - PairEqual pair_equal) +CUCO_KERNEL void pair_retrieve(InputIt first, + int64_t n, + OutputIt1 probe_output_begin, + OutputIt2 contained_output_begin, + atomicT* num_matches, + viewT view, + PairEqual pair_equal) { using pair_type = typename viewT::value_type; @@ -549,6 +551,5 @@ __global__ void pair_retrieve(InputIt first, contained_output_begin); } } - } // namespace detail } // namespace cuco diff --git a/include/cuco/detail/static_set/kernels.cuh b/include/cuco/detail/static_set/kernels.cuh index dce4dc8ce..cce11b1b5 100644 --- a/include/cuco/detail/static_set/kernels.cuh +++ b/include/cuco/detail/static_set/kernels.cuh @@ -30,6 +30,7 @@ namespace cuco { namespace static_set_ns { namespace detail { +CUCO_SUPPRESS_KERNEL_WARNINGS /** * @brief Finds the equivalent set elements of all keys in the range `[first, last)`. * @@ -50,7 +51,7 @@ namespace detail { * @param ref Non-owning set device ref used to access the slot storage */ template -__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) +CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) { namespace cg = cooperative_groups; diff --git a/include/cuco/detail/storage/kernels.cuh b/include/cuco/detail/storage/kernels.cuh index 6b3b10260..55e73bb6f 100644 --- a/include/cuco/detail/storage/kernels.cuh +++ b/include/cuco/detail/storage/kernels.cuh @@ -22,6 +22,8 @@ namespace cuco { namespace detail { +CUCO_SUPPRESS_KERNEL_WARNINGS + /** * @brief Initializes each slot in the window storage to contain `value`. * @@ -32,9 +34,9 @@ namespace detail { * @param value Value to which all values in `slots` are initialized */ template -__global__ void initialize(WindowT* windows, - cuco::detail::index_type n, - typename WindowT::value_type value) +CUCO_KERNEL void initialize(WindowT* windows, + cuco::detail::index_type n, + typename WindowT::value_type value) { auto const loop_stride = cuco::detail::grid_stride(); auto idx = cuco::detail::global_thread_id(); diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh index c92ab60b2..c08041693 100644 --- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -26,6 +26,7 @@ namespace cuco { namespace experimental { namespace detail { +CUCO_SUPPRESS_KERNEL_WARNINGS /* * @brief Test bits for a range of keys * @@ -41,10 +42,10 @@ namespace detail { * @param num_keys Number of input keys */ template -__global__ void bitset_test_kernel(BitsetRef ref, - KeyIt keys, - OutputIt outputs, - cuco::detail::index_type num_keys) +CUCO_KERNEL void bitset_test_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) { auto key_id = cuco::detail::global_thread_id(); auto const stride = cuco::detail::grid_stride(); @@ -70,10 +71,10 @@ __global__ void bitset_test_kernel(BitsetRef ref, * @param num_keys Number of input keys */ template -__global__ void bitset_rank_kernel(BitsetRef ref, - KeyIt keys, - OutputIt outputs, - cuco::detail::index_type num_keys) +CUCO_KERNEL void bitset_rank_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) { auto key_id = cuco::detail::global_thread_id(); auto const stride = cuco::detail::grid_stride(); @@ -99,10 +100,10 @@ __global__ void bitset_rank_kernel(BitsetRef ref, * @param num_keys Number of input keys */ template -__global__ void bitset_select_kernel(BitsetRef ref, - KeyIt keys, - OutputIt outputs, - cuco::detail::index_type num_keys) +CUCO_KERNEL void bitset_select_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) { auto key_id = cuco::detail::global_thread_id(); auto const stride = cuco::detail::grid_stride(); @@ -125,10 +126,10 @@ __global__ void bitset_select_kernel(BitsetRef ref, * @param flip_bits Boolean to request negation of words before counting bits */ template -__global__ void bit_counts_kernel(WordType const* words, - SizeType* bit_counts, - cuco::detail::index_type num_words, - bool flip_bits) +CUCO_KERNEL void bit_counts_kernel(WordType const* words, + SizeType* bit_counts, + cuco::detail::index_type num_words, + bool flip_bits) { auto word_id = cuco::detail::global_thread_id(); auto const stride = cuco::detail::grid_stride(); @@ -157,11 +158,11 @@ __global__ void bit_counts_kernel(WordType const* words, * @param words_per_block Number of words in each block */ template -__global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts, - rank* ranks, - SizeType num_words, - SizeType num_blocks, - SizeType words_per_block) +CUCO_KERNEL void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts, + rank* ranks, + SizeType num_words, + SizeType num_blocks, + SizeType words_per_block) { auto rank_id = cuco::detail::global_thread_id(); auto const stride = cuco::detail::grid_stride(); @@ -200,11 +201,11 @@ __global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_c * @param bits_per_block Number of bits in each block */ template -__global__ void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts, - SizeType* select_markers, - SizeType num_blocks, - SizeType words_per_block, - SizeType bits_per_block) +CUCO_KERNEL void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts, + SizeType* select_markers, + SizeType num_blocks, + SizeType words_per_block, + SizeType bits_per_block) { auto block_id = cuco::detail::global_thread_id(); auto const stride = cuco::detail::grid_stride(); diff --git a/include/cuco/detail/utility/cuda.cuh b/include/cuco/detail/utility/cuda.cuh index 6e5f13ff7..d4838d8da 100644 --- a/include/cuco/detail/utility/cuda.cuh +++ b/include/cuco/detail/utility/cuda.cuh @@ -17,6 +17,22 @@ #include +#if defined(CUCO_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION) +#define CUCO_SUPPRESS_KERNEL_WARNINGS +#elif defined(__NVCC__) && (defined(__GNUC__) || defined(__clang__)) +// handle when nvcc is the CUDA compiler and gcc or clang is host +#define CUCO_SUPPRESS_KERNEL_WARNINGS _Pragma("nv_diag_suppress 1407") +_Pragma("GCC diagnostic ignored \"-Wattributes\"") +#elif defined(__clang__) +// handle when clang is the CUDA compiler +#define CUCO_SUPPRESS_KERNEL_WARNINGS _Pragma("clang diagnostic ignored \"-Wattributes\"") +#elif defined(__NVCOMPILER) +#define CUCO_SUPPRESS_KERNEL_WARNINGS #pragma diag_suppress attribute_requires_external_linkage +#endif + +#ifndef CUCO_KERNEL +#define CUCO_KERNEL __attribute__((visibility("hidden"))) __global__ +#endif namespace cuco { namespace detail {