Skip to content

Commit

Permalink
Mark all cuco kernels as static so they have hidden visibility (#422)
Browse files Browse the repository at this point in the history
This marks all kernels in CUCO as `static` so that they have internal
linkage and won't conflict when used by multiple DSOs.

I didn't see a single shared/common header in cuco where I could place a
`CUCO_KERNEL` macro so I modified each instance instead.
While `cccl` went with a `__attribute__ ((visibility ("hidden")))`
approach to help reduce RDC size, this approach seemed very invasive for
cuco. This is due to the fact that we would need to pragma push and pop
both gcc warnings and nvcc warnings in each cuco header so that we don't
introduce any warnings. This is needed as the compiler incorrectly state
that the `__attribute__ ((visibility ("hidden")))` has no side-effect.

Context:
rapidsai/cudf#14726
NVIDIA/cccl#166
rapidsai/raft#1722

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Yunsong Wang <yunsongw@nvidia.com>
  • Loading branch information
3 people authored Jan 19, 2024
1 parent f2af321 commit 75c9613
Show file tree
Hide file tree
Showing 9 changed files with 174 additions and 147 deletions.
123 changes: 63 additions & 60 deletions include/cuco/detail/dynamic_map_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/
#pragma once
#include <cuco/detail/utility/cuda.cuh>

#include <cub/block/block_reduce.cuh>

Expand All @@ -25,6 +26,8 @@ namespace cuco {
namespace detail {
namespace cg = cooperative_groups;

CUCO_SUPPRESS_KERNEL_WARNINGS

/**
* @brief Inserts all key/value pairs in the range `[first, last)`.
*
Expand Down Expand Up @@ -62,15 +65,15 @@ template <uint32_t block_size,
typename atomicT,
typename Hash,
typename KeyEqual>
__global__ void insert(InputIt first,
InputIt last,
viewT* submap_views,
mutableViewT* submap_mutable_views,
atomicT* num_successes,
uint32_t insert_idx,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void insert(InputIt first,
InputIt last,
viewT* submap_views,
mutableViewT* submap_mutable_views,
atomicT* num_successes,
uint32_t insert_idx,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
__shared__ typename BlockReduce::TempStorage temp_storage;
Expand Down Expand Up @@ -147,15 +150,15 @@ template <uint32_t block_size,
typename atomicT,
typename Hash,
typename KeyEqual>
__global__ void insert(InputIt first,
InputIt last,
viewT* submap_views,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t insert_idx,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void insert(InputIt first,
InputIt last,
viewT* submap_views,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t insert_idx,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
__shared__ typename BlockReduce::TempStorage temp_storage;
Expand Down Expand Up @@ -225,13 +228,13 @@ template <uint32_t block_size,
typename atomicT,
typename Hash,
typename KeyEqual>
__global__ void erase(InputIt first,
InputIt last,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void erase(InputIt first,
InputIt last,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
extern __shared__ unsigned long long submap_block_num_successes[];

Expand Down Expand Up @@ -296,13 +299,13 @@ template <uint32_t block_size,
typename atomicT,
typename Hash,
typename KeyEqual>
__global__ void erase(InputIt first,
InputIt last,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void erase(InputIt first,
InputIt last,
mutableViewT* submap_mutable_views,
atomicT** submap_num_successes,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
extern __shared__ unsigned long long submap_block_num_successes[];

Expand Down Expand Up @@ -368,13 +371,13 @@ template <uint32_t block_size,
typename viewT,
typename Hash,
typename KeyEqual>
__global__ void find(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void find(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel();
Expand Down Expand Up @@ -443,13 +446,13 @@ template <uint32_t block_size,
typename viewT,
typename Hash,
typename KeyEqual>
__global__ void find(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void find(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
Expand Down Expand Up @@ -514,13 +517,13 @@ template <uint32_t block_size,
typename viewT,
typename Hash,
typename KeyEqual>
__global__ void contains(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void contains(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
__shared__ bool writeBuffer[block_size];
Expand Down Expand Up @@ -582,13 +585,13 @@ template <uint32_t block_size,
typename viewT,
typename Hash,
typename KeyEqual>
__global__ void contains(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
CUCO_KERNEL void contains(InputIt first,
InputIt last,
OutputIt output_begin,
viewT* submap_views,
uint32_t num_submaps,
Hash hash,
KeyEqual key_equal)
{
auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
auto tid = blockDim.x * blockIdx.x + threadIdx.x;
Expand Down
37 changes: 19 additions & 18 deletions include/cuco/detail/open_addressing/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

namespace cuco {
namespace detail {
CUCO_SUPPRESS_KERNEL_WARNINGS

/**
* @brief Inserts all elements in the range `[first, first + n)` and returns the number of
Expand Down Expand Up @@ -61,12 +62,12 @@ template <int32_t CGSize,
typename Predicate,
typename AtomicT,
typename Ref>
__global__ void insert_if_n(InputIt first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
AtomicT* num_successes,
Ref ref)
CUCO_KERNEL void insert_if_n(InputIt first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
AtomicT* num_successes,
Ref ref)
{
using BlockReduce = cub::BlockReduce<typename Ref::size_type, BlockSize>;
__shared__ typename BlockReduce::TempStorage temp_storage;
Expand Down Expand Up @@ -127,7 +128,7 @@ template <int32_t CGSize,
typename StencilIt,
typename Predicate,
typename Ref>
__global__ void insert_if_n(
CUCO_KERNEL void insert_if_n(
InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
{
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
Expand Down Expand Up @@ -162,7 +163,7 @@ __global__ void insert_if_n(
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Ref>
__global__ void erase(InputIt first, cuco::detail::index_type n, Ref ref)
CUCO_KERNEL void erase(InputIt first, cuco::detail::index_type n, Ref ref)
{
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;
Expand Down Expand Up @@ -212,12 +213,12 @@ template <int32_t CGSize,
typename Predicate,
typename OutputIt,
typename Ref>
__global__ void contains_if_n(InputIt first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
OutputIt output_begin,
Ref ref)
CUCO_KERNEL void contains_if_n(InputIt first,
cuco::detail::index_type n,
StencilIt stencil,
Predicate pred,
OutputIt output_begin,
Ref ref)
{
namespace cg = cooperative_groups;

Expand Down Expand Up @@ -267,7 +268,7 @@ __global__ void contains_if_n(InputIt first,
* @param count Number of filled slots
*/
template <int32_t BlockSize, typename StorageRef, typename Predicate, typename AtomicT>
__global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
CUCO_KERNEL void size(StorageRef storage, Predicate is_filled, AtomicT* count)
{
using size_type = typename StorageRef::size_type;

Expand All @@ -293,9 +294,9 @@ __global__ void size(StorageRef storage, Predicate is_filled, AtomicT* count)
}

template <int32_t BlockSize, typename ContainerRef, typename Predicate>
__global__ void rehash(typename ContainerRef::storage_ref_type storage_ref,
ContainerRef container_ref,
Predicate is_filled)
CUCO_KERNEL void rehash(typename ContainerRef::storage_ref_type storage_ref,
ContainerRef container_ref,
Predicate is_filled)
{
namespace cg = cooperative_groups;

Expand Down
5 changes: 3 additions & 2 deletions include/cuco/detail/static_map/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
namespace cuco {
namespace static_map_ns {
namespace detail {
CUCO_SUPPRESS_KERNEL_WARNINGS

/**
* @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
Expand All @@ -48,7 +49,7 @@ namespace detail {
* @param ref Non-owning container device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Ref>
__global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref ref)
CUCO_KERNEL void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref ref)
{
auto const loop_stride = cuco::detail::grid_stride() / CGSize;
auto idx = cuco::detail::global_thread_id() / CGSize;
Expand Down Expand Up @@ -87,7 +88,7 @@ __global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref
* @param ref Non-owning map device ref used to access the slot storage
*/
template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename OutputIt, typename Ref>
__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
CUCO_KERNEL void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
{
namespace cg = cooperative_groups;

Expand Down
Loading

0 comments on commit 75c9613

Please sign in to comment.