Minor compare exchange optimization #1001

Open
gevtushenko opened this issue Oct 17, 2022 · 3 comments
Labels
libcu++ For all items related to libcu++

Comments

@gevtushenko
Collaborator

Currently, compare_exchange_strong is using __stronger_order_cuda:

inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) {
    int const __max = __a > __b ? __a : __b;
    if(__max != __ATOMIC_RELEASE)
        return __max;
    static int const __xform[] = {
        __ATOMIC_RELEASE,
        __ATOMIC_ACQ_REL,
        __ATOMIC_ACQ_REL,
        __ATOMIC_RELEASE };
    return __xform[__a < __b ? __a : __b];
}

The static lookup table above leads to actual memory loads when the ordering is only known at run time. We can consider the following optimization:

inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) {
    int const __max = __a > __b ? __a : __b;
    if(__max != __ATOMIC_RELEASE)
        return __max;
    const int __min = __a < __b ? __a : __b;
    return __ATOMIC_ACQ_REL - ((__min & 1) == (__min >> 1));
}

The change yields about 4% better compare-exchange performance on a mobile RTX 3070 Ti when the memory ordering is not known at compile time:

[benchmark plot: switch_vs_array]

When the memory ordering is known at compile time, there is no difference in the generated SASS between the two versions. Here's the benchmark:

#include <iostream>
#include <cuda/atomic>

constexpr int threads_in_block = 1024;

__launch_bounds__(threads_in_block)
__global__ void kernel(int *ptr, int target, cuda::memory_order success, cuda::memory_order failure) {
  __shared__ int cache;

  int expected = -1;
  if (threadIdx.x == target) {
    cache = expected;
  }
  __syncthreads();

  cuda::atomic_ref<int, cuda::thread_scope_block> ref(cache);

  if (ref.compare_exchange_strong(expected, threadIdx.x, success, failure)) {
    ptr[blockIdx.x] = threadIdx.x;
  }
}

int main() {
  int blocks_in_grid = 256 * 1024;
  int n = blocks_in_grid;

  int *ptr{};
  cudaMalloc(&ptr, sizeof(int) * n);
  cudaMemset(ptr, 0, sizeof(int) * n);

  cudaEvent_t begin, end;
  cudaEventCreate(&begin);
  cudaEventCreate(&end);

  cudaEventRecord(begin);
  kernel<<<blocks_in_grid, threads_in_block>>>(ptr, 0, cuda::memory_order_release, cuda::memory_order_relaxed);
  cudaEventRecord(end);
  cudaEventSynchronize(end);

  float ms{};
  cudaEventElapsedTime(&ms, begin, end);

  std::cout << ms << std::endl;

  cudaEventDestroy(end);
  cudaEventDestroy(begin);

  cudaFree(ptr);
}
nvcc -gencode arch=compute_86,code=sm_86 -std=c++17 -DNDEBUG -O3 main.cu
@miscco
Collaborator

miscco commented Oct 17, 2022

I am wondering whether we can reuse the comparison? That said, the code is definitely much worse

inline __host__ __device__ int __stronger_order_cuda(int __a, int __b) {
    if (__a > __b) {
        if (__a != __ATOMIC_RELEASE) {
            return __a;
        }
        return __ATOMIC_ACQ_REL - ((__b & 1) == (__b >> 1));
    }

    if (__b != __ATOMIC_RELEASE) {
        return __b;
    }
    return __ATOMIC_ACQ_REL - ((__a & 1) == (__a >> 1));
}

@jrhemstad
Collaborator

Minor optimizations you say? Count me in!

How about this?

#include <cuda/std/utility>

__device__
int __stronger_order_cuda(int __a, int __b) {
  using __min_max_t = cuda::std::pair<unsigned, unsigned>;

  __min_max_t const __min_max = __a > __b
                                ? __min_max_t{__b, __a}
                                : __min_max_t{__a, __b};

  return (__min_max.second == __ATOMIC_RELEASE)
          ? (__ATOMIC_ACQ_REL - ((__min_max.first % 2) == (__min_max.first / 2)))
          : __min_max.second;
}

@miscco
Collaborator

miscco commented Oct 17, 2022

Minor optimizations you say? Count me in!

How about this?

Nothing better than nerdsniping 😹

I am slightly worried that the new pair object affects stack space, and also that the constructor of pair is non-trivial.

That said, I need to put it into godbolt.

@jrhemstad jrhemstad added thrust For all items related to Thrust. libcu++ For all items related to libcu++ and removed thrust For all items related to Thrust. labels Feb 22, 2023
@jrhemstad jrhemstad removed their assignment Mar 7, 2023
@jarmak-nv jarmak-nv transferred this issue from NVIDIA/libcudacxx Nov 8, 2023