Add initial support for whole-array reduction on NVIDIA GPUs (#23689)
Addresses Step 1 in #23324
Resolves Cray/chapel-private#5513

This PR adds several `gpu*Reduce` functions that perform whole-array
reductions for arrays allocated in GPU memory. The functions cover Chapel's
default reductions; a short usage sketch follows the list:

- `gpuSumReduce`
- `gpuMinReduce`
- `gpuMaxReduce`
- `gpuMinLocReduce`
- `gpuMaxLocReduce`
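
Concretely, usage looks like this (a sketch mirroring the documentation examples added to `GPU.chpl` in this PR):

```chapel
use GPU;

on here.gpus[0] {
  var Arr = [3, 2, 1, 5, 4];     // allocated in GPU memory under this `on` block
  writeln(gpuSumReduce(Arr));    // 15
  writeln(gpuMinLocReduce(Arr)); // (2, 1) -- index and value of the first minimum
}
```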

### NVIDIA implementation

This is done by wrapping CUB: https://nvlabs.github.io/cub/. CUB is a
C++ header-only template library. We wrap it with some macro magic in
the runtime. This currently increases the runtime build time by quite a bit.
We might consider wrapping the library's functions in non-inline helpers,
which could help somewhat.
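
In rough terms, the wrapping looks like the following (a sketch only; the macro name and body here are hypothetical, the real code lives in `gpu/nvidia/gpu-nvidia-reduce.cc`, and the signatures are the ones declared in `chpl-gpu-impl.h`):

```cpp
// Illustrative sketch, not the actual runtime source: each (kind, type) pair
// becomes one C-linkage entry point that calls CUB's two-phase DeviceReduce.
#include <cub/cub.cuh>

#define DEF_ONE_REDUCE_IMPL(cub_kind, chpl_kind, data_type) \
extern "C" void chpl_gpu_impl_##chpl_kind##_reduce_##data_type( \
    data_type* data, int n, data_type* val, int* idx, void* stream) { \
  (void)idx; /* only used by the minloc/maxloc variants */ \
  data_type* result; \
  cudaMalloc((void**)&result, sizeof(data_type)); \
  void*  temp = nullptr; \
  size_t temp_bytes = 0; \
  /* first call only computes the scratch-space size... */ \
  cub::DeviceReduce::cub_kind(temp, temp_bytes, data, result, n, \
                              (cudaStream_t)stream); \
  cudaMalloc(&temp, temp_bytes); \
  /* ...second call performs the reduction on the given stream */ \
  cub::DeviceReduce::cub_kind(temp, temp_bytes, data, result, n, \
                              (cudaStream_t)stream); \
  cudaMemcpy(val, result, sizeof(data_type), cudaMemcpyDeviceToHost); \
  cudaFree(result); \
  cudaFree(temp); \
}

DEF_ONE_REDUCE_IMPL(Sum, sum, double)  // => chpl_gpu_impl_sum_reduce_double
DEF_ONE_REDUCE_IMPL(Min, min, double)  // => chpl_gpu_impl_min_reduce_double
```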

### AMD implementation

AMD has hipCUB: https://rocm.docs.amd.com/projects/hipCUB/en/latest/ and
a slightly lower-level, AMD-only rocPRIM:
https://rocm.docs.amd.com/projects/rocPRIM/en/latest/. I couldn't get
either to work. I can't run a simple HIP reproducer based on one of
their tests. I might be doing something wrong in compilation, but what I
am getting is a segfault in the launched kernel (or in `hipLaunchKernel`).
I filed ROCm/hipCUB#304, but haven't received a response quickly enough
to address the issue in this PR.
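
For reference, the reproducer was roughly of this shape (a sketch assuming hipCUB's CUB-compatible `DeviceReduce` interface, not the exact test):

```cpp
// Minimal hipCUB sum-reduction sketch; a program of roughly this shape was
// segfaulting in the launched kernel on the AMD setup described above.
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 1024;
  int *d_in, *d_out;
  hipMalloc((void**)&d_in, n * sizeof(int));
  hipMalloc((void**)&d_out, sizeof(int));
  hipMemset(d_in, 0, n * sizeof(int)); // reduce an all-zero array for brevity

  void*  temp = nullptr;
  size_t temp_bytes = 0;
  // first call sizes the scratch buffer, second call runs the reduction
  hipcub::DeviceReduce::Sum(temp, temp_bytes, d_in, d_out, n);
  hipMalloc(&temp, temp_bytes);
  hipcub::DeviceReduce::Sum(temp, temp_bytes, d_in, d_out, n);

  int result = -1;
  hipMemcpy(&result, d_out, sizeof(int), hipMemcpyDeviceToHost);
  printf("sum = %d\n", result); // expect 0

  hipFree(temp); hipFree(d_in); hipFree(d_out);
  return 0;
}
```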

This is really unfortunate, but maybe we'll have better/native reduction
support soon, and we can cover AMD there, too.

### Implementation details

- `chpl_gpu_X_reduce_Y` functions are added to the main runtime
interface via macros, where X is the reduction kind and Y is the data type.
These functions print debugging output, find the stream to run the reduction
on, and call their `impl` cousins (a concrete expansion example follows this
list).
- `chpl_gpu_impl_X_reduce_Y` functions are added to the implementation layer
in a similar fashion.
- These functions are added to `gpu/Z/gpu-Z-reduce.cc` in the runtime,
where Z is either `nvidia` or `amd`.
- AMD versions are mostly "implemented" the way I think they should
work, but because of the segfaults that I was getting, they are in the
`else` branch of an `#if 1` at the moment.
- The module code has a private `doGpuReduce` that calls the appropriate
runtime function for any reduction type. This function has some
similarities to how atomics are implemented. Unfortunately the
interfaces are different enough that I can't come up with a good way to
refactor some of the helpers. All the reduction helpers are nested in
`doGpuReduce` to avoid confusion.
- To work around a CUB limitation that prevents reducing arrays whose
size is close to `max(int(32))`, the implementation runs the underlying
CUB routine on at most 2 billion elements at a time and stitches the
partial results together on the host if it ends up calling the routine
multiple times. The underlying issue is captured in:
  - https://github.com/NVIDIA/thrust/issues/1271
  - NVIDIA/cccl#49
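
For concreteness, here is what one generated prototype pair looks like after macro expansion, for the `sum` kind and the `double` element type (the other kinds and types follow the same pattern):

```c
/* Expansion of GPU_REDUCE(DECL_ONE_REDUCE, sum) in chpl-gpu.h, for double: */
void chpl_gpu_sum_reduce_double(double* data, int n, double* val, int* idx);

/* ...and of GPU_REDUCE(DECL_ONE_REDUCE_IMPL, sum) in chpl-gpu-impl.h: */
void chpl_gpu_impl_sum_reduce_double(double* data, int n,
                                     double* val, int* idx,
                                     void* stream);
```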

### Future work
- Keep an eye on the AMD bug report.
- Implement a fallback once we're ready to run an in-house reduction, if the
bug remains unresolved.

[Reviewed by @stonea]

### Test Status
- [x] nvidia
- [x] amd
- [x] flat `make check`
e-kayrakli authored Nov 6, 2023
2 parents 5554398 + 00ee967 commit befb7fb
Showing 33 changed files with 874 additions and 6 deletions.
237 changes: 237 additions & 0 deletions modules/standard/GPU.chpl
@@ -389,4 +389,241 @@ module GPU
/* When run on a GPU, atomically compare the value in 'x' and 'cmp'; if they
are equal, store 'val' in 'x'. The operation returns the old value of 'x'. */
inline proc gpuAtomicCAS( ref x : ?T, cmp : T, val : T) : T { return gpuAtomicTernOp("CAS", x, cmp, val); }

// ============================
// Reductions
// ============================

@chpldoc.nodoc
config param gpuDebugReduce = false;

private inline proc doGpuReduce(param op: string, const ref A: [] ?t) {
if op != "sum" && op != "min" && op != "max" &&
op != "minloc" && op != "maxloc" {

compilerError("Unexpected reduction kind in doGpuReduce: ", op);
}


if CHPL_GPU == "amd" {
compilerError("gpu*Reduce functions are not supported on AMD GPUs");
}
else if CHPL_GPU == "cpu" {
select op {
when "sum" do return + reduce A;
when "min" do return min reduce A;
when "max" do return max reduce A;
when "minloc" do return minloc reduce zip (A.domain, A);
when "maxloc" do return maxloc reduce zip (A.domain, A);
otherwise do compilerError("Unknown reduction operation: ", op);
}
}
else {
compilerAssert(CHPL_GPU=="nvidia");
}


proc chplTypeToCTypeName(type t) param {
select t {
when int(8) do return "int8_t";
when int(16) do return "int16_t";
when int(32) do return "int32_t";
when int(64) do return "int64_t";
when uint(8) do return "uint8_t";
when uint(16) do return "uint16_t";
when uint(32) do return "uint32_t";
when uint(64) do return "uint64_t";
when real(32) do return "float";
when real(64) do return "double";
otherwise do
compilerError("Arrays with ", t:string, " elements cannot be reduced");
}
return "unknown";
}

proc getExternFuncName(param op: string, type t) param: string {
return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t);
}

proc isValReduce(param op) param {
return op=="sum" || op=="min" || op=="max";
}

proc isValIdxReduce(param op) param {
return op=="minloc" || op=="maxloc";
}

inline proc subReduceValIdx(param op, const baseOffset, ref accum, val) {
// do some type checking to be safe
compilerAssert(isTupleValue(val));
if isTupleValue(accum) {
compilerAssert(isValIdxReduce(op));
compilerAssert(val[1].type == accum[1].type);

}
else {
compilerAssert(isValReduce(op));
compilerAssert(val[1].type == accum.type);
}

select op {
when "sum" do accum += val[1];
when "min" do accum = min(accum, val[1]);
when "max" do accum = max(accum, val[1]);
when "minloc" do
if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]);
when "maxloc" do
if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]);
otherwise do compilerError("Unknown reduction operation: ", op);
}
}

iter offsetsThatCanFitIn32Bits(size: int) {
// Engin: I've tried to get max(int(32)) to work as this bug is about CUB
// using `int` as the size in the interface. However, getting close to
// max(int(32)) also triggers the bug. So, I am choosing this as a
// round/safe value for the time being.
param chunkSize = 2_000_000_000;

use Math only divCeil;
const numChunks = divCeil(size, chunkSize);
const standardChunkSize = divCeil(size, numChunks);

if gpuDebugReduce then
writeln("Will use ", numChunks, " chunks of size ", standardChunkSize);

foreach chunk in 0..<numChunks {
const start = chunk*standardChunkSize;
const curChunkSize = if start+standardChunkSize <= size
then standardChunkSize
else size-start;
if gpuDebugReduce then
writef("Chunk %i: (start=%i, curChunkSize=%i) ", chunk, start,
curChunkSize);

yield (start, curChunkSize);
}
}

use CTypes;

// find the extern function we'll use
param externFunc = getExternFuncName(op, t);
extern externFunc proc reduce_fn(data, size, ref val, ref idx);

// initialize the return value
var ret;
if isValReduce(op) {
var retTmp: t;
if op == "min" then retTmp = max(t);
else if op == "max" then retTmp = min(t);
ret = retTmp;
}
else if isValIdxReduce(op) {
var retTmp: (int, t);
if op == "minloc" then retTmp[1] = max(t);
else if op == "maxloc" then retTmp[1] = min(t);
ret = retTmp;
}
else {
compilerError("Unknown reduction operation: ", op);
ret = 0;
}

// perform the reduction
const basePtr = c_ptrToConst(A);
for (offset,size) in offsetsThatCanFitIn32Bits(A.size) {
var curIdx: int(32) = -1; // should remain -1 for sum, min, max
var curVal: t;
reduce_fn(basePtr+offset, size, curVal, curIdx);
subReduceValIdx(op, offset, ret, (curIdx, curVal));
if gpuDebugReduce then
writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret);
}

if isValIdxReduce(op) then
ret[0] += A.domain.first;

return ret;
}

/*
Add all elements of an array together on the GPU (that is, perform a
sum-reduction). The array must be in GPU-accessible memory and the function
must be called from outside a GPU-eligible loop. Only arrays with int, uint,
and real types are supported. A simple example is the following:

.. code-block:: chapel

  on here.gpus[0] {
    var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
    writeln(gpuSumReduce(Arr)); // 15
  }
*/
inline proc gpuSumReduce(const ref A: [] ?t) do return doGpuReduce("sum", A);

/*
Return the minimum element of an array on the GPU (that is, perform a
min-reduction). The array must be in GPU-accessible memory and the function
must be called from outside a GPU-eligible loop. Only arrays with int, uint,
and real types are supported. A simple example is the following:

.. code-block:: chapel

  on here.gpus[0] {
    var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
    writeln(gpuMinReduce(Arr)); // 1
  }
*/
inline proc gpuMinReduce(const ref A: [] ?t) do return doGpuReduce("min", A);

/*
Return the maximum element of an array on the GPU (that is, perform a
max-reduction). The array must be in GPU-accessible memory and the function
must be called from outside a GPU-eligible loop. Only arrays with int, uint,
and real types are supported. A simple example is the following:

.. code-block:: chapel

  on here.gpus[0] {
    var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
    writeln(gpuMaxReduce(Arr)); // 5
  }
*/
inline proc gpuMaxReduce(const ref A: [] ?t) do return doGpuReduce("max", A);

/*
For an array on the GPU, return a tuple with the index and the value of the
minimum element (that is, perform a minloc-reduction). If there are multiple
elements with the same minimum value, the index of the first one is
returned. The array must be in GPU-accessible memory and the function must
be called from outside a GPU-eligible loop. Only arrays with int, uint, and
real types are supported. A simple example is the following:

.. code-block:: chapel

  on here.gpus[0] {
    var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
    writeln(gpuMinLocReduce(Arr)); // (2, 1). Note that Arr[2]==1.
  }
*/
inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A);

/*
For an array on the GPU, return a tuple with the index and the value of the
maximum element (that is, perform a maxloc-reduction). If there are multiple
elements with the same maximum value, the index of the first one is
returned. The array must be in GPU-accessible memory and the function must
be called from outside a GPU-eligible loop. Only arrays with int, uint, and
real types are supported. A simple example is the following:

.. code-block:: chapel

  on here.gpus[0] {
    var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
    writeln(gpuMaxLocReduce(Arr)); // (3, 5). Note that Arr[3]==5.
  }
*/
inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A);

}
12 changes: 12 additions & 0 deletions runtime/include/chpl-gpu-impl.h
@@ -76,6 +76,18 @@ void chpl_gpu_impl_stream_destroy(void* stream);
bool chpl_gpu_impl_stream_ready(void* stream);
void chpl_gpu_impl_stream_synchronize(void* stream);

#define DECL_ONE_REDUCE_IMPL(chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
void* stream);
GPU_REDUCE(DECL_ONE_REDUCE_IMPL, sum)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL, min)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL, max)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL, minloc)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL, maxloc)

#undef DECL_ONE_REDUCE_IMPL

#ifdef __cplusplus
}
#endif
14 changes: 14 additions & 0 deletions runtime/include/chpl-gpu.h
@@ -24,6 +24,7 @@
#include <stdbool.h>
#include "chpl-tasks.h"
#include "chpl-mem-desc.h"
#include "gpu/chpl-gpu-reduce-util.h"

#ifdef __cplusplus
extern "C" {
@@ -149,6 +150,19 @@ size_t chpl_gpu_get_alloc_size(void* ptr);
bool chpl_gpu_can_access_peer(int dev1, int dev2);
void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable);

#define DECL_ONE_REDUCE(chpl_kind, data_type) \
void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx);

GPU_REDUCE(DECL_ONE_REDUCE, sum);
GPU_REDUCE(DECL_ONE_REDUCE, min);
GPU_REDUCE(DECL_ONE_REDUCE, max);
GPU_REDUCE(DECL_ONE_REDUCE, minloc);
GPU_REDUCE(DECL_ONE_REDUCE, maxloc);

#undef DECL_ONE_REDUCE


#endif // HAS_GPU_LOCALE

#ifdef __cplusplus
47 changes: 47 additions & 0 deletions runtime/include/gpu/chpl-gpu-reduce-util.h
@@ -0,0 +1,47 @@
/*
* Copyright 2020-2023 Hewlett Packard Enterprise Development LP
* Copyright 2004-2019 Cray Inc.
* Other additional copyright holders may be indicated within.
*
* The entirety of this work is licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except
* in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifdef HAS_GPU_LOCALE

#define GPU_IMPL_REDUCE(MACRO, impl_kind, chpl_kind) \
MACRO(impl_kind, chpl_kind, int8_t) \
MACRO(impl_kind, chpl_kind, int16_t) \
MACRO(impl_kind, chpl_kind, int32_t) \
MACRO(impl_kind, chpl_kind, int64_t) \
MACRO(impl_kind, chpl_kind, uint8_t) \
MACRO(impl_kind, chpl_kind, uint16_t) \
MACRO(impl_kind, chpl_kind, uint32_t) \
MACRO(impl_kind, chpl_kind, uint64_t) \
MACRO(impl_kind, chpl_kind, float) \
MACRO(impl_kind, chpl_kind, double);

#define GPU_REDUCE(MACRO, chpl_kind) \
MACRO(chpl_kind, int8_t) \
MACRO(chpl_kind, int16_t) \
MACRO(chpl_kind, int32_t) \
MACRO(chpl_kind, int64_t) \
MACRO(chpl_kind, uint8_t) \
MACRO(chpl_kind, uint16_t) \
MACRO(chpl_kind, uint32_t) \
MACRO(chpl_kind, uint64_t) \
MACRO(chpl_kind, float) \
MACRO(chpl_kind, double);

#endif // HAS_GPU_LOCALE

29 changes: 29 additions & 0 deletions runtime/src/chpl-gpu.c
@@ -43,6 +43,8 @@ bool chpl_gpu_use_stream_per_task = true;
#include "chpl-env.h"
#include "chpl-comm-compiler-macros.h"

#include "gpu/chpl-gpu-reduce-util.h"

void chpl_gpu_init(void) {
chpl_gpu_impl_init(&chpl_gpu_num_devices);

@@ -700,4 +702,31 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) {
chpl_gpu_impl_set_peer_access(dev1, dev2, enable);
}

#define DEF_ONE_REDUCE(kind, data_type)\
void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \
data_type* val, int* idx) { \
CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \
\
int dev = chpl_task_getRequestedSubloc(); \
chpl_gpu_impl_use_device(dev); \
void* stream = get_stream(dev); \
\
chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, idx, stream); \
\
if (chpl_gpu_sync_with_host) { \
CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \
wait_stream(stream); \
} \
\
CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \
}

GPU_REDUCE(DEF_ONE_REDUCE, sum)
GPU_REDUCE(DEF_ONE_REDUCE, min)
GPU_REDUCE(DEF_ONE_REDUCE, max)
GPU_REDUCE(DEF_ONE_REDUCE, minloc)
GPU_REDUCE(DEF_ONE_REDUCE, maxloc)

#undef DEF_ONE_REDUCE

#endif
1 change: 1 addition & 0 deletions runtime/src/gpu/amd/Makefile.include
@@ -21,5 +21,6 @@ GPU_SUBDIR = src/gpu/amd
GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR)

ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c
ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc

include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share