Add initial support for whole-array reduction on NVIDIA GPUs (#23689)
Addresses Step 1 in #23324
Resolves Cray/chapel-private#5513

This PR adds several `gpu*Reduce` functions that perform whole-array
reductions for arrays allocated in GPU memory. The functions cover Chapel's
default reductions; a short usage sketch follows the list:

- `gpuSumReduce`
- `gpuMinReduce`
- `gpuMaxReduce`
- `gpuMinLocReduce`
- `gpuMaxLocReduce`
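
Concretely, usage looks like this (a sketch mirroring the documentation examples added to `GPU.chpl` in this PR):

```chapel
use GPU;

on here.gpus[0] {
  var Arr = [3, 2, 1, 5, 4];     // allocated in GPU memory under this `on` block
  writeln(gpuSumReduce(Arr));    // 15
  writeln(gpuMinLocReduce(Arr)); // (2, 1) -- index and value of the first minimum
}
```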

### NVIDIA implementation

This is done by wrapping CUB: https://nvlabs.github.io/cub/. CUB is a
C++ header-only template library. We wrap it with some macro magic in
the runtime. This currently increases the runtime build time by quite a bit.
We might consider wrapping the library's functions in non-inline helpers,
which could help somewhat.
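
In rough terms, the wrapping looks like the following (a sketch only; the macro name and body here are hypothetical, the real code lives in `gpu/nvidia/gpu-nvidia-reduce.cc`, and the signatures are the ones declared in `chpl-gpu-impl.h`):

```cpp
// Illustrative sketch, not the actual runtime source: each (kind, type) pair
// becomes one C-linkage entry point that calls CUB's two-phase DeviceReduce.
#include <cub/cub.cuh>

#define DEF_ONE_REDUCE_IMPL(cub_kind, chpl_kind, data_type) \
extern "C" void chpl_gpu_impl_##chpl_kind##_reduce_##data_type( \
    data_type* data, int n, data_type* val, int* idx, void* stream) { \
  (void)idx; /* only used by the minloc/maxloc variants */ \
  data_type* result; \
  cudaMalloc((void**)&result, sizeof(data_type)); \
  void*  temp = nullptr; \
  size_t temp_bytes = 0; \
  /* first call only computes the scratch-space size... */ \
  cub::DeviceReduce::cub_kind(temp, temp_bytes, data, result, n, \
                              (cudaStream_t)stream); \
  cudaMalloc(&temp, temp_bytes); \
  /* ...second call performs the reduction on the given stream */ \
  cub::DeviceReduce::cub_kind(temp, temp_bytes, data, result, n, \
                              (cudaStream_t)stream); \
  cudaMemcpy(val, result, sizeof(data_type), cudaMemcpyDeviceToHost); \
  cudaFree(result); \
  cudaFree(temp); \
}

DEF_ONE_REDUCE_IMPL(Sum, sum, double)  // => chpl_gpu_impl_sum_reduce_double
DEF_ONE_REDUCE_IMPL(Min, min, double)  // => chpl_gpu_impl_min_reduce_double
```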

### AMD implementation

AMD has hipCUB: https://rocm.docs.amd.com/projects/hipCUB/en/latest/ and
a slightly lower-level, AMD-only rocPRIM:
https://rocm.docs.amd.com/projects/rocPRIM/en/latest/. I couldn't get
either to work. I can't run a simple HIP reproducer based on one of
their tests. I might be doing something wrong in compilation, but what I
am getting is a segfault in the launched kernel (or in `hipLaunchKernel`).
I filed ROCm/hipCUB#304, but haven't received a response quickly enough
to address the issue in this PR.
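
For reference, the reproducer was roughly of this shape (a sketch assuming hipCUB's CUB-compatible `DeviceReduce` interface, not the exact test):

```cpp
// Minimal hipCUB sum-reduction sketch; a program of roughly this shape was
// segfaulting in the launched kernel on the AMD setup described above.
#include <hip/hip_runtime.h>
#include <hipcub/hipcub.hpp>
#include <cstdio>

int main() {
  const int n = 1024;
  int *d_in, *d_out;
  hipMalloc((void**)&d_in, n * sizeof(int));
  hipMalloc((void**)&d_out, sizeof(int));
  hipMemset(d_in, 0, n * sizeof(int)); // reduce an all-zero array for brevity

  void*  temp = nullptr;
  size_t temp_bytes = 0;
  // first call sizes the scratch buffer, second call runs the reduction
  hipcub::DeviceReduce::Sum(temp, temp_bytes, d_in, d_out, n);
  hipMalloc(&temp, temp_bytes);
  hipcub::DeviceReduce::Sum(temp, temp_bytes, d_in, d_out, n);

  int result = -1;
  hipMemcpy(&result, d_out, sizeof(int), hipMemcpyDeviceToHost);
  printf("sum = %d\n", result); // expect 0

  hipFree(temp); hipFree(d_in); hipFree(d_out);
  return 0;
}
```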

This is really unfortunate, but maybe we'll have better/native reduction
support soon, and we can cover AMD there, too.

### Implementation details

- `chpl_gpu_X_reduce_Y` functions are added to the main runtime
interface via macros, where X is the reduction kind and Y is the data type.
These functions print debugging output, find the stream to run the reduction
on, and call their `impl` cousins (a concrete expansion example follows this
list).
- `chpl_gpu_impl_X_reduce_Y` functions are added to the implementation layer
in a similar fashion.
- These functions are added to `gpu/Z/gpu-Z-reduce.cc` in the runtime,
where Z is either `nvidia` or `amd`.
- AMD versions are mostly "implemented" the way I think they should
work, but because of the segfaults that I was getting, they are in the
`else` branch of an `#if 1` at the moment.
- The module code has a private `doGpuReduce` that calls the appropriate
runtime function for any reduction type. This function has some
similarities to how atomics are implemented. Unfortunately the
interfaces are different enough that I can't come up with a good way to
refactor some of the helpers. All the reduction helpers are nested in
`doGpuReduce` to avoid confusion.
- To work around a CUB limitation that prevents reducing arrays whose
size is close to `max(int(32))`, the implementation runs the underlying
CUB routine on at most 2 billion elements at a time and stitches the
partial results together on the host if it ends up calling the routine
multiple times. The underlying issue is captured in:
  - https://github.com/NVIDIA/thrust/issues/1271
  - NVIDIA/cccl#49
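
For concreteness, here is what one generated prototype pair looks like after macro expansion, for the `sum` kind and the `double` element type (the other kinds and types follow the same pattern):

```c
/* Expansion of GPU_REDUCE(DECL_ONE_REDUCE, sum) in chpl-gpu.h, for double: */
void chpl_gpu_sum_reduce_double(double* data, int n, double* val, int* idx);

/* ...and of GPU_REDUCE(DECL_ONE_REDUCE_IMPL, sum) in chpl-gpu-impl.h: */
void chpl_gpu_impl_sum_reduce_double(double* data, int n,
                                     double* val, int* idx,
                                     void* stream);
```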

### Future work
- Keep an eye on the AMD bug report.
- Implement a fallback once we're ready to run an in-house reduction, if the
bug remains unresolved.

[Reviewed by @stonea]

### Test Status
- [x] nvidia
- [x] amd
- [x] flat `make check`
e-kayrakli authored Nov 6, 2023
2 parents 5554398 + 00ee967 commit befb7fb
Showing 33 changed files with 874 additions and 6 deletions.
237 changes: 237 additions & 0 deletions modules/standard/GPU.chpl
@@ -389,4 +389,241 @@ module GPU
/* When run on a GPU, atomically compare the value in 'x' and 'cmp'; if they
are equal, store 'val' in 'x'. The operation returns the old value of 'x'. */
inline proc gpuAtomicCAS( ref x : ?T, cmp : T, val : T) : T { return gpuAtomicTernOp("CAS", x, cmp, val); }

// ============================
// Reductions
// ============================

@chpldoc.nodoc
config param gpuDebugReduce = false;

private inline proc doGpuReduce(param op: string, const ref A: [] ?t) {
if op != "sum" && op != "min" && op != "max" &&
op != "minloc" && op != "maxloc" {

compilerError("Unexpected reduction kind in doGpuReduce: ", op);
}


if CHPL_GPU == "amd" {
compilerError("gpu*Reduce functions are not supported on AMD GPUs");
}
else if CHPL_GPU == "cpu" {
select op {
when "sum" do return + reduce A;
when "min" do return min reduce A;
when "max" do return max reduce A;
when "minloc" do return minloc reduce zip (A.domain, A);
when "maxloc" do return maxloc reduce zip (A.domain, A);
otherwise do compilerError("Unknown reduction operation: ", op);
}
}
else {
compilerAssert(CHPL_GPU=="nvidia");
}


proc chplTypeToCTypeName(type t) param {
select t {
when int(8) do return "int8_t";
when int(16) do return "int16_t";
when int(32) do return "int32_t";
when int(64) do return "int64_t";
when uint(8) do return "uint8_t";
when uint(16) do return "uint16_t";
when uint(32) do return "uint32_t";
when uint(64) do return "uint64_t";
when real(32) do return "float";
when real(64) do return "double";
otherwise do
compilerError("Arrays with ", t:string, " elements cannot be reduced");
}
return "unknown";
}

proc getExternFuncName(param op: string, type t) param: string {
return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t);
}

proc isValReduce(param op) param {
return op=="sum" || op=="min" || op=="max";
}

proc isValIdxReduce(param op) param {
return op=="minloc" || op=="maxloc";
}

inline proc subReduceValIdx(param op, const baseOffset, ref accum, val) {
// do some type checking to be safe
compilerAssert(isTupleValue(val));
if isTupleValue(accum) {
compilerAssert(isValIdxReduce(op));
compilerAssert(val[1].type == accum[1].type);

}
else {
compilerAssert(isValReduce(op));
compilerAssert(val[1].type == accum.type);
}

select op {
when "sum" do accum += val[1];
when "min" do accum = min(accum, val[1]);
when "max" do accum = max(accum, val[1]);
when "minloc" do
if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]);
when "maxloc" do
if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]);
otherwise do compilerError("Unknown reduction operation: ", op);
}
}

iter offsetsThatCanFitIn32Bits(size: int) {
// Engin: I've tried to get max(int(32)) to work as this bug is about CUB
// using `int` as the size in the interface. However, getting close to
// max(int(32)) also triggers the bug. So, I am choosing this as a
// round/safe value for the time being.
param chunkSize = 2_000_000_000;

use Math only divCeil;
const numChunks = divCeil(size, chunkSize);
const standardChunkSize = divCeil(size, numChunks);

if gpuDebugReduce then
writeln("Will use ", numChunks, " chunks of size ", standardChunkSize);

foreach chunk in 0..<numChunks {
const start = chunk*standardChunkSize;
const curChunkSize = if start+standardChunkSize <= size
then standardChunkSize
else size-start;
if gpuDebugReduce then
writef("Chunk %i: (start=%i, curChunkSize=%i) ", chunk, start,
curChunkSize);

yield (start, curChunkSize);
}
}

use CTypes;

// find the extern function we'll use
param externFunc = getExternFuncName(op, t);
extern externFunc proc reduce_fn(data, size, ref val, ref idx);

// initialize the return value
var ret;
if isValReduce(op) {
var retTmp: t;
if op == "min" then retTmp = max(t);
else if op == "max" then retTmp = min(t);
ret = retTmp;
}
else if isValIdxReduce(op) {
var retTmp: (int, t);
if op == "minloc" then retTmp[1] = max(t);
else if op == "maxloc" then retTmp[1] = min(t);
ret = retTmp;
}
else {
compilerError("Unknown reduction operation: ", op);
ret = 0;
}

// perform the reduction
const basePtr = c_ptrToConst(A);
for (offset,size) in offsetsThatCanFitIn32Bits(A.size) {
var curIdx: int(32) = -1; // should remain -1 for sum, min, max
var curVal: t;
reduce_fn(basePtr+offset, size, curVal, curIdx);
subReduceValIdx(op, offset, ret, (curIdx, curVal));
if gpuDebugReduce then
writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret);
}

if isValIdxReduce(op) then
ret[0] += A.domain.first;

return ret;
}

/*
Add all elements of an array together on the GPU (that is, perform a
sum-reduction). The array must be in GPU-accessible memory and the function
must be called from outside a GPU-eligible loop. Only arrays with int, uint,
and real types are supported. A simple example is the following:

.. code-block:: chapel

  on here.gpus[0] {
    var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
    writeln(gpuSumReduce(Arr)); // 15
  }
*/
inline proc gpuSumReduce(const ref A: [] ?t) do return doGpuReduce("sum", A);

/*
Return the minimum element of an array on the GPU (that is, perform a
min-reduction). The array must be in GPU-accessible memory and the function
must be called from outside a GPU-eligible loop. Only arrays with int, uint,
and real types are supported. A simple example is the following:

.. code-block:: chapel

  on here.gpus[0] {
    var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
    writeln(gpuMinReduce(Arr)); // 1
  }
*/
inline proc gpuMinReduce(const ref A: [] ?t) do return doGpuReduce("min", A);

/*
Return the maximum element of an array on the GPU (that is, perform a
max-reduction). The array must be in GPU-accessible memory and the function
must be called from outside a GPU-eligible loop. Only arrays with int, uint,
and real types are supported. A simple example is the following:

.. code-block:: chapel

  on here.gpus[0] {
    var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
    writeln(gpuMaxReduce(Arr)); // 5
  }
*/
inline proc gpuMaxReduce(const ref A: [] ?t) do return doGpuReduce("max", A);

/*
For an array on the GPU, return a tuple with the index and the value of the
minimum element (that is, perform a minloc-reduction). If there are multiple
elements with the same minimum value, the index of the first one is
returned. The array must be in GPU-accessible memory and the function must
be called from outside a GPU-eligible loop. Only arrays with int, uint, and
real types are supported. A simple example is the following:

.. code-block:: chapel

  on here.gpus[0] {
    var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
    writeln(gpuMinLocReduce(Arr)); // (2, 1). Note that Arr[2]==1.
  }
*/
inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A);

/*
For an array on the GPU, return a tuple with the index and the value of the
maximum element (that is, perform a maxloc-reduction). If there are multiple
elements with the same maximum value, the index of the first one is
returned. The array must be in GPU-accessible memory and the function must
be called from outside a GPU-eligible loop. Only arrays with int, uint, and
real types are supported. A simple example is the following:

.. code-block:: chapel

  on here.gpus[0] {
    var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
    writeln(gpuMaxLocReduce(Arr)); // (3, 5). Note that Arr[3]==5.
  }
*/
inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A);

}
12 changes: 12 additions & 0 deletions runtime/include/chpl-gpu-impl.h
@@ -76,6 +76,18 @@ void chpl_gpu_impl_stream_destroy(void* stream);
bool chpl_gpu_impl_stream_ready(void* stream);
void chpl_gpu_impl_stream_synchronize(void* stream);

#define DECL_ONE_REDUCE_IMPL(chpl_kind, data_type) \
void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx,\
void* stream);
GPU_REDUCE(DECL_ONE_REDUCE_IMPL, sum)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL, min)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL, max)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL, minloc)
GPU_REDUCE(DECL_ONE_REDUCE_IMPL, maxloc)

#undef DECL_ONE_REDUCE_IMPL

#ifdef __cplusplus
}
#endif
14 changes: 14 additions & 0 deletions runtime/include/chpl-gpu.h
@@ -24,6 +24,7 @@
#include <stdbool.h>
#include "chpl-tasks.h"
#include "chpl-mem-desc.h"
#include "gpu/chpl-gpu-reduce-util.h"

#ifdef __cplusplus
extern "C" {
@@ -149,6 +150,19 @@ size_t chpl_gpu_get_alloc_size(void* ptr);
bool chpl_gpu_can_access_peer(int dev1, int dev2);
void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable);

#define DECL_ONE_REDUCE(chpl_kind, data_type) \
void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
data_type* val, int* idx);

GPU_REDUCE(DECL_ONE_REDUCE, sum);
GPU_REDUCE(DECL_ONE_REDUCE, min);
GPU_REDUCE(DECL_ONE_REDUCE, max);
GPU_REDUCE(DECL_ONE_REDUCE, minloc);
GPU_REDUCE(DECL_ONE_REDUCE, maxloc);

#undef DECL_ONE_REDUCE


#endif // HAS_GPU_LOCALE

#ifdef __cplusplus
47 changes: 47 additions & 0 deletions runtime/include/gpu/chpl-gpu-reduce-util.h
@@ -0,0 +1,47 @@
/*
* Copyright 2020-2023 Hewlett Packard Enterprise Development LP
* Copyright 2004-2019 Cray Inc.
* Other additional copyright holders may be indicated within.
*
* The entirety of this work is licensed under the Apache License,
* Version 2.0 (the "License"); you may not use this file except
* in compliance with the License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifdef HAS_GPU_LOCALE

#define GPU_IMPL_REDUCE(MACRO, impl_kind, chpl_kind) \
MACRO(impl_kind, chpl_kind, int8_t) \
MACRO(impl_kind, chpl_kind, int16_t) \
MACRO(impl_kind, chpl_kind, int32_t) \
MACRO(impl_kind, chpl_kind, int64_t) \
MACRO(impl_kind, chpl_kind, uint8_t) \
MACRO(impl_kind, chpl_kind, uint16_t) \
MACRO(impl_kind, chpl_kind, uint32_t) \
MACRO(impl_kind, chpl_kind, uint64_t) \
MACRO(impl_kind, chpl_kind, float) \
MACRO(impl_kind, chpl_kind, double);

#define GPU_REDUCE(MACRO, chpl_kind) \
MACRO(chpl_kind, int8_t) \
MACRO(chpl_kind, int16_t) \
MACRO(chpl_kind, int32_t) \
MACRO(chpl_kind, int64_t) \
MACRO(chpl_kind, uint8_t) \
MACRO(chpl_kind, uint16_t) \
MACRO(chpl_kind, uint32_t) \
MACRO(chpl_kind, uint64_t) \
MACRO(chpl_kind, float) \
MACRO(chpl_kind, double);

#endif // HAS_GPU_LOCALE

29 changes: 29 additions & 0 deletions runtime/src/chpl-gpu.c
@@ -43,6 +43,8 @@ bool chpl_gpu_use_stream_per_task = true;
#include "chpl-env.h"
#include "chpl-comm-compiler-macros.h"

#include "gpu/chpl-gpu-reduce-util.h"

void chpl_gpu_init(void) {
chpl_gpu_impl_init(&chpl_gpu_num_devices);

@@ -700,4 +702,31 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) {
chpl_gpu_impl_set_peer_access(dev1, dev2, enable);
}

#define DEF_ONE_REDUCE(kind, data_type)\
void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \
data_type* val, int* idx) { \
CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \
\
int dev = chpl_task_getRequestedSubloc(); \
chpl_gpu_impl_use_device(dev); \
void* stream = get_stream(dev); \
\
chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, idx, stream); \
\
if (chpl_gpu_sync_with_host) { \
CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \
wait_stream(stream); \
} \
\
CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \
}

GPU_REDUCE(DEF_ONE_REDUCE, sum)
GPU_REDUCE(DEF_ONE_REDUCE, min)
GPU_REDUCE(DEF_ONE_REDUCE, max)
GPU_REDUCE(DEF_ONE_REDUCE, minloc)
GPU_REDUCE(DEF_ONE_REDUCE, maxloc)

#undef DEF_ONE_REDUCE

#endif
1 change: 1 addition & 0 deletions runtime/src/gpu/amd/Makefile.include
@@ -21,5 +21,6 @@ GPU_SUBDIR = src/gpu/amd
GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR)

ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c
ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc

include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share