From 096fdbac820ccaaaad1dda82eb47ef0514893ba5 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Tue, 19 Sep 2023 10:03:19 -0700 Subject: [PATCH 01/42] Snapshot Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu.h | 2 ++ runtime/make/Makefile.runtime.include | 1 + runtime/src/gpu/nvidia/Makefile.include | 3 +++ runtime/src/gpu/nvidia/gpu-nvidia.c | 19 +++++++++++++++++++ 4 files changed, 25 insertions(+) diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index 5fbc5276facc..e4b00361bc87 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -149,6 +149,8 @@ size_t chpl_gpu_get_alloc_size(void* ptr); bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); +int chpl_gpu_aux_sum_reduce(int* data, int n); + #endif // HAS_GPU_LOCALE #ifdef __cplusplus diff --git a/runtime/make/Makefile.runtime.include b/runtime/make/Makefile.runtime.include index b19be2b8ae7e..dd474519c9f7 100644 --- a/runtime/make/Makefile.runtime.include +++ b/runtime/make/Makefile.runtime.include @@ -59,6 +59,7 @@ RUNTIME_INCLS += \ -I. \ -I$(RUNTIME_INCLUDE_ROOT)/localeModels/$(CHPL_MAKE_LOCALE_MODEL) \ -I$(RUNTIME_INCLUDE_ROOT)/localeModels \ + -I$(RUNTIME_INCLUDE_ROOT)/gpu/$(CHPL_MAKE_GPU)/cub-wrappers \ -I$(RUNTIME_INCLUDE_ROOT)/gpu/$(CHPL_MAKE_GPU) \ -I$(RUNTIME_INCLUDE_ROOT)/gpu \ -I$(RUNTIME_INCLUDE_ROOT)/comm/$(CHPL_MAKE_COMM) \ diff --git a/runtime/src/gpu/nvidia/Makefile.include b/runtime/src/gpu/nvidia/Makefile.include index cb4da19051a8..b98d7052dc41 100644 --- a/runtime/src/gpu/nvidia/Makefile.include +++ b/runtime/src/gpu/nvidia/Makefile.include @@ -22,4 +22,7 @@ GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR) ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c +RUNTIME_INCLS += $(CURDIR)/$(GPU_SUBDIR)/cub-wrappers + + include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share diff --git a/runtime/src/gpu/nvidia/gpu-nvidia.c b/runtime/src/gpu/nvidia/gpu-nvidia.c index bc5e2cc787dd..f6774ae7f54c 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia.c +++ b/runtime/src/gpu/nvidia/gpu-nvidia.c @@ -469,5 +469,24 @@ void chpl_gpu_impl_stream_synchronize(void* stream) { CUDA_CALL(cuStreamSynchronize(stream)); } } +/* +#include + +int* chpl_gpu_aux_sum_reduce(int* data, int n) { + + int* result; + void* temp = NULL; + size_t temp_bytes = 0; + cub::DeviceReduce::Sum(temp, temp_bytes, data, result, n); + + // Allocate temporary storage + cuMemAlloc(&(CUdeviceptr)temp, temp_bytes); + + // Run sum-reduction + cub::DeviceReduce::Sum(temp, temp_bytes, d_in, d_out, num_items); + + printf("Result %d\n", *d_out); +} +*/ #endif // HAS_GPU_LOCALE From 9020bf541d43381ebcc5ed39638422ea2b19157e Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 21 Sep 2023 17:16:29 -0700 Subject: [PATCH 02/42] Get things to link in a hacky way Signed-off-by: Engin Kayraklioglu --- runtime/Makefile.help | 3 ++ runtime/src/gpu/nvidia/Makefile | 2 + runtime/src/gpu/nvidia/cub-wrappers/Makefile | 41 +++++++++++++++++++ .../gpu/nvidia/cub-wrappers/Makefile.include | 25 +++++++++++ .../gpu/nvidia/cub-wrappers/Makefile.share | 29 +++++++++++++ .../cub-wrappers/gpu-nvidia-cub-wrappers.cc | 21 ++++++++++ 6 files changed, 121 insertions(+) create mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile create mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile.include create mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile.share create mode 100644 runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc diff --git a/runtime/Makefile.help 
b/runtime/Makefile.help index cf0d29a72fb4..fa98f7761456 100644 --- a/runtime/Makefile.help +++ b/runtime/Makefile.help @@ -78,6 +78,8 @@ include $(call FIND_MAKEFILE,LAUNCHER,launch) include $(call FIND_MAKEFILE,MEM,mem) include $(call FIND_MAKEFILE,TOPO,topo) +include src/gpu/nvidia/cub-wrappers/Makefile.include + include src/qio/Makefile.include @@ -96,6 +98,7 @@ RUNTIME_OBJS = \ $(THREADS_OBJS) \ $(TIMERS_OBJS) \ $(COMM_OBJS) \ + $(CUB_OBJS) \ $(GPU_OBJS) \ $(MEM_COMMON_OBJS) \ $(QIO_OBJS) \ diff --git a/runtime/src/gpu/nvidia/Makefile b/runtime/src/gpu/nvidia/Makefile index 7b82be818066..25026a44bc4c 100644 --- a/runtime/src/gpu/nvidia/Makefile +++ b/runtime/src/gpu/nvidia/Makefile @@ -33,6 +33,8 @@ include Makefile.share TARGETS = $(GPU_OBJS) +SUBDIRS = cub-wrappers + include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules # diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile b/runtime/src/gpu/nvidia/cub-wrappers/Makefile new file mode 100644 index 000000000000..7cbacd864502 --- /dev/null +++ b/runtime/src/gpu/nvidia/cub-wrappers/Makefile @@ -0,0 +1,41 @@ +# Copyright 2020-2023 Hewlett Packard Enterprise Development LP +# Copyright 2004-2019 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +RUNTIME_ROOT = ../../../.. +RUNTIME_SUBDIR = src/gpu/nvidia/cub-wrappers + +ifndef CHPL_MAKE_HOME +export CHPL_MAKE_HOME=$(shell pwd)/$(RUNTIME_ROOT)/.. +endif + +# +# standard header +# +include $(RUNTIME_ROOT)/make/Makefile.runtime.head + +CUB_OBJDIR = $(RUNTIME_OBJDIR) +include Makefile.share + +TARGETS = $(CUB_OBJS) + +include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules + +# +# standard footer +# +include $(RUNTIME_ROOT)/make/Makefile.runtime.foot diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include new file mode 100644 index 000000000000..2b8ca29764af --- /dev/null +++ b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include @@ -0,0 +1,25 @@ +# Copyright 2020-2023 Hewlett Packard Enterprise Development LP +# Copyright 2004-2019 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +CUB_SUBDIR = src/gpu/nvidia/cub-wrappers + +CUB_OBJDIR = $(RUNTIME_BUILD)/$(CUB_SUBDIR) + +ALL_SRCS += $(CURDIR)/$(CUB_SUBDIR)/*.cc + +include $(RUNTIME_ROOT)/$(CUB_SUBDIR)/Makefile.share diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share new file mode 100644 index 000000000000..48f44ffde897 --- /dev/null +++ b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share @@ -0,0 +1,29 @@ +# Copyright 2020-2023 Hewlett Packard Enterprise Development LP +# Copyright 2004-2019 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +CUB_SRCS = gpu-nvidia-cub-wrappers.cc + +SRCS = gpu-nvidia-cub-wrappers.cc + +CUB_OBJS = $(CUB_SRCS:%.cc=$(CUB_OBJDIR)/%.o) + +RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version + +$(RUNTIME_OBJ_DIR)/gpu-nvidia-cub-wrappers.o: gpu-nvidia-cub-wrappers.cc \ + $(RUNTIME_OBJ_DIR_STAMP) + $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc b/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc new file mode 100644 index 000000000000..dcf5f94fde10 --- /dev/null +++ b/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc @@ -0,0 +1,21 @@ +#include +#include + +#include "chpl-gpu.h" + +int chpl_gpu_aux_sum_reduce(int* data, int n) { + + int result = 0; + void* temp = NULL; + size_t temp_bytes = 0; + cub::DeviceReduce::Sum(temp, temp_bytes, data, &result, n); + + // Allocate temporary storage + cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes); + + // Run sum-reduction + cub::DeviceReduce::Sum(temp, temp_bytes, data, &result, n); + + printf("Result %d\n", result); + return result; +} From df97abbc78cb5e31cdd72274bac90a79d4ded44a Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 21 Sep 2023 21:06:50 -0700 Subject: [PATCH 03/42] Get the initial example working Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu.h | 2 +- .../cub-wrappers/gpu-nvidia-cub-wrappers.cc | 24 +++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index e4b00361bc87..eddd3060c7ab 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -149,7 +149,7 @@ size_t chpl_gpu_get_alloc_size(void* ptr); bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); -int chpl_gpu_aux_sum_reduce(int* data, int n); +int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n); #endif // HAS_GPU_LOCALE diff --git a/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc b/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc index dcf5f94fde10..4fc41364ede4 100644 --- a/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc +++ b/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc @@ -2,20 +2,30 @@ #include #include "chpl-gpu.h" +#include 
"../../common/cuda-utils.h" -int chpl_gpu_aux_sum_reduce(int* data, int n) { +int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n) { + + CUdeviceptr result; + + CUDA_CALL(cuMemAlloc(&result, sizeof(int64_t))); - int result = 0; void* temp = NULL; size_t temp_bytes = 0; - cub::DeviceReduce::Sum(temp, temp_bytes, data, &result, n); + cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n); + + printf("Allocation for scratch %zu\n", temp_bytes); // Allocate temporary storage - cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes); + CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); + + printf("Temporary allocated %p\n", temp); // Run sum-reduction - cub::DeviceReduce::Sum(temp, temp_bytes, data, &result, n); + cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n, 0, true); - printf("Result %d\n", result); - return result; + int64_t result_host; + CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(int64_t))); + printf("Result %ld\n", result_host); + return result_host; } From 01bcf849d34219442c6136467343a1c18e40a4b9 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 21 Sep 2023 21:08:53 -0700 Subject: [PATCH 04/42] Cleanup Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/nvidia/gpu-nvidia.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/runtime/src/gpu/nvidia/gpu-nvidia.c b/runtime/src/gpu/nvidia/gpu-nvidia.c index f6774ae7f54c..bc5e2cc787dd 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia.c +++ b/runtime/src/gpu/nvidia/gpu-nvidia.c @@ -469,24 +469,5 @@ void chpl_gpu_impl_stream_synchronize(void* stream) { CUDA_CALL(cuStreamSynchronize(stream)); } } -/* -#include - -int* chpl_gpu_aux_sum_reduce(int* data, int n) { - - int* result; - void* temp = NULL; - size_t temp_bytes = 0; - cub::DeviceReduce::Sum(temp, temp_bytes, data, result, n); - - // Allocate temporary storage - cuMemAlloc(&(CUdeviceptr)temp, temp_bytes); - - // Run sum-reduction - cub::DeviceReduce::Sum(temp, temp_bytes, d_in, d_out, num_items); - - printf("Result %d\n", *d_out); -} -*/ #endif // HAS_GPU_LOCALE From 38dc53fd6ddf1c5d11ca58123a52eaf3668e71e5 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 6 Oct 2023 20:46:26 -0700 Subject: [PATCH 05/42] Drop additional makefile logic Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/nvidia/Makefile | 2 -- runtime/src/gpu/nvidia/Makefile.include | 3 +-- runtime/src/gpu/nvidia/Makefile.share | 14 +++++++++++--- ...pu-nvidia-cub-wrappers.cc => gpu-nvidia-aux.cc} | 10 +++++----- 4 files changed, 17 insertions(+), 12 deletions(-) rename runtime/src/gpu/nvidia/{cub-wrappers/gpu-nvidia-cub-wrappers.cc => gpu-nvidia-aux.cc} (76%) diff --git a/runtime/src/gpu/nvidia/Makefile b/runtime/src/gpu/nvidia/Makefile index 25026a44bc4c..7b82be818066 100644 --- a/runtime/src/gpu/nvidia/Makefile +++ b/runtime/src/gpu/nvidia/Makefile @@ -33,8 +33,6 @@ include Makefile.share TARGETS = $(GPU_OBJS) -SUBDIRS = cub-wrappers - include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules # diff --git a/runtime/src/gpu/nvidia/Makefile.include b/runtime/src/gpu/nvidia/Makefile.include index b98d7052dc41..9e0f8711b311 100644 --- a/runtime/src/gpu/nvidia/Makefile.include +++ b/runtime/src/gpu/nvidia/Makefile.include @@ -21,8 +21,7 @@ GPU_SUBDIR = src/gpu/nvidia GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR) ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c - -RUNTIME_INCLS += $(CURDIR)/$(GPU_SUBDIR)/cub-wrappers +ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share diff --git 
a/runtime/src/gpu/nvidia/Makefile.share b/runtime/src/gpu/nvidia/Makefile.share index 36d2debdee0d..2716a17c89d7 100644 --- a/runtime/src/gpu/nvidia/Makefile.share +++ b/runtime/src/gpu/nvidia/Makefile.share @@ -15,8 +15,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -SRCS = gpu-nvidia.c +GPU_SRCS = gpu-nvidia.c gpu-nvidia-aux.cc -GPU_SRCS = gpu-nvidia.c +SRCS = $(GPU_SRCS) -GPU_OBJS = $(GPU_SRCS:%.c=$(GPU_OBJDIR)/%.o) +GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) + +$(info $$GPU_OBJS is [${GPU_OBJS}]) + +RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version + +$(RUNTIME_OBJ_DIR)/gpu-nvidia-aux.o: gpu-nvidia-aux.cc \ + $(RUNTIME_OBJ_DIR_STAMP) + $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc similarity index 76% rename from runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc rename to runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 4fc41364ede4..49ead366a6db 100644 --- a/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -2,7 +2,7 @@ #include #include "chpl-gpu.h" -#include "../../common/cuda-utils.h" +#include "../common/cuda-utils.h" int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n) { @@ -14,18 +14,18 @@ int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n) { size_t temp_bytes = 0; cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n); - printf("Allocation for scratch %zu\n", temp_bytes); + //printf("Allocation for scratch %zu\n", temp_bytes); // Allocate temporary storage CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); - printf("Temporary allocated %p\n", temp); + //printf("Temporary allocated %p\n", temp); // Run sum-reduction - cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n, 0, true); + cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n, 0); int64_t result_host; CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(int64_t))); - printf("Result %ld\n", result_host); + //printf("Result %ld\n", result_host); return result_host; } From 34173011bb681fe7632cc057e62a3186b46442e9 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 6 Oct 2023 21:40:03 -0700 Subject: [PATCH 06/42] Finish implementing basic reductions Signed-off-by: Engin Kayraklioglu --- runtime/Makefile.help | 2 - runtime/include/chpl-gpu.h | 20 ++++++- runtime/make/Makefile.runtime.include | 1 - runtime/src/gpu/nvidia/cub-wrappers/Makefile | 41 --------------- .../gpu/nvidia/cub-wrappers/Makefile.include | 25 --------- .../gpu/nvidia/cub-wrappers/Makefile.share | 29 ----------- runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 52 ++++++++++--------- 7 files changed, 47 insertions(+), 123 deletions(-) delete mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile delete mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile.include delete mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile.share diff --git a/runtime/Makefile.help b/runtime/Makefile.help index fa98f7761456..910277c18dab 100644 --- a/runtime/Makefile.help +++ b/runtime/Makefile.help @@ -78,8 +78,6 @@ include $(call FIND_MAKEFILE,LAUNCHER,launch) include $(call FIND_MAKEFILE,MEM,mem) include $(call FIND_MAKEFILE,TOPO,topo) -include src/gpu/nvidia/cub-wrappers/Makefile.include - include src/qio/Makefile.include diff 
--git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index eddd3060c7ab..1508cf0a87e0 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -149,7 +149,25 @@ size_t chpl_gpu_get_alloc_size(void* ptr); bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); -int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n); + +#define DECL_ONE_BASIC_REDUCE(chpl_kind, data_type) \ +data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n); + +#define DECL_BASIC_REDUCE(chpl_kind) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, int8_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, int16_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, int32_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, int64_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, uint8_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, uint16_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, uint32_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, uint64_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, float) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, double) + +DECL_BASIC_REDUCE(sum); +DECL_BASIC_REDUCE(min); +DECL_BASIC_REDUCE(max); #endif // HAS_GPU_LOCALE diff --git a/runtime/make/Makefile.runtime.include b/runtime/make/Makefile.runtime.include index dd474519c9f7..b19be2b8ae7e 100644 --- a/runtime/make/Makefile.runtime.include +++ b/runtime/make/Makefile.runtime.include @@ -59,7 +59,6 @@ RUNTIME_INCLS += \ -I. \ -I$(RUNTIME_INCLUDE_ROOT)/localeModels/$(CHPL_MAKE_LOCALE_MODEL) \ -I$(RUNTIME_INCLUDE_ROOT)/localeModels \ - -I$(RUNTIME_INCLUDE_ROOT)/gpu/$(CHPL_MAKE_GPU)/cub-wrappers \ -I$(RUNTIME_INCLUDE_ROOT)/gpu/$(CHPL_MAKE_GPU) \ -I$(RUNTIME_INCLUDE_ROOT)/gpu \ -I$(RUNTIME_INCLUDE_ROOT)/comm/$(CHPL_MAKE_COMM) \ diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile b/runtime/src/gpu/nvidia/cub-wrappers/Makefile deleted file mode 100644 index 7cbacd864502..000000000000 --- a/runtime/src/gpu/nvidia/cub-wrappers/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2020-2023 Hewlett Packard Enterprise Development LP -# Copyright 2004-2019 Cray Inc. -# Other additional copyright holders may be indicated within. -# -# The entirety of this work is licensed under the Apache License, -# Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. -# -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -RUNTIME_ROOT = ../../../.. -RUNTIME_SUBDIR = src/gpu/nvidia/cub-wrappers - -ifndef CHPL_MAKE_HOME -export CHPL_MAKE_HOME=$(shell pwd)/$(RUNTIME_ROOT)/.. -endif - -# -# standard header -# -include $(RUNTIME_ROOT)/make/Makefile.runtime.head - -CUB_OBJDIR = $(RUNTIME_OBJDIR) -include Makefile.share - -TARGETS = $(CUB_OBJS) - -include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules - -# -# standard footer -# -include $(RUNTIME_ROOT)/make/Makefile.runtime.foot diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include deleted file mode 100644 index 2b8ca29764af..000000000000 --- a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2020-2023 Hewlett Packard Enterprise Development LP -# Copyright 2004-2019 Cray Inc. 
-# Other additional copyright holders may be indicated within. -# -# The entirety of this work is licensed under the Apache License, -# Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. -# -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -CUB_SUBDIR = src/gpu/nvidia/cub-wrappers - -CUB_OBJDIR = $(RUNTIME_BUILD)/$(CUB_SUBDIR) - -ALL_SRCS += $(CURDIR)/$(CUB_SUBDIR)/*.cc - -include $(RUNTIME_ROOT)/$(CUB_SUBDIR)/Makefile.share diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share deleted file mode 100644 index 48f44ffde897..000000000000 --- a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2020-2023 Hewlett Packard Enterprise Development LP -# Copyright 2004-2019 Cray Inc. -# Other additional copyright holders may be indicated within. -# -# The entirety of this work is licensed under the Apache License, -# Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. -# -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -CUB_SRCS = gpu-nvidia-cub-wrappers.cc - -SRCS = gpu-nvidia-cub-wrappers.cc - -CUB_OBJS = $(CUB_SRCS:%.cc=$(CUB_OBJDIR)/%.o) - -RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version - -$(RUNTIME_OBJ_DIR)/gpu-nvidia-cub-wrappers.o: gpu-nvidia-cub-wrappers.cc \ - $(RUNTIME_OBJ_DIR_STAMP) - $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 49ead366a6db..17fda58d9336 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -4,28 +4,32 @@ #include "chpl-gpu.h" #include "../common/cuda-utils.h" -int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n) { - - CUdeviceptr result; - - CUDA_CALL(cuMemAlloc(&result, sizeof(int64_t))); - - void* temp = NULL; - size_t temp_bytes = 0; - cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n); - - //printf("Allocation for scratch %zu\n", temp_bytes); - - // Allocate temporary storage - CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); - - //printf("Temporary allocated %p\n", temp); - - // Run sum-reduction - cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n, 0); - - int64_t result_host; - CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(int64_t))); - //printf("Result %ld\n", result_host); - return result_host; +#define DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, data_type) \ +data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n) {\ + CUdeviceptr result; \ + CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ + void* temp = NULL; \ + size_t temp_bytes = 0; \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ + CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ + data_type result_host; \ + CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(data_type))); \ + return result_host; \ } + +#define DEF_BASIC_REDUCE(cub_kind, chpl_kind) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int8_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int16_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int32_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int64_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint8_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint16_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint32_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint64_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, float) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, double); + +DEF_BASIC_REDUCE(Sum, sum) +DEF_BASIC_REDUCE(Min, min) +DEF_BASIC_REDUCE(Max, max) From 17118b35aa4c11d20f6e8fd43670991afe112a9b Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 6 Oct 2023 21:40:43 -0700 Subject: [PATCH 07/42] Add test Signed-off-by: Engin Kayraklioglu --- test/gpu/native/reduction/basic.chpl | 69 ++++++++++++++++++++++++++++ test/gpu/native/reduction/basic.good | 2 + 2 files changed, 71 insertions(+) create mode 100644 test/gpu/native/reduction/basic.chpl create mode 100644 test/gpu/native/reduction/basic.good diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/reduction/basic.chpl new file mode 100644 index 000000000000..7865caa24376 --- /dev/null +++ b/test/gpu/native/reduction/basic.chpl @@ -0,0 +1,69 @@ + +inline proc chpl_reduceHelp(param op: string, ref A: [] ?t): t { + proc chplTypeToCTypeName(type t) param { + select t { + when int(8) do return "int8_t"; + when int(16) do return "int16_t"; + when 
int(32) do return "int32_t"; + when int(64) do return "int64_t"; + when uint(8) do return "uint8_t"; + when uint(16) do return "uint16_t"; + when uint(32) do return "uint32_t"; + when uint(64) do return "uint64_t"; + when real(32) do return "float"; + when real(64) do return "double"; + } + return "unknown"; + } + + proc externFuncName(param op: string, type t) param: string { + return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); + } + + use CTypes; + extern externFuncName(op, t) proc reduce_fn(data, size): t; + return reduce_fn(c_ptrTo(A), A.size); +} + +inline proc gpuSumReduce(ref A: [] ?t): t { return chpl_reduceHelp("sum", A); } +inline proc gpuMinReduce(ref A: [] ?t): t { return chpl_reduceHelp("min", A); } +inline proc gpuMaxReduce(ref A: [] ?t): t { return chpl_reduceHelp("max", A); } + +config const n = 100; + + +proc testType(type t) { + proc test(param op: string, type t) { + on here.gpus[0] { + var Arr: [0..#n] t; + + foreach i in Arr.domain do Arr[i] = i:t; + + var res; + select op { + when "sum" do res=gpuSumReduce(Arr); + when "min" do res=gpuMinReduce(Arr); + when "max" do res=gpuMaxReduce(Arr); + } + + writeln(op, ": ", res); + } + } + + writeln("Testing type ", t:string); + test("sum", t); + test("min", t); + test("max", t); + writeln(); +} + +testType(int(8)); +testType(int(16)); +testType(int(32)); +testType(int(64)); +testType(uint(8)); +testType(uint(16)); +testType(uint(32)); +testType(uint(64)); +testType(real(32)); +testType(real(64)); diff --git a/test/gpu/native/reduction/basic.good b/test/gpu/native/reduction/basic.good new file mode 100644 index 000000000000..053fbaf2a4dd --- /dev/null +++ b/test/gpu/native/reduction/basic.good @@ -0,0 +1,2 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. 
To suppress this warning, compile with --no-checks explicitly +4950 From b5e689befde2993d0c60831b306922a4c5d3dc4d Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Sun, 8 Oct 2023 21:21:58 -0700 Subject: [PATCH 08/42] Add minloc and maxloc reduces Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu.h | 51 +++++++++++------ runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 32 +++++++++++ test/gpu/native/reduction/basic.chpl | 22 +++++++- test/gpu/native/reduction/basic.good | 71 +++++++++++++++++++++++- 4 files changed, 156 insertions(+), 20 deletions(-) diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index 1508cf0a87e0..745b1d2f08b1 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -150,24 +150,43 @@ bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); -#define DECL_ONE_BASIC_REDUCE(chpl_kind, data_type) \ +#define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \ data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n); -#define DECL_BASIC_REDUCE(chpl_kind) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, int8_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, int16_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, int32_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, int64_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, uint8_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, uint16_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, uint32_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, uint64_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, float) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, double) - -DECL_BASIC_REDUCE(sum); -DECL_BASIC_REDUCE(min); -DECL_BASIC_REDUCE(max); +#define DECL_REDUCE_RET_VAL(chpl_kind) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, int8_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, int16_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, int32_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, int64_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint8_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint16_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint32_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint64_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, float) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, double) + +DECL_REDUCE_RET_VAL(sum); +DECL_REDUCE_RET_VAL(min); +DECL_REDUCE_RET_VAL(max); + +#define DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, data_type) \ +void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx); + +#define DECL_REDUCE_RET_VAL_IDX(chpl_kind) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int8_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int16_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int32_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int64_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint8_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint16_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint32_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint64_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, float) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, double) + +DECL_REDUCE_RET_VAL_IDX(minloc); +DECL_REDUCE_RET_VAL_IDX(maxloc); #endif // HAS_GPU_LOCALE diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 17fda58d9336..86190481f58c 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -33,3 +33,35 @@ data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n) {\ DEF_BASIC_REDUCE(Sum, sum) DEF_BASIC_REDUCE(Min, min) DEF_BASIC_REDUCE(Max, max) + +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ 
+void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx) {\ + using kvp = cub::KeyValuePair; \ + CUdeviceptr result; \ + CUDA_CALL(cuMemAlloc(&result, sizeof(kvp))); \ + void* temp = NULL; \ + size_t temp_bytes = 0; \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n); \ + CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n); \ + kvp result_host; \ + CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(kvp))); \ + *val = result_host.value; \ + *idx = result_host.key; \ +} + +#define DEF_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int8_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int16_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int32_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int64_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint8_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint16_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint32_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint64_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, float) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, double); + +DEF_REDUCE_RET_VAL_IDX(ArgMin, minloc) +DEF_REDUCE_RET_VAL_IDX(ArgMax, maxloc) diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/reduction/basic.chpl index 7865caa24376..1bb3d4b1c7be 100644 --- a/test/gpu/native/reduction/basic.chpl +++ b/test/gpu/native/reduction/basic.chpl @@ -1,5 +1,5 @@ -inline proc chpl_reduceHelp(param op: string, ref A: [] ?t): t { +inline proc chpl_reduceHelp(param op: string, ref A: [] ?t) { proc chplTypeToCTypeName(type t) param { select t { when int(8) do return "int8_t"; @@ -21,13 +21,25 @@ inline proc chpl_reduceHelp(param op: string, ref A: [] ?t): t { } use CTypes; - extern externFuncName(op, t) proc reduce_fn(data, size): t; - return reduce_fn(c_ptrTo(A), A.size); + + if op == "sum" || op == "min" || op == "max" { + extern externFuncName(op, t) proc reduce_fn(data, size): t; + return reduce_fn(c_ptrTo(A), A.size); + } + else { + var idx: int(32); + var val: t; + extern externFuncName(op, t) proc reduce_fn(data, size, ref val, ref idx); + reduce_fn(c_ptrTo(A), A.size, val, idx); + return (idx, val); + } } inline proc gpuSumReduce(ref A: [] ?t): t { return chpl_reduceHelp("sum", A); } inline proc gpuMinReduce(ref A: [] ?t): t { return chpl_reduceHelp("min", A); } inline proc gpuMaxReduce(ref A: [] ?t): t { return chpl_reduceHelp("max", A); } +inline proc gpuMinLocReduce(ref A: [] ?t) { return chpl_reduceHelp("minloc", A); } +inline proc gpuMaxLocReduce(ref A: [] ?t) { return chpl_reduceHelp("maxloc", A); } config const n = 100; @@ -44,6 +56,8 @@ proc testType(type t) { when "sum" do res=gpuSumReduce(Arr); when "min" do res=gpuMinReduce(Arr); when "max" do res=gpuMaxReduce(Arr); + when "minloc" do res=gpuMinLocReduce(Arr); + when "maxloc" do res=gpuMaxLocReduce(Arr); } writeln(op, ": ", res); @@ -54,6 +68,8 @@ proc testType(type t) { test("sum", t); test("min", t); test("max", t); + test("minloc", t); + test("maxloc", t); writeln(); } diff --git a/test/gpu/native/reduction/basic.good b/test/gpu/native/reduction/basic.good index 053fbaf2a4dd..4f9386297b60 100644 --- a/test/gpu/native/reduction/basic.good +++ b/test/gpu/native/reduction/basic.good @@ -1,2 +1,71 @@ warning: The prototype GPU support implies --no-checks. This may impact debuggability. 
To suppress this warning, compile with --no-checks explicitly -4950 +Testing type int(8) +sum: 86 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type int(16) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type int(32) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type int(64) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(8) +sum: 86 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(16) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(32) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(64) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type real(32) +sum: 4950.0 +min: 0.0 +max: 99.0 +minloc: (0, 0.0) +maxloc: (99, 99.0) + +Testing type real(64) +sum: 4950.0 +min: 0.0 +max: 99.0 +minloc: (0, 0.0) +maxloc: (99, 99.0) + From f0369318eade2d1828956e11e81156e92dc942c2 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Sun, 8 Oct 2023 21:29:45 -0700 Subject: [PATCH 09/42] Refactor basic reduce runtime interface, too Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu.h | 3 +- runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 37 ++++++++++++------------ test/gpu/native/reduction/basic.chpl | 24 ++++++++------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index 745b1d2f08b1..510b78c1f238 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -151,7 +151,8 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); #define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \ -data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n); +void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val); #define DECL_REDUCE_RET_VAL(chpl_kind) \ DECL_ONE_REDUCE_RET_VAL(chpl_kind, int8_t) \ diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 86190481f58c..2eb127a7c0e0 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -4,8 +4,9 @@ #include "chpl-gpu.h" #include "../common/cuda-utils.h" -#define DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, data_type) \ -data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n) {\ +#define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val) {\ CUdeviceptr result; \ CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ void* temp = NULL; \ @@ -13,26 +14,24 @@ data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n) {\ cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ - data_type result_host; \ - CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(data_type))); \ - return result_host; \ + CUDA_CALL(cuMemcpyDtoH(val, result, sizeof(data_type))); \ } -#define DEF_BASIC_REDUCE(cub_kind, chpl_kind) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int8_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int16_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int32_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int64_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint8_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint16_t) \ - 
DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint32_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint64_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, float) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, double); +#define DEF_REDUCE_RET_VAL(cub_kind, chpl_kind) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int8_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int16_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int32_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int64_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint8_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint16_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint32_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint64_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, float) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, double); -DEF_BASIC_REDUCE(Sum, sum) -DEF_BASIC_REDUCE(Min, min) -DEF_BASIC_REDUCE(Max, max) +DEF_REDUCE_RET_VAL(Sum, sum) +DEF_REDUCE_RET_VAL(Min, min) +DEF_REDUCE_RET_VAL(Max, max) #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/reduction/basic.chpl index 1bb3d4b1c7be..0933ac1bfcfd 100644 --- a/test/gpu/native/reduction/basic.chpl +++ b/test/gpu/native/reduction/basic.chpl @@ -1,5 +1,5 @@ -inline proc chpl_reduceHelp(param op: string, ref A: [] ?t) { +inline proc chpl_doGpuReduce(param op: string, ref A: [] ?t) { proc chplTypeToCTypeName(type t) param { select t { when int(8) do return "int8_t"; @@ -16,30 +16,34 @@ inline proc chpl_reduceHelp(param op: string, ref A: [] ?t) { return "unknown"; } - proc externFuncName(param op: string, type t) param: string { + proc getExternFuncName(param op: string, type t) param: string { return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); } use CTypes; + param externFunc = getExternFuncName(op, t); + if op == "sum" || op == "min" || op == "max" { - extern externFuncName(op, t) proc reduce_fn(data, size): t; - return reduce_fn(c_ptrTo(A), A.size); + var val: t; + extern externFunc proc reduce_fn(data, size, ref val); + reduce_fn(c_ptrTo(A), A.size, val); + return val; } else { var idx: int(32); var val: t; - extern externFuncName(op, t) proc reduce_fn(data, size, ref val, ref idx); + extern externFunc proc reduce_fn(data, size, ref val, ref idx); reduce_fn(c_ptrTo(A), A.size, val, idx); return (idx, val); } } -inline proc gpuSumReduce(ref A: [] ?t): t { return chpl_reduceHelp("sum", A); } -inline proc gpuMinReduce(ref A: [] ?t): t { return chpl_reduceHelp("min", A); } -inline proc gpuMaxReduce(ref A: [] ?t): t { return chpl_reduceHelp("max", A); } -inline proc gpuMinLocReduce(ref A: [] ?t) { return chpl_reduceHelp("minloc", A); } -inline proc gpuMaxLocReduce(ref A: [] ?t) { return chpl_reduceHelp("maxloc", A); } +inline proc gpuSumReduce(ref A: [] ?t): t { return chpl_doGpuReduce("sum", A); } +inline proc gpuMinReduce(ref A: [] ?t): t { return chpl_doGpuReduce("min", A); } +inline proc gpuMaxReduce(ref A: [] ?t): t { return chpl_doGpuReduce("max", A); } +inline proc gpuMinLocReduce(ref A: [] ?t) { return chpl_doGpuReduce("minloc", A); } +inline proc gpuMaxLocReduce(ref A: [] ?t) { return chpl_doGpuReduce("maxloc", A); } config const n = 100; From 41e30a9f6885affb16f371bbfff74103ea20ddab Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Sun, 8 Oct 2023 21:32:12 -0700 Subject: [PATCH 10/42] Move the actual reduction functions to the GPU module Signed-off-by: Engin 
Kayraklioglu --- modules/standard/GPU.chpl | 51 ++++++++++++++++++++++++++++ test/gpu/native/reduction/basic.chpl | 48 +------------------------- 2 files changed, 52 insertions(+), 47 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 62b333d6091b..566d9461b473 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -389,4 +389,55 @@ module GPU /* When run on a GPU, atomically compare the value in 'x' and 'cmp', if they are equal store 'val' in 'x'. The operation returns the old value of x. */ inline proc gpuAtomicCAS( ref x : ?T, cmp : T, val : T) : T { return gpuAtomicTernOp("CAS", x, cmp, val); } + + // ============================ + // Reductions + // ============================ + + private inline proc doGpuReduce(param op: string, ref A: [] ?t) { + proc chplTypeToCTypeName(type t) param { + select t { + when int(8) do return "int8_t"; + when int(16) do return "int16_t"; + when int(32) do return "int32_t"; + when int(64) do return "int64_t"; + when uint(8) do return "uint8_t"; + when uint(16) do return "uint16_t"; + when uint(32) do return "uint32_t"; + when uint(64) do return "uint64_t"; + when real(32) do return "float"; + when real(64) do return "double"; + } + return "unknown"; + } + + proc getExternFuncName(param op: string, type t) param: string { + return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); + } + + use CTypes; + + param externFunc = getExternFuncName(op, t); + + if op == "sum" || op == "min" || op == "max" { + var val: t; + extern externFunc proc reduce_fn(data, size, ref val); + reduce_fn(c_ptrTo(A), A.size, val); + return val; + } + else { + var idx: int(32); + var val: t; + extern externFunc proc reduce_fn(data, size, ref val, ref idx); + reduce_fn(c_ptrTo(A), A.size, val, idx); + return (idx, val); + } + } + + inline proc gpuSumReduce(ref A: [] ?t) do return doGpuReduce("sum", A); + inline proc gpuMinReduce(ref A: [] ?t) do return doGpuReduce("min", A); + inline proc gpuMaxReduce(ref A: [] ?t) do return doGpuReduce("max", A); + inline proc gpuMinLocReduce(ref A: [] ?t) do return doGpuReduce("minloc", A); + inline proc gpuMaxLocReduce(ref A: [] ?t) do return doGpuReduce("maxloc", A); + } diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/reduction/basic.chpl index 0933ac1bfcfd..5a1ceef66952 100644 --- a/test/gpu/native/reduction/basic.chpl +++ b/test/gpu/native/reduction/basic.chpl @@ -1,53 +1,7 @@ - -inline proc chpl_doGpuReduce(param op: string, ref A: [] ?t) { - proc chplTypeToCTypeName(type t) param { - select t { - when int(8) do return "int8_t"; - when int(16) do return "int16_t"; - when int(32) do return "int32_t"; - when int(64) do return "int64_t"; - when uint(8) do return "uint8_t"; - when uint(16) do return "uint16_t"; - when uint(32) do return "uint32_t"; - when uint(64) do return "uint64_t"; - when real(32) do return "float"; - when real(64) do return "double"; - } - return "unknown"; - } - - proc getExternFuncName(param op: string, type t) param: string { - return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); - } - - use CTypes; - - param externFunc = getExternFuncName(op, t); - - if op == "sum" || op == "min" || op == "max" { - var val: t; - extern externFunc proc reduce_fn(data, size, ref val); - reduce_fn(c_ptrTo(A), A.size, val); - return val; - } - else { - var idx: int(32); - var val: t; - extern externFunc proc reduce_fn(data, size, ref val, ref idx); - reduce_fn(c_ptrTo(A), A.size, val, idx); - return (idx, val); - } -} - -inline proc gpuSumReduce(ref A: [] ?t): t { 
return chpl_doGpuReduce("sum", A); } -inline proc gpuMinReduce(ref A: [] ?t): t { return chpl_doGpuReduce("min", A); } -inline proc gpuMaxReduce(ref A: [] ?t): t { return chpl_doGpuReduce("max", A); } -inline proc gpuMinLocReduce(ref A: [] ?t) { return chpl_doGpuReduce("minloc", A); } -inline proc gpuMaxLocReduce(ref A: [] ?t) { return chpl_doGpuReduce("maxloc", A); } +use GPU; config const n = 100; - proc testType(type t) { proc test(param op: string, type t) { on here.gpus[0] { From 8d0e3e07011f58e2e79f9ae82b434c4d96e74640 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Sun, 8 Oct 2023 21:39:50 -0700 Subject: [PATCH 11/42] Simplify runtime macros Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu.h | 46 +++++++++--------------- runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 45 +++++++++-------------- 2 files changed, 34 insertions(+), 57 deletions(-) diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index 510b78c1f238..a52d40f53fab 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -150,44 +150,32 @@ bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); +#define DECL_REDUCE(MACRO, chpl_kind) \ + MACRO(chpl_kind, int8_t) \ + MACRO(chpl_kind, int16_t) \ + MACRO(chpl_kind, int32_t) \ + MACRO(chpl_kind, int64_t) \ + MACRO(chpl_kind, uint8_t) \ + MACRO(chpl_kind, uint16_t) \ + MACRO(chpl_kind, uint32_t) \ + MACRO(chpl_kind, uint64_t) \ + MACRO(chpl_kind, float) \ + MACRO(chpl_kind, double) + #define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val); -#define DECL_REDUCE_RET_VAL(chpl_kind) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, int8_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, int16_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, int32_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, int64_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint8_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint16_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint32_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint64_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, float) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, double) - -DECL_REDUCE_RET_VAL(sum); -DECL_REDUCE_RET_VAL(min); -DECL_REDUCE_RET_VAL(max); +DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum); +DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, min); +DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, max); #define DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx); -#define DECL_REDUCE_RET_VAL_IDX(chpl_kind) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int8_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int16_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int32_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int64_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint8_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint16_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint32_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint64_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, float) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, double) - -DECL_REDUCE_RET_VAL_IDX(minloc); -DECL_REDUCE_RET_VAL_IDX(maxloc); +DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, minloc); +DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, maxloc); #endif // HAS_GPU_LOCALE diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 2eb127a7c0e0..16f719c72b44 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ 
b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -4,6 +4,18 @@ #include "chpl-gpu.h" #include "../common/cuda-utils.h" +#define DEF_REDUCE(MACRO, cub_kind, chpl_kind) \ + MACRO(cub_kind, chpl_kind, int8_t) \ + MACRO(cub_kind, chpl_kind, int16_t) \ + MACRO(cub_kind, chpl_kind, int32_t) \ + MACRO(cub_kind, chpl_kind, int64_t) \ + MACRO(cub_kind, chpl_kind, uint8_t) \ + MACRO(cub_kind, chpl_kind, uint16_t) \ + MACRO(cub_kind, chpl_kind, uint32_t) \ + MACRO(cub_kind, chpl_kind, uint64_t) \ + MACRO(cub_kind, chpl_kind, float) \ + MACRO(cub_kind, chpl_kind, double); + #define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val) {\ @@ -17,21 +29,9 @@ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ CUDA_CALL(cuMemcpyDtoH(val, result, sizeof(data_type))); \ } -#define DEF_REDUCE_RET_VAL(cub_kind, chpl_kind) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int8_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int16_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int32_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int64_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint8_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint16_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint32_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint64_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, float) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, double); - -DEF_REDUCE_RET_VAL(Sum, sum) -DEF_REDUCE_RET_VAL(Min, min) -DEF_REDUCE_RET_VAL(Max, max) +DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) +DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) +DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ @@ -50,17 +50,6 @@ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ *idx = result_host.key; \ } -#define DEF_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int8_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int16_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int32_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int64_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint8_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint16_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint32_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint64_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, float) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, double); +DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) +DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) -DEF_REDUCE_RET_VAL_IDX(ArgMin, minloc) -DEF_REDUCE_RET_VAL_IDX(ArgMax, maxloc) From c56df6bed883e104a5a3c9196439bd68cfa78f2c Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 13 Oct 2023 15:34:48 -0700 Subject: [PATCH 12/42] Start separating impl implementations Signed-off-by: Engin Kayraklioglu --- runtime/src/chpl-gpu.c | 36 ++++++++++++++++++++++++ runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 5 ++-- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/runtime/src/chpl-gpu.c b/runtime/src/chpl-gpu.c index bded55034def..bf3f6d9ae955 100644 --- a/runtime/src/chpl-gpu.c +++ b/runtime/src/chpl-gpu.c @@ -697,4 +697,40 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) { chpl_gpu_impl_set_peer_access(dev1, dev2, enable); } +void 
chpl_gpu_sum_reduce_int8_t(int8_t *data, int n, int* val) { + CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t called\n"); + + int dev = chpl_task_getRequestedSubloc(); + chpl_gpu_impl_use_device(dev); + void* stream = get_stream(dev); + + chpl_gpu_impl_sum_reduce_int8_t(data, n, val, stream); + + if (chpl_gpu_sync_with_host) { + CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); + wait_stream(stream); + } + + CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); +} + +#define DEF_ONE_REDUCE_RET_VAL(kind, data_type)\ +void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ + data_type* val) { \ + CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \ + \ + int dev = chpl_task_getRequestedSubloc(); \ + chpl_gpu_impl_use_device(dev); \ + void* stream = get_stream(dev); \ + \ + chpl_gpu_impl_sum_reduce_int8_t(data, n, val, stream); + + if (chpl_gpu_sync_with_host) { + CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); + wait_stream(stream); + } + + CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); +} + #endif diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 16f719c72b44..75f3fd7bb986 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -28,10 +28,10 @@ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ CUDA_CALL(cuMemcpyDtoH(val, result, sizeof(data_type))); \ } - DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) +#undef DEF_ONE_REDUCE_RET_VAL #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ @@ -49,7 +49,8 @@ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ *val = result_host.value; \ *idx = result_host.key; \ } - DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) +#undef DEF_ONE_REDUCE_RET_VAL_IDX +#undef DEF_REDUCE From 7aa3642bb1578e74efeb1dfcf46de4477e4e87bd Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 16 Oct 2023 16:07:19 -0700 Subject: [PATCH 13/42] Seperate runtime interface from the implementation interface Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu-impl.h | 21 ++++++++ runtime/include/chpl-gpu.h | 28 ++++------- runtime/src/chpl-gpu.c | 62 +++++++++++++++--------- runtime/src/gpu/nvidia/Makefile.share | 2 - runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 56 ++++++++++----------- 5 files changed, 99 insertions(+), 70 deletions(-) diff --git a/runtime/include/chpl-gpu-impl.h b/runtime/include/chpl-gpu-impl.h index 1dbd1a4563b3..6c01a07aea1f 100644 --- a/runtime/include/chpl-gpu-impl.h +++ b/runtime/include/chpl-gpu-impl.h @@ -76,6 +76,27 @@ void chpl_gpu_impl_stream_destroy(void* stream); bool chpl_gpu_impl_stream_ready(void* stream); void chpl_gpu_impl_stream_synchronize(void* stream); +#define DECL_ONE_REDUCE_IMPL_RET_VAL(chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream); + +GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, sum) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, min) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, max) + +#undef DECL_ONE_REDUCE_RET_VAL + +#define DECL_ONE_REDUCE_IMPL_RET_VAL_IDX(chpl_kind, data_type) \ +void 
chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream); + +GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, minloc) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, maxloc) + +#undef DECL_ONE_REDUCE_RET_VAL_IDX + #ifdef __cplusplus } #endif diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index a52d40f53fab..f3b5ae31dc8b 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -24,6 +24,7 @@ #include #include "chpl-tasks.h" #include "chpl-mem-desc.h" +#include "gpu/chpl-gpu-reduce-util.h" #ifdef __cplusplus extern "C" { @@ -149,33 +150,24 @@ size_t chpl_gpu_get_alloc_size(void* ptr); bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); - -#define DECL_REDUCE(MACRO, chpl_kind) \ - MACRO(chpl_kind, int8_t) \ - MACRO(chpl_kind, int16_t) \ - MACRO(chpl_kind, int32_t) \ - MACRO(chpl_kind, int64_t) \ - MACRO(chpl_kind, uint8_t) \ - MACRO(chpl_kind, uint16_t) \ - MACRO(chpl_kind, uint32_t) \ - MACRO(chpl_kind, uint64_t) \ - MACRO(chpl_kind, float) \ - MACRO(chpl_kind, double) - #define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val); -DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum); -DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, min); -DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, max); +GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum); +GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, min); +GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, max); + +#undef DECL_ONE_REDUCE_RET_VAL #define DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx); -DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, minloc); -DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, maxloc); +GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, minloc); +GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, maxloc); + +#undef DECL_ONE_REDUCE_RET_VAL_IDX #endif // HAS_GPU_LOCALE diff --git a/runtime/src/chpl-gpu.c b/runtime/src/chpl-gpu.c index bf3f6d9ae955..7a5508e7e0e7 100644 --- a/runtime/src/chpl-gpu.c +++ b/runtime/src/chpl-gpu.c @@ -43,6 +43,8 @@ bool chpl_gpu_use_stream_per_task = true; #include "chpl-env.h" #include "chpl-comm-compiler-macros.h" +#include "gpu/chpl-gpu-reduce-util.h" + void chpl_gpu_init(void) { chpl_gpu_impl_init(&chpl_gpu_num_devices); @@ -697,23 +699,6 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) { chpl_gpu_impl_set_peer_access(dev1, dev2, enable); } -void chpl_gpu_sum_reduce_int8_t(int8_t *data, int n, int* val) { - CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t called\n"); - - int dev = chpl_task_getRequestedSubloc(); - chpl_gpu_impl_use_device(dev); - void* stream = get_stream(dev); - - chpl_gpu_impl_sum_reduce_int8_t(data, n, val, stream); - - if (chpl_gpu_sync_with_host) { - CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); - wait_stream(stream); - } - - CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); -} - #define DEF_ONE_REDUCE_RET_VAL(kind, data_type)\ void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ data_type* val) { \ @@ -723,14 +708,45 @@ void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ chpl_gpu_impl_use_device(dev); \ void* stream = get_stream(dev); \ \ - chpl_gpu_impl_sum_reduce_int8_t(data, n, val, stream); + chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, stream); \ + \ + if (chpl_gpu_sync_with_host) { \ + CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \ + 
wait_stream(stream); \ + } \ + \ + CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); \ +} - if (chpl_gpu_sync_with_host) { - CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); - wait_stream(stream); - } +GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, sum) +GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, min) +GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, max) + +#undef DEF_ONE_REDUCE_RET_VAL - CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); +#define DEF_ONE_REDUCE_RET_VAL_IDX(kind, data_type)\ +void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ + data_type* val, int* idx) { \ + CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \ + \ + int dev = chpl_task_getRequestedSubloc(); \ + chpl_gpu_impl_use_device(dev); \ + void* stream = get_stream(dev); \ + \ + chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, idx, stream); \ + \ + if (chpl_gpu_sync_with_host) { \ + CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \ + wait_stream(stream); \ + } \ + \ + CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); \ } +GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, minloc); +GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, maxloc); + +#undef DEF_ONE_REDUCE_RET_VAL_IDX + + #endif diff --git a/runtime/src/gpu/nvidia/Makefile.share b/runtime/src/gpu/nvidia/Makefile.share index 2716a17c89d7..41930a17bc01 100644 --- a/runtime/src/gpu/nvidia/Makefile.share +++ b/runtime/src/gpu/nvidia/Makefile.share @@ -21,8 +21,6 @@ SRCS = $(GPU_SRCS) GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) -$(info $$GPU_OBJS is [${GPU_OBJS}]) - RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version $(RUNTIME_OBJ_DIR)/gpu-nvidia-aux.o: gpu-nvidia-aux.cc \ diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 75f3fd7bb986..89fc8789e5a9 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -2,55 +2,57 @@ #include #include "chpl-gpu.h" +#include "chpl-gpu-impl.h" #include "../common/cuda-utils.h" - -#define DEF_REDUCE(MACRO, cub_kind, chpl_kind) \ - MACRO(cub_kind, chpl_kind, int8_t) \ - MACRO(cub_kind, chpl_kind, int16_t) \ - MACRO(cub_kind, chpl_kind, int32_t) \ - MACRO(cub_kind, chpl_kind, int64_t) \ - MACRO(cub_kind, chpl_kind, uint8_t) \ - MACRO(cub_kind, chpl_kind, uint16_t) \ - MACRO(cub_kind, chpl_kind, uint32_t) \ - MACRO(cub_kind, chpl_kind, uint64_t) \ - MACRO(cub_kind, chpl_kind, float) \ - MACRO(cub_kind, chpl_kind, double); +#include "gpu/chpl-gpu-reduce-util.h" #define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ -void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val) {\ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ CUdeviceptr result; \ CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ void* temp = NULL; \ size_t temp_bytes = 0; \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ - CUDA_CALL(cuMemcpyDtoH(val, result, sizeof(data_type))); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ + CUDA_CALL(cuMemcpyDtoHAsync(val, result, sizeof(data_type),\ + (CUstream)stream)); \ } -DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) 
-DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) -DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) + #undef DEF_ONE_REDUCE_RET_VAL #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ -void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val, int* idx) {\ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ using kvp = cub::KeyValuePair; \ CUdeviceptr result; \ CUDA_CALL(cuMemAlloc(&result, sizeof(kvp))); \ void* temp = NULL; \ size_t temp_bytes = 0; \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ + (CUstream)stream);\ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ + (CUstream)stream);\ kvp result_host; \ - CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(kvp))); \ + CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp),\ + (CUstream)stream)); \ *val = result_host.value; \ *idx = result_host.key; \ } -DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) -DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) + #undef DEF_ONE_REDUCE_RET_VAL_IDX #undef DEF_REDUCE From cf1e526a0b016cddc6a9ef4b792c3e3a6a202fed Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 16 Oct 2023 16:14:56 -0700 Subject: [PATCH 14/42] Runtime cleanup Signed-off-by: Engin Kayraklioglu --- runtime/Makefile.help | 1 - runtime/src/gpu/nvidia/Makefile.share | 4 ++-- ...gpu-nvidia-aux.cc => gpu-nvidia-reduce.cc} | 24 +++++++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) rename runtime/src/gpu/nvidia/{gpu-nvidia-aux.cc => gpu-nvidia-reduce.cc} (74%) diff --git a/runtime/Makefile.help b/runtime/Makefile.help index 910277c18dab..cf0d29a72fb4 100644 --- a/runtime/Makefile.help +++ b/runtime/Makefile.help @@ -96,7 +96,6 @@ RUNTIME_OBJS = \ $(THREADS_OBJS) \ $(TIMERS_OBJS) \ $(COMM_OBJS) \ - $(CUB_OBJS) \ $(GPU_OBJS) \ $(MEM_COMMON_OBJS) \ $(QIO_OBJS) \ diff --git a/runtime/src/gpu/nvidia/Makefile.share b/runtime/src/gpu/nvidia/Makefile.share index 41930a17bc01..a0c109e22f0d 100644 --- a/runtime/src/gpu/nvidia/Makefile.share +++ b/runtime/src/gpu/nvidia/Makefile.share @@ -15,7 +15,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
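
For reference, every per-type function generated above follows CUB's two-phase convention: the first DeviceReduce call is made with a NULL temporary buffer and only reports how many scratch bytes are needed; the second call performs the actual reduction on the given stream. The following is a minimal standalone sketch of that pattern for a plain int array, not the runtime's code: it uses the CUDA runtime API and illustrative names (sumOnDevice, argMinOnDevice), whereas the wrappers above use the driver API, CUDA_CALL error checking, and per-task streams.

    // Sketch only: two-phase cub::DeviceReduce on a stream, CUDA runtime API.
    // Compile as CUDA C++ (the runtime achieves this by passing -x cuda).
    #include <cub/cub.cuh>

    int sumOnDevice(const int* d_in, int num_items, cudaStream_t stream) {
      int* d_out = nullptr;
      cudaMalloc((void**)&d_out, sizeof(int));

      void* d_temp = nullptr;
      size_t temp_bytes = 0;
      // pass 1: d_temp == NULL, so CUB only computes temp_bytes
      cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items, stream);
      cudaMalloc(&d_temp, temp_bytes);
      // pass 2: the real reduction, enqueued on `stream`
      cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items, stream);

      int result = 0;
      cudaMemcpyAsync(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);
      cudaFree(d_temp);
      cudaFree(d_out);
      return result;
    }

    // ArgMin/ArgMax write a cub::KeyValuePair<int, T>, which is why the
    // minloc/maxloc wrappers above copy back a struct and then unpack
    // .key (index) and .value separately.
    void argMinOnDevice(const int* d_in, int num_items, cudaStream_t stream,
                        int* idx, int* val) {
      using kvp = cub::KeyValuePair<int, int>;
      kvp* d_out = nullptr;
      cudaMalloc((void**)&d_out, sizeof(kvp));

      void* d_temp = nullptr;
      size_t temp_bytes = 0;
      cub::DeviceReduce::ArgMin(d_temp, temp_bytes, d_in, d_out, num_items, stream);
      cudaMalloc(&d_temp, temp_bytes);
      cub::DeviceReduce::ArgMin(d_temp, temp_bytes, d_in, d_out, num_items, stream);

      kvp host;
      cudaMemcpyAsync(&host, d_out, sizeof(kvp), cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);
      *idx = host.key;
      *val = host.value;
      cudaFree(d_temp);
      cudaFree(d_out);
    }

The double call looks odd at first, but it is how CUB lets the caller own the scratch allocation: CUB never allocates device memory itself, it only tells the caller how much it needs.
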
-GPU_SRCS = gpu-nvidia.c gpu-nvidia-aux.cc +GPU_SRCS = gpu-nvidia.c gpu-nvidia-reduce.cc SRCS = $(GPU_SRCS) @@ -23,6 +23,6 @@ GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version -$(RUNTIME_OBJ_DIR)/gpu-nvidia-aux.o: gpu-nvidia-aux.cc \ +$(RUNTIME_OBJ_DIR)/gpu-nvidia-reduce.o: gpu-nvidia-reduce.cc \ $(RUNTIME_OBJ_DIR_STAMP) $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc similarity index 74% rename from runtime/src/gpu/nvidia/gpu-nvidia-aux.cc rename to runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc index 89fc8789e5a9..6c9f7a350c34 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc @@ -1,3 +1,24 @@ +/* + * Copyright 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright 2004-2019 Cray Inc. + * Other additional copyright holders may be indicated within. * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAS_GPU_LOCALE + #include #include @@ -56,3 +77,6 @@ GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) #undef DEF_ONE_REDUCE_RET_VAL_IDX #undef DEF_REDUCE + +#endif // HAS_GPU_LOCALE + From 1489028227f050a0ec1837395518204c9dcd2938 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 16 Oct 2023 16:46:18 -0700 Subject: [PATCH 15/42] Add a common header Signed-off-by: Engin Kayraklioglu --- runtime/include/gpu/chpl-gpu-reduce-util.h | 47 ++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 runtime/include/gpu/chpl-gpu-reduce-util.h diff --git a/runtime/include/gpu/chpl-gpu-reduce-util.h b/runtime/include/gpu/chpl-gpu-reduce-util.h new file mode 100644 index 000000000000..6076fb0b98a5 --- /dev/null +++ b/runtime/include/gpu/chpl-gpu-reduce-util.h @@ -0,0 +1,47 @@ +/* + * Copyright 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright 2004-2019 Cray Inc. + * Other additional copyright holders may be indicated within. * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef HAS_GPU_LOCALE + +#define GPU_IMPL_REDUCE(MACRO, impl_kind, chpl_kind) \ + MACRO(impl_kind, chpl_kind, int8_t) \ + MACRO(impl_kind, chpl_kind, int16_t) \ + MACRO(impl_kind, chpl_kind, int32_t) \ + MACRO(impl_kind, chpl_kind, int64_t) \ + MACRO(impl_kind, chpl_kind, uint8_t) \ + MACRO(impl_kind, chpl_kind, uint16_t) \ + MACRO(impl_kind, chpl_kind, uint32_t) \ + MACRO(impl_kind, chpl_kind, uint64_t) \ + MACRO(impl_kind, chpl_kind, float) \ + MACRO(impl_kind, chpl_kind, double); + +#define GPU_REDUCE(MACRO, chpl_kind) \ + MACRO(chpl_kind, int8_t) \ + MACRO(chpl_kind, int16_t) \ + MACRO(chpl_kind, int32_t) \ + MACRO(chpl_kind, int64_t) \ + MACRO(chpl_kind, uint8_t) \ + MACRO(chpl_kind, uint16_t) \ + MACRO(chpl_kind, uint32_t) \ + MACRO(chpl_kind, uint64_t) \ + MACRO(chpl_kind, float) \ + MACRO(chpl_kind, double); + +#endif // HAS_GPU_LOCALE + From 60f921c918e13875c41f920a3d135332d2206146 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 16 Oct 2023 17:06:59 -0700 Subject: [PATCH 16/42] Blind implementation of reductions on AMD Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/amd/Makefile.include | 1 + runtime/src/gpu/amd/Makefile.share | 12 +++-- runtime/src/gpu/amd/gpu-amd-reduce.cc | 78 +++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 runtime/src/gpu/amd/gpu-amd-reduce.cc diff --git a/runtime/src/gpu/amd/Makefile.include b/runtime/src/gpu/amd/Makefile.include index 7ba37d695b24..7bce5c886202 100644 --- a/runtime/src/gpu/amd/Makefile.include +++ b/runtime/src/gpu/amd/Makefile.include @@ -21,5 +21,6 @@ GPU_SUBDIR = src/gpu/amd GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR) ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c +ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share diff --git a/runtime/src/gpu/amd/Makefile.share b/runtime/src/gpu/amd/Makefile.share index e8ad87a160ab..e8b6db7040d9 100644 --- a/runtime/src/gpu/amd/Makefile.share +++ b/runtime/src/gpu/amd/Makefile.share @@ -15,8 +15,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -SRCS = gpu-amd.c +GPU_SRCS = gpu-amd.c gpu-amd-reduce.cc -GPU_SRCS = gpu-amd.c +SRCS = $(GPU_SRCS) -GPU_OBJS = $(GPU_SRCS:%.c=$(GPU_OBJDIR)/%.o) +GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) + +RUNTIME_CXXFLAGS += -x hip + +$(RUNTIME_OBJ_DIR)/gpu-amd-reduce.o: gpu-amd-reduce.cc \ + $(RUNTIME_OBJ_DIR_STAMP) + $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc new file mode 100644 index 000000000000..3cbad5b629b4 --- /dev/null +++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc @@ -0,0 +1,78 @@ +/* + * Copyright 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright 2004-2019 Cray Inc. + * Other additional copyright holders may be indicated within. * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
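
To make the indirection above concrete: GPU_REDUCE and GPU_IMPL_REDUCE are X-macro lists, so each use stamps out one declaration or definition per supported element type. For example, GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum) in chpl-gpu.h and GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, sum) in chpl-gpu-impl.h expand to roughly the following prototypes (only two of the ten element types shown here):

    #include <stdint.h>

    /* user-facing runtime interface, no stream argument */
    void chpl_gpu_sum_reduce_int8_t(int8_t* data, int n, int8_t* val);
    void chpl_gpu_sum_reduce_int16_t(int16_t* data, int n, int16_t* val);
    /* ... int32_t through uint64_t, float, double ... */

    /* per-vendor implementation interface, takes the stream to run on */
    void chpl_gpu_impl_sum_reduce_int8_t(int8_t* data, int n, int8_t* val,
                                         void* stream);
    void chpl_gpu_impl_sum_reduce_int16_t(int16_t* data, int n, int16_t* val,
                                          void* stream);

These are exactly the names that the module code's getExternFuncName helper assembles ("chpl_gpu_" + op + "_reduce_" + C type name), which is how the Chapel-side gpu*Reduce wrappers bind to the generated C symbols.
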
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAS_GPU_LOCALE + +#include +#include + +#include "chpl-gpu.h" +#include "chpl-gpu-impl.h" +#include "gpu/chpl-gpu-reduce-util.h" + +#if ROCM_VERSION_MAJOR >= 5 +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ + hipDeviceptr_t result; \ + ROCM_CALL(hipMalloc(&result, sizeof(data_type))); \ + void* temp = NULL; \ + size_t temp_bytes = 0; \ + rocmprim::reduce(temp, temp_bytes, data, (data_type*)result, n,\ + rocmprim::impl_kind,\ + (hipStream_t)stream); \ + ROCM_CALL(hipMalloc(((CUdeviceptr*)&temp), temp_bytes)); \ + rocmprim::reduce(temp, temp_bytes, data, (data_type*)result, n,\ + rocmprim::impl_kind,\ + (hipStream_t)stream); \ + ROCM_CALL(hipMemcpyDtoHAsync(val, result, sizeof(data_type),\ + (CUstream)stream)); \ +} +#else +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ + chpl_internal_error("Reduction is not supported with AMD GPUs using ROCm version <5\n");\ +} +#endif + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, plus, sum) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, minimum, min) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, maximum, max) + +#undef DEF_ONE_REDUCE_RET_VAL + +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + chpl_internal_error(chpl_kind # " reduction is not supported with AMD GPUs\n");\ +} + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, unknown, minloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, unknown, maxloc) + +#undef DEF_ONE_REDUCE_RET_VAL_IDX + +#undef DEF_REDUCE + +#endif // HAS_GPU_LOCALE + From ab9b47519307db27b7d053538060a27816f217bf Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 16 Oct 2023 17:08:44 -0700 Subject: [PATCH 17/42] Add a compilerError in the module code Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 566d9461b473..5d81627b1f3b 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -426,6 +426,9 @@ module GPU return val; } else { + if CHPL_GPU == "amd" { + compilerError(op + " reduction is not supported on AMD GPUs"); + } var idx: int(32); var val: t; extern externFunc proc reduce_fn(data, size, ref val, ref idx); From 5ab3e5e55714565a01fa8e156ddf7987cce730c0 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 18 Oct 2023 17:39:39 -0500 Subject: [PATCH 18/42] Test/limit usage on AMD Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 6 +-- runtime/src/gpu/amd/Makefile.share | 5 +- runtime/src/gpu/amd/gpu-amd-reduce.cc | 64 +++++++++++++++++-------- runtime/src/gpu/amd/gpu-amd.c | 7 +-- test/gpu/native/reduction/basic.chpl | 7 ++- util/chplenv/compile_link_args_utils.py | 8 ++++ 6 files changed, 66 insertions(+), 31 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 5d81627b1f3b..e09d7f7fbac7 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -395,6 +395,9 @@ module GPU // ============================ private inline proc doGpuReduce(param op: string, ref A: [] ?t) { + if CHPL_GPU == "amd" 
then + compilerError("gpu*Reduce functions are not supported on AMD GPUs"); + proc chplTypeToCTypeName(type t) param { select t { when int(8) do return "int8_t"; @@ -426,9 +429,6 @@ module GPU return val; } else { - if CHPL_GPU == "amd" { - compilerError(op + " reduction is not supported on AMD GPUs"); - } var idx: int(32); var val: t; extern externFunc proc reduce_fn(data, size, ref val, ref idx); diff --git a/runtime/src/gpu/amd/Makefile.share b/runtime/src/gpu/amd/Makefile.share index e8b6db7040d9..79f63b4497f0 100644 --- a/runtime/src/gpu/amd/Makefile.share +++ b/runtime/src/gpu/amd/Makefile.share @@ -15,7 +15,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -GPU_SRCS = gpu-amd.c gpu-amd-reduce.cc +#GPU_SRCS = gpu-amd.c gpu-amd-reduce.cc +GPU_SRCS = gpu-amd-reduce.cc gpu-amd.c SRCS = $(GPU_SRCS) @@ -25,4 +26,4 @@ RUNTIME_CXXFLAGS += -x hip $(RUNTIME_OBJ_DIR)/gpu-amd-reduce.o: gpu-amd-reduce.cc \ $(RUNTIME_OBJ_DIR_STAMP) - $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< + $(CXX) -c -std=c++14 $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc index 3cbad5b629b4..68aa83f7619b 100644 --- a/runtime/src/gpu/amd/gpu-amd-reduce.cc +++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc @@ -19,31 +19,45 @@ #ifdef HAS_GPU_LOCALE +/* TODO uncomment these when the implementations are in #include -#include +#include +#include +*/ #include "chpl-gpu.h" #include "chpl-gpu-impl.h" #include "gpu/chpl-gpu-reduce-util.h" +#include "gpu/amd/util.h" -#if ROCM_VERSION_MAJOR >= 5 +// Engin: I can't get neither hipCUB nor rocprim to work. (hipCUB is a light +// wrapper around rocprim anyways). I filed +// https://github.com/ROCmSoftwarePlatform/hipCUB/issues/304, but I don't know +// if/when I'll hear back something. For now, I am merging the code that's +// supposed to work but doesn't instead of removing them from my branch. +#if 1 #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val,\ void* stream) {\ - hipDeviceptr_t result; \ - ROCM_CALL(hipMalloc(&result, sizeof(data_type))); \ + chpl_internal_error("This function shouldn't have been called. 
Reduction is not supported with AMD GPUs\n");\ +} +#elif ROCM_VERSION_MAJOR >= 5 +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ + data_type* result; \ + ROCM_CALL(hipMalloc(&result, sizeof(data_type)));\ void* temp = NULL; \ size_t temp_bytes = 0; \ - rocmprim::reduce(temp, temp_bytes, data, (data_type*)result, n,\ - rocmprim::impl_kind,\ - (hipStream_t)stream); \ - ROCM_CALL(hipMalloc(((CUdeviceptr*)&temp), temp_bytes)); \ - rocmprim::reduce(temp, temp_bytes, data, (data_type*)result, n,\ - rocmprim::impl_kind,\ - (hipStream_t)stream); \ + ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\ + 0, true));\ + ROCM_CALL(hipMalloc(((hipDeviceptr_t*)&temp), temp_bytes)); \ + ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\ + 0, true));\ ROCM_CALL(hipMemcpyDtoHAsync(val, result, sizeof(data_type),\ - (CUstream)stream)); \ + (hipStream_t)stream)); \ } #else #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ @@ -52,23 +66,35 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ void* stream) {\ chpl_internal_error("Reduction is not supported with AMD GPUs using ROCm version <5\n");\ } -#endif +#endif // 1 -GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, plus, sum) -GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, minimum, min) -GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, maximum, max) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) #undef DEF_ONE_REDUCE_RET_VAL +#if 1 +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\ +} +#else #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream) {\ - chpl_internal_error(chpl_kind # " reduction is not supported with AMD GPUs\n");\ + // TODO I don't know any other specific issues with these versions. Should be + // able to whip up the implementation quickly once we figure out what's going + // wrong here. 
+ chpl_internal_error("Unimplemented"); } +#endif // 1 -GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, unknown, minloc) -GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, unknown, maxloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) #undef DEF_ONE_REDUCE_RET_VAL_IDX diff --git a/runtime/src/gpu/amd/gpu-amd.c b/runtime/src/gpu/amd/gpu-amd.c index b3b54752d5b7..475e8ccaaeb2 100644 --- a/runtime/src/gpu/amd/gpu-amd.c +++ b/runtime/src/gpu/amd/gpu-amd.c @@ -29,6 +29,7 @@ #include "chplcgfns.h" #include "chpl-env-gen.h" #include "chpl-linefile-support.h" +#include "gpu/amd/util.h" #include @@ -37,7 +38,7 @@ #include #include -static void chpl_gpu_rocm_check(int err, const char* file, int line) { +void chpl_gpu_rocm_check(int err, const char* file, int line) { if(err == hipErrorContextAlreadyInUse) { return; } if(err != hipSuccess) { const int msg_len = 256; @@ -51,10 +52,6 @@ static void chpl_gpu_rocm_check(int err, const char* file, int line) { } } -#define ROCM_CALL(call) do {\ - chpl_gpu_rocm_check((int)call, __FILE__, __LINE__);\ -} while(0); - static inline void* chpl_gpu_load_module(const char* fatbin_data) { hipModule_t rocm_module; diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/reduction/basic.chpl index 5a1ceef66952..21fe1bb56d50 100644 --- a/test/gpu/native/reduction/basic.chpl +++ b/test/gpu/native/reduction/basic.chpl @@ -1,4 +1,5 @@ use GPU; +use ChplConfig; config const n = 100; @@ -26,8 +27,10 @@ proc testType(type t) { test("sum", t); test("min", t); test("max", t); - test("minloc", t); - test("maxloc", t); + if CHPL_GPU != "amd" { + test("minloc", t); + test("maxloc", t); + } writeln(); } diff --git a/util/chplenv/compile_link_args_utils.py b/util/chplenv/compile_link_args_utils.py index fc9b325da3df..b1b46bd05d26 100644 --- a/util/chplenv/compile_link_args_utils.py +++ b/util/chplenv/compile_link_args_utils.py @@ -69,6 +69,14 @@ def get_runtime_includes_and_defines(): system.append("-isystem" + os.path.join(sdk_path, "hip", "include")) system.append("-isystem" + os.path.join(sdk_path, "hsa", "include")) + + # We need runtime to use the lld that ships with the rocm + # installation so that we can properly link hipCUB. 
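
For context on what the currently-disabled hipCUB branch is aiming for once the linked issue is resolved: hipCUB advertises a CUB-compatible DeviceReduce interface, so the enabled path should end up mirroring the NVIDIA wrapper almost line for line. The following is a rough, unverified sketch only (HIP runtime API, simplified error handling, illustrative function name); as the comments above note, this is the code path that does not yet build and link in the Chapel runtime environment.

    #include <hip/hip_runtime.h>
    #include <hipcub/hipcub.hpp>

    int sumOnDeviceHip(const int* d_in, int num_items, hipStream_t stream) {
      int* d_out = nullptr;
      hipMalloc((void**)&d_out, sizeof(int));

      void* d_temp = nullptr;
      size_t temp_bytes = 0;
      // same two-phase protocol as CUB: size query first, then the reduction
      hipcub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items, stream);
      hipMalloc(&d_temp, temp_bytes);
      hipcub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items, stream);

      int result = 0;
      hipMemcpyAsync(&result, d_out, sizeof(int), hipMemcpyDeviceToHost, stream);
      hipStreamSynchronize(stream);
      hipFree(d_temp);
      hipFree(d_out);
      return result;
    }
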
I believe this + # requirement is coming from using "-x hip" with clang + lld_path = os.path.join(sdk_path, "llvm/bin") + system.append("-B " + lld_path) + bundled.append("-B " + lld_path) + if mem == "jemalloc": # set -DCHPL_JEMALLOC_PREFIX=chpl_je_ # this is needed since it affects code inside of headers From 0819e3110d8f7b013956723d1981f0d39a3cf68a Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 18 Oct 2023 16:42:18 -0700 Subject: [PATCH 19/42] Fix an issue, add perf test Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 16 ++++---- .../native/reduction/reduceThroughput.chpl | 39 +++++++++++++++++++ 2 files changed, 47 insertions(+), 8 deletions(-) create mode 100644 test/gpu/native/reduction/reduceThroughput.chpl diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index e09d7f7fbac7..20a299904176 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -394,7 +394,7 @@ module GPU // Reductions // ============================ - private inline proc doGpuReduce(param op: string, ref A: [] ?t) { + private inline proc doGpuReduce(param op: string, const ref A: [] ?t) { if CHPL_GPU == "amd" then compilerError("gpu*Reduce functions are not supported on AMD GPUs"); @@ -425,22 +425,22 @@ module GPU if op == "sum" || op == "min" || op == "max" { var val: t; extern externFunc proc reduce_fn(data, size, ref val); - reduce_fn(c_ptrTo(A), A.size, val); + reduce_fn(c_ptrToConst(A), A.size, val); return val; } else { var idx: int(32); var val: t; extern externFunc proc reduce_fn(data, size, ref val, ref idx); - reduce_fn(c_ptrTo(A), A.size, val, idx); + reduce_fn(c_ptrToConst(A), A.size, val, idx); return (idx, val); } } - inline proc gpuSumReduce(ref A: [] ?t) do return doGpuReduce("sum", A); - inline proc gpuMinReduce(ref A: [] ?t) do return doGpuReduce("min", A); - inline proc gpuMaxReduce(ref A: [] ?t) do return doGpuReduce("max", A); - inline proc gpuMinLocReduce(ref A: [] ?t) do return doGpuReduce("minloc", A); - inline proc gpuMaxLocReduce(ref A: [] ?t) do return doGpuReduce("maxloc", A); + inline proc gpuSumReduce(const ref A: [] ?t) do return doGpuReduce("sum", A); + inline proc gpuMinReduce(const ref A: [] ?t) do return doGpuReduce("min", A); + inline proc gpuMaxReduce(const ref A: [] ?t) do return doGpuReduce("max", A); + inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A); + inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A); } diff --git a/test/gpu/native/reduction/reduceThroughput.chpl b/test/gpu/native/reduction/reduceThroughput.chpl new file mode 100644 index 000000000000..2fe0a196dd65 --- /dev/null +++ b/test/gpu/native/reduction/reduceThroughput.chpl @@ -0,0 +1,39 @@ +use GPU; +use Time; +use ChplConfig; + +config type dataType = int; +config param useGpu = CHPL_LOCALE_MODEL == "gpu"; + +config const n = 100; +config const reportPerf = true; + +var t: stopwatch; + +inline proc doSumReduce(const ref Arr) { + if useGpu then + return gpuSumReduce(Arr); + else { + return + reduce Arr; + } +} + +writeln("Using ", if useGpu then "gpu" else "cpu"); + +on if useGpu then here.gpus[0] else here { + var Arr: [0.. 
Date: Wed, 25 Oct 2023 14:46:10 -0700 Subject: [PATCH 20/42] Initial attempt to do multi-chunk reduction Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 78 ++++++++++++++++++++++++++-- runtime/src/chpl-gpu.c | 4 +- test/gpu/native/reduction/basic.chpl | 6 +-- 3 files changed, 77 insertions(+), 11 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 20a299904176..ec0bec0b8f9e 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -394,6 +394,9 @@ module GPU // Reductions // ============================ + @chpldoc.nodoc + config param gpuDebugReduce = false; + private inline proc doGpuReduce(param op: string, const ref A: [] ?t) { if CHPL_GPU == "amd" then compilerError("gpu*Reduce functions are not supported on AMD GPUs"); @@ -418,22 +421,87 @@ module GPU return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); } + inline proc subReduceVal(param op, ref accum: ?t, val: t) { + select op { + when "sum" do accum += val; + when "min" do accum = min(accum, val); + when "max" do accum = max(accum, val); + } + } + + inline proc subReduceValIdx(param op, const baseOffset, ref accum: ?t, + val: t) { + select op { + when "minloc" do + if accum[1] > val[1] then accum = val; + when "maxloc" do + if accum[1] < val[1] then accum = val; + } + + accum[0] += baseOffset; + } + + iter offsetsThatCanFitIn32Bits(size: int) { + use Math only divCeil; + const numChunks = divCeil(size, max(int(32))); + const standardChunkSize = divCeil(size, numChunks); + + if gpuDebugReduce then + writeln("Will use ", numChunks, " chunks of size ", standardChunkSize); + + foreach chunk in 0.. Date: Wed, 25 Oct 2023 16:14:39 -0700 Subject: [PATCH 21/42] Add a new test and fix an issue exposed by it Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 16 +++++++++++++--- test/gpu/native/reduction/largeArrays.chpl | 19 +++++++++++++++++++ .../gpu/native/reduction/largeArrays.execopts | 6 ++++++ test/gpu/native/reduction/largeArrays.good | 1 + 4 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 test/gpu/native/reduction/largeArrays.chpl create mode 100644 test/gpu/native/reduction/largeArrays.execopts create mode 100644 test/gpu/native/reduction/largeArrays.good diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index ec0bec0b8f9e..27ab172bdefa 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -442,8 +442,14 @@ module GPU } iter offsetsThatCanFitIn32Bits(size: int) { + // Engin: I've tried to get max(int(32)) to work as this bug is about CUB + // using `int` as the size in the interface. However, getting close to + // max(int(32)) also triggers the bug. So, I am choosing this as a + // round/safe value for the time being. 
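
To put numbers on the chunk-size choice discussed here: with chunkSize = 2,000,000,000, an array of 4,294,967,295 elements is split into divCeil(size, chunkSize) = 3 chunks of divCeil(size, 3) = 1,431,655,765 elements each, so every per-chunk CUB call stays well below the 32-bit int limit on its length parameter. A small host-side sketch of the same offset/length computation follows; it is C++ purely for illustration (the real logic is the Chapel iterator in this patch), and chunkOffsets is an illustrative name.

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // (start, length) pairs whose lengths always fit in a 32-bit int
    std::vector<std::pair<int64_t, int64_t>>
    chunkOffsets(int64_t size, int64_t chunkSize = 2000000000) {
      const int64_t numChunks = (size + chunkSize - 1) / chunkSize;  // divCeil
      const int64_t standard  = (size + numChunks - 1) / numChunks;  // divCeil
      std::vector<std::pair<int64_t, int64_t>> out;
      for (int64_t c = 0; c < numChunks; c++) {
        const int64_t start = c * standard;
        // every chunk has the standard size except possibly the last,
        // whose length is size - start
        out.push_back({start, std::min(standard, size - start)});
      }
      return out;
    }
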
+ param chunkSize = 2_000_000_000; + use Math only divCeil; - const numChunks = divCeil(size, max(int(32))); + const numChunks = divCeil(size, chunkSize); const standardChunkSize = divCeil(size, numChunks); if gpuDebugReduce then @@ -453,9 +459,9 @@ module GPU const start = chunk*standardChunkSize; const curChunkSize = if start+standardChunkSize <= size then standardChunkSize - else size-standardChunkSize; + else size-start; if gpuDebugReduce then - writef("Chunk %i: (start=%i, curChunkSize=%i)", chunk, start, + writef("Chunk %i: (start=%i, curChunkSize=%i) ", chunk, start, curChunkSize); yield (start, curChunkSize); @@ -479,6 +485,8 @@ module GPU for (offset,size) in offsetsThatCanFitIn32Bits(A.size) { var curVal: t; reduce_fn(basePtr+offset, size, curVal); + if gpuDebugReduce then + writef(" (curVal=%i)\n", curVal); subReduceVal(op, val, curVal); } @@ -499,6 +507,8 @@ module GPU var curVal: t; reduce_fn(basePtr+offset, size, curVal, curIdx); subReduceValIdx(op, offset, ret, (curIdx, curVal)); + if gpuDebugReduce then + writef(" (curIdx=%i curVal=%i)\n", curIdx, curVal); } return ret; diff --git a/test/gpu/native/reduction/largeArrays.chpl b/test/gpu/native/reduction/largeArrays.chpl new file mode 100644 index 000000000000..5a7924d2961f --- /dev/null +++ b/test/gpu/native/reduction/largeArrays.chpl @@ -0,0 +1,19 @@ +use GPU; + +config const printResult = false; +config const n = 100; + +var result: uint(8); +on here.gpus[0] { + var Arr: [0..#n] uint(8) = 1; + + result = gpuSumReduce(Arr); +} + +if printResult then writeln("Result: ", result); + +// it is all 1's initially. It'll certainly overflow and the remainder will be +// the expected return value +const expected = n%(max(uint(8))+1); +if result != expected then + writef("Invalid result. Expected %u, actual %u\n", expected, result); diff --git a/test/gpu/native/reduction/largeArrays.execopts b/test/gpu/native/reduction/largeArrays.execopts new file mode 100644 index 000000000000..0af28add7fca --- /dev/null +++ b/test/gpu/native/reduction/largeArrays.execopts @@ -0,0 +1,6 @@ +--n=2_000_000_000 +--n=2_000_000_001 +--n=2_147_483_647 +--n=4_294_967_293 +--n=4_294_967_294 +--n=4_294_967_295 diff --git a/test/gpu/native/reduction/largeArrays.good b/test/gpu/native/reduction/largeArrays.good new file mode 100644 index 000000000000..abf41f07fd48 --- /dev/null +++ b/test/gpu/native/reduction/largeArrays.good @@ -0,0 +1 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. 
To suppress this warning, compile with --no-checks explicitly From 06be1a80cff3aaf55fdf93b09ed48a9916705168 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 25 Oct 2023 16:31:01 -0700 Subject: [PATCH 22/42] Add new test Signed-off-by: Engin Kayraklioglu --- .../native/reduction/largeArraysMinMax.chpl | 27 +++++++++++++++++++ .../reduction/largeArraysMinMax.execopts | 7 +++++ .../native/reduction/largeArraysMinMax.good | 1 + 3 files changed, 35 insertions(+) create mode 100644 test/gpu/native/reduction/largeArraysMinMax.chpl create mode 100644 test/gpu/native/reduction/largeArraysMinMax.execopts create mode 100644 test/gpu/native/reduction/largeArraysMinMax.good diff --git a/test/gpu/native/reduction/largeArraysMinMax.chpl b/test/gpu/native/reduction/largeArraysMinMax.chpl new file mode 100644 index 000000000000..99566d86e3bc --- /dev/null +++ b/test/gpu/native/reduction/largeArraysMinMax.chpl @@ -0,0 +1,27 @@ + +use GPU; + +config const kind = "min"; +const isMin = kind=="min"; +assert(isMin || kind=="max"); + +config const n = 2*max(int(32)); +config const setIdx = n-1; +assert(n>setIdx); + +config const printResult = false; +const expected = if isMin then 7:uint(8) else 13:uint(8); + +var result: uint(8); +on here.gpus[0] { + var Arr: [0..#n] uint(8) = 10; + + Arr[setIdx] = expected; + + result = if isMin then gpuMinReduce(Arr) else gpuMaxReduce(Arr); +} + +if printResult then writeln("Result: ", result); + +if result != expected then + writef("Invalid result. Expected %u, actual %u\n", expected, result); diff --git a/test/gpu/native/reduction/largeArraysMinMax.execopts b/test/gpu/native/reduction/largeArraysMinMax.execopts new file mode 100644 index 000000000000..2742bc714a67 --- /dev/null +++ b/test/gpu/native/reduction/largeArraysMinMax.execopts @@ -0,0 +1,7 @@ + #largeArraysMinMax +--setIdx=0 +--setIdx=1_999_999_999 +--setIdx=2_000_000_000 +--setIdx=2_000_000_001 +--setIdx=2_147_483_647 + diff --git a/test/gpu/native/reduction/largeArraysMinMax.good b/test/gpu/native/reduction/largeArraysMinMax.good new file mode 100644 index 000000000000..abf41f07fd48 --- /dev/null +++ b/test/gpu/native/reduction/largeArraysMinMax.good @@ -0,0 +1 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly From d012a9b466896ca1f9c4b20fa3fe08073daf28cd Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 25 Oct 2023 17:02:36 -0700 Subject: [PATCH 23/42] Expand test to minloc,maxloc. 
Fix a bug Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 8 +++---- .../native/reduction/largeArraysMinMax.chpl | 23 +++++++++++++------ .../reduction/largeArraysMinMax.compopts | 2 ++ 3 files changed, 21 insertions(+), 12 deletions(-) create mode 100644 test/gpu/native/reduction/largeArraysMinMax.compopts diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 27ab172bdefa..ba2b01b2300b 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -433,12 +433,10 @@ module GPU val: t) { select op { when "minloc" do - if accum[1] > val[1] then accum = val; + if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]); when "maxloc" do - if accum[1] < val[1] then accum = val; + if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]); } - - accum[0] += baseOffset; } iter offsetsThatCanFitIn32Bits(size: int) { @@ -508,7 +506,7 @@ module GPU reduce_fn(basePtr+offset, size, curVal, curIdx); subReduceValIdx(op, offset, ret, (curIdx, curVal)); if gpuDebugReduce then - writef(" (curIdx=%i curVal=%i)\n", curIdx, curVal); + writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret); } return ret; diff --git a/test/gpu/native/reduction/largeArraysMinMax.chpl b/test/gpu/native/reduction/largeArraysMinMax.chpl index 99566d86e3bc..dc660bc1d0f7 100644 --- a/test/gpu/native/reduction/largeArraysMinMax.chpl +++ b/test/gpu/native/reduction/largeArraysMinMax.chpl @@ -1,6 +1,7 @@ - use GPU; +config param withLoc = false; + config const kind = "min"; const isMin = kind=="min"; assert(isMin || kind=="max"); @@ -10,18 +11,26 @@ config const setIdx = n-1; assert(n>setIdx); config const printResult = false; -const expected = if isMin then 7:uint(8) else 13:uint(8); -var result: uint(8); +const expectedVal = if isMin then 7:uint(8) else 13:uint(8); +const expected = if withLoc then (setIdx, expectedVal) else expectedVal; + +inline proc doReduce(Arr) { + if withLoc then + return if isMin then gpuMinLocReduce(Arr) else gpuMaxLocReduce(Arr); + else + return if isMin then gpuMinReduce(Arr) else gpuMaxReduce(Arr); +} + +var result: if withLoc then (int, uint(8)) else uint(8); on here.gpus[0] { var Arr: [0..#n] uint(8) = 10; + Arr[setIdx] = expectedVal; - Arr[setIdx] = expected; - - result = if isMin then gpuMinReduce(Arr) else gpuMaxReduce(Arr); + result = doReduce(Arr); } if printResult then writeln("Result: ", result); if result != expected then - writef("Invalid result. Expected %u, actual %u\n", expected, result); + writef("Invalid result. Expected %?, actual %?\n", expected, result); diff --git a/test/gpu/native/reduction/largeArraysMinMax.compopts b/test/gpu/native/reduction/largeArraysMinMax.compopts new file mode 100644 index 000000000000..087ba6a454e9 --- /dev/null +++ b/test/gpu/native/reduction/largeArraysMinMax.compopts @@ -0,0 +1,2 @@ +-swithLoc=false +-swithLoc=true From e85a00ceb4f994d255e6c1d34d83f92ed46d21c4 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 25 Oct 2023 17:28:18 -0700 Subject: [PATCH 24/42] Make the new functions work with cpu-as-device. 
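
The bug fix in this patch is easiest to see from the host side: each per-chunk call returns an index relative to the start of its chunk, so the chunk's base offset must be folded in when (and only when) that chunk's candidate wins the comparison, rather than being added to the running accumulator unconditionally. Below is a simplified sketch of that combination step (non-empty array assumed, fixed-size chunks for brevity), with a plain host loop standing in for the generated chpl_gpu_minloc_reduce_* call; argMinChunk and argMinAll are illustrative names, not runtime symbols.

    #include <algorithm>
    #include <cstdint>
    #include <utility>

    // stand-in for the per-chunk GPU reduction (cub::DeviceReduce::ArgMin in
    // the real runtime): returns (chunk-local index, value) of the minimum
    static std::pair<int, int64_t> argMinChunk(const int64_t* data, int n) {
      std::pair<int, int64_t> best{0, data[0]};            // assumes n >= 1
      for (int i = 1; i < n; i++)
        if (data[i] < best.second) best = {i, data[i]};
      return best;
    }

    // chunkSize is assumed <= INT32_MAX so each length fits the int parameter
    std::pair<int64_t, int64_t> argMinAll(const int64_t* data, int64_t size,
                                          int64_t chunkSize) {
      std::pair<int64_t, int64_t> best{0, INT64_MAX};
      for (int64_t start = 0; start < size; start += chunkSize) {
        const int64_t len = std::min(chunkSize, size - start);
        auto [localIdx, val] = argMinChunk(data + start, (int)len);
        if (val < best.second)
          best = {start + localIdx, val};  // shift by the chunk's base offset here
      }
      return best;
    }
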
Add skipifs Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 23 ++++++++++++++- runtime/src/gpu/cpu/gpu-cpu.c | 29 +++++++++++++++++++ test/gpu/native/reduction/SKIPIF | 2 ++ test/gpu/native/reduction/largeArrays.skipif | 4 +++ .../native/reduction/largeArraysMinMax.skipif | 4 +++ 5 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 test/gpu/native/reduction/SKIPIF create mode 100644 test/gpu/native/reduction/largeArrays.skipif create mode 100644 test/gpu/native/reduction/largeArraysMinMax.skipif diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index ba2b01b2300b..f866c779e814 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -398,8 +398,29 @@ module GPU config param gpuDebugReduce = false; private inline proc doGpuReduce(param op: string, const ref A: [] ?t) { - if CHPL_GPU == "amd" then + if op != "sum" && op != "min" && op != "max" && + op != "minloc" && op != "maxloc" { + + compilerError("Unexpected reduction kind in doGpuReduce: ", op); + } + + + if CHPL_GPU == "amd" { compilerError("gpu*Reduce functions are not supported on AMD GPUs"); + } + else if CHPL_GPU == "cpu" { + select op { + when "sum" do return + reduce A; + when "min" do return min reduce A; + when "max" do return max reduce A; + when "minloc" do return minloc reduce zip (A.domain, A); + when "maxloc" do return maxloc reduce zip (A.domain, A); + } + } + else { + compilerAssert(CHPL_GPU=="nvidia"); + } + proc chplTypeToCTypeName(type t) param { select t { diff --git a/runtime/src/gpu/cpu/gpu-cpu.c b/runtime/src/gpu/cpu/gpu-cpu.c index e76f0dcb359d..a3ae1754e57c 100644 --- a/runtime/src/gpu/cpu/gpu-cpu.c +++ b/runtime/src/gpu/cpu/gpu-cpu.c @@ -157,4 +157,33 @@ bool chpl_gpu_impl_stream_ready(void* stream) { void chpl_gpu_impl_stream_synchronize(void* stream) { } +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ + chpl_internal_error("This function shouldn't have been called. "\ + "cpu-as-device mode should handle reductions in "\ + "the module code\n");\ +} + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) + +#undef DEF_ONE_REDUCE_RET_VAL + +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + chpl_internal_error("This function shouldn't have been called. "\ + "cpu-as-device mode should handle reductions in "\ + "the module code\n");\ +} + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) + +#undef DEF_ONE_REDUCE_RET_VAL_IDX + #endif // HAS_GPU_LOCALE diff --git a/test/gpu/native/reduction/SKIPIF b/test/gpu/native/reduction/SKIPIF new file mode 100644 index 000000000000..75268eb04f42 --- /dev/null +++ b/test/gpu/native/reduction/SKIPIF @@ -0,0 +1,2 @@ +# Reductions are not supported with AMD gpus yet. +CHPL_GPU==amd diff --git a/test/gpu/native/reduction/largeArrays.skipif b/test/gpu/native/reduction/largeArrays.skipif new file mode 100644 index 000000000000..dc81184aa7f6 --- /dev/null +++ b/test/gpu/native/reduction/largeArrays.skipif @@ -0,0 +1,4 @@ +# this test is to exercise a code path that will never be run with +# cpu-as-device. Moreover, the large size makes this mode take too long to +# finish. 
So, I am skipping this test. +CHPL_GPU==cpu diff --git a/test/gpu/native/reduction/largeArraysMinMax.skipif b/test/gpu/native/reduction/largeArraysMinMax.skipif new file mode 100644 index 000000000000..dc81184aa7f6 --- /dev/null +++ b/test/gpu/native/reduction/largeArraysMinMax.skipif @@ -0,0 +1,4 @@ +# this test is to exercise a code path that will never be run with +# cpu-as-device. Moreover, the large size makes this mode take too long to +# finish. So, I am skipping this test. +CHPL_GPU==cpu From 186ad540ef148c5867d231dbadbad22845aee170 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 25 Oct 2023 20:44:30 -0700 Subject: [PATCH 25/42] Add the missing good file Signed-off-by: Engin Kayraklioglu --- test/gpu/native/reduction/reduceThroughput.good | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 test/gpu/native/reduction/reduceThroughput.good diff --git a/test/gpu/native/reduction/reduceThroughput.good b/test/gpu/native/reduction/reduceThroughput.good new file mode 100644 index 000000000000..4b12519e9850 --- /dev/null +++ b/test/gpu/native/reduction/reduceThroughput.good @@ -0,0 +1,3 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly +Using gpu +4950 From 4b6953f2116af377adb81b9c1e27294027693ba4 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 25 Oct 2023 21:15:14 -0700 Subject: [PATCH 26/42] Remove a trailing whitespace Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/amd/Makefile.share | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/runtime/src/gpu/amd/Makefile.share b/runtime/src/gpu/amd/Makefile.share index 79f63b4497f0..410b4f724e82 100644 --- a/runtime/src/gpu/amd/Makefile.share +++ b/runtime/src/gpu/amd/Makefile.share @@ -15,8 +15,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#GPU_SRCS = gpu-amd.c gpu-amd-reduce.cc -GPU_SRCS = gpu-amd-reduce.cc gpu-amd.c +GPU_SRCS = gpu-amd-reduce.cc gpu-amd.c SRCS = $(GPU_SRCS) From aca38c46ab9dd3c0052c62e47007bf06318fc1db Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 26 Oct 2023 09:52:14 -0700 Subject: [PATCH 27/42] Revert some of the AMD changes Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/amd/gpu-amd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/runtime/src/gpu/amd/gpu-amd.c b/runtime/src/gpu/amd/gpu-amd.c index 475e8ccaaeb2..b3b54752d5b7 100644 --- a/runtime/src/gpu/amd/gpu-amd.c +++ b/runtime/src/gpu/amd/gpu-amd.c @@ -29,7 +29,6 @@ #include "chplcgfns.h" #include "chpl-env-gen.h" #include "chpl-linefile-support.h" -#include "gpu/amd/util.h" #include @@ -38,7 +37,7 @@ #include #include -void chpl_gpu_rocm_check(int err, const char* file, int line) { +static void chpl_gpu_rocm_check(int err, const char* file, int line) { if(err == hipErrorContextAlreadyInUse) { return; } if(err != hipSuccess) { const int msg_len = 256; @@ -52,6 +51,10 @@ void chpl_gpu_rocm_check(int err, const char* file, int line) { } } +#define ROCM_CALL(call) do {\ + chpl_gpu_rocm_check((int)call, __FILE__, __LINE__);\ +} while(0); + static inline void* chpl_gpu_load_module(const char* fatbin_data) { hipModule_t rocm_module; From 257a6aea6afb32f858a52862bbc49f3d8be547d2 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 26 Oct 2023 09:52:42 -0700 Subject: [PATCH 28/42] Add the missing execopts Signed-off-by: Engin Kayraklioglu --- test/gpu/native/reduction/reduceThroughput.execopts | 1 + 1 file changed, 1 insertion(+) create mode 100644 test/gpu/native/reduction/reduceThroughput.execopts diff --git a/test/gpu/native/reduction/reduceThroughput.execopts b/test/gpu/native/reduction/reduceThroughput.execopts new file mode 100644 index 000000000000..367868626392 --- /dev/null +++ b/test/gpu/native/reduction/reduceThroughput.execopts @@ -0,0 +1 @@ +--reportPerf=false From 9c49498f4064c36d3d99df0dc102035564a6b041 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 26 Oct 2023 15:49:56 -0700 Subject: [PATCH 29/42] Remove an include Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/amd/gpu-amd-reduce.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc index 68aa83f7619b..e9a39dfbc25d 100644 --- a/runtime/src/gpu/amd/gpu-amd-reduce.cc +++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc @@ -28,7 +28,6 @@ #include "chpl-gpu.h" #include "chpl-gpu-impl.h" #include "gpu/chpl-gpu-reduce-util.h" -#include "gpu/amd/util.h" // Engin: I can't get neither hipCUB nor rocprim to work. (hipCUB is a light // wrapper around rocprim anyways). 
I filed From 1627385447572afa4ba93bce6413b8fb844b0be7 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 09:21:17 -0700 Subject: [PATCH 30/42] Relocate tests into a new noAmd directory Signed-off-by: Engin Kayraklioglu --- test/gpu/native/{reduction => noAmd}/SKIPIF | 0 test/gpu/native/{ => noAmd}/reduction/basic.chpl | 0 test/gpu/native/{ => noAmd}/reduction/basic.good | 0 test/gpu/native/{ => noAmd}/reduction/largeArrays.chpl | 0 test/gpu/native/{ => noAmd}/reduction/largeArrays.execopts | 0 test/gpu/native/{ => noAmd}/reduction/largeArrays.good | 0 test/gpu/native/{ => noAmd}/reduction/largeArrays.skipif | 0 test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.chpl | 0 test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.compopts | 0 test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.execopts | 0 test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.good | 0 test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.skipif | 0 test/gpu/native/{ => noAmd}/reduction/reduceThroughput.chpl | 0 test/gpu/native/{ => noAmd}/reduction/reduceThroughput.execopts | 0 test/gpu/native/{ => noAmd}/reduction/reduceThroughput.good | 0 15 files changed, 0 insertions(+), 0 deletions(-) rename test/gpu/native/{reduction => noAmd}/SKIPIF (100%) rename test/gpu/native/{ => noAmd}/reduction/basic.chpl (100%) rename test/gpu/native/{ => noAmd}/reduction/basic.good (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArrays.chpl (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArrays.execopts (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArrays.good (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArrays.skipif (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.chpl (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.compopts (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.execopts (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.good (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.skipif (100%) rename test/gpu/native/{ => noAmd}/reduction/reduceThroughput.chpl (100%) rename test/gpu/native/{ => noAmd}/reduction/reduceThroughput.execopts (100%) rename test/gpu/native/{ => noAmd}/reduction/reduceThroughput.good (100%) diff --git a/test/gpu/native/reduction/SKIPIF b/test/gpu/native/noAmd/SKIPIF similarity index 100% rename from test/gpu/native/reduction/SKIPIF rename to test/gpu/native/noAmd/SKIPIF diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/noAmd/reduction/basic.chpl similarity index 100% rename from test/gpu/native/reduction/basic.chpl rename to test/gpu/native/noAmd/reduction/basic.chpl diff --git a/test/gpu/native/reduction/basic.good b/test/gpu/native/noAmd/reduction/basic.good similarity index 100% rename from test/gpu/native/reduction/basic.good rename to test/gpu/native/noAmd/reduction/basic.good diff --git a/test/gpu/native/reduction/largeArrays.chpl b/test/gpu/native/noAmd/reduction/largeArrays.chpl similarity index 100% rename from test/gpu/native/reduction/largeArrays.chpl rename to test/gpu/native/noAmd/reduction/largeArrays.chpl diff --git a/test/gpu/native/reduction/largeArrays.execopts b/test/gpu/native/noAmd/reduction/largeArrays.execopts similarity index 100% rename from test/gpu/native/reduction/largeArrays.execopts rename to test/gpu/native/noAmd/reduction/largeArrays.execopts diff --git a/test/gpu/native/reduction/largeArrays.good b/test/gpu/native/noAmd/reduction/largeArrays.good similarity index 100% 
rename from test/gpu/native/reduction/largeArrays.good rename to test/gpu/native/noAmd/reduction/largeArrays.good diff --git a/test/gpu/native/reduction/largeArrays.skipif b/test/gpu/native/noAmd/reduction/largeArrays.skipif similarity index 100% rename from test/gpu/native/reduction/largeArrays.skipif rename to test/gpu/native/noAmd/reduction/largeArrays.skipif diff --git a/test/gpu/native/reduction/largeArraysMinMax.chpl b/test/gpu/native/noAmd/reduction/largeArraysMinMax.chpl similarity index 100% rename from test/gpu/native/reduction/largeArraysMinMax.chpl rename to test/gpu/native/noAmd/reduction/largeArraysMinMax.chpl diff --git a/test/gpu/native/reduction/largeArraysMinMax.compopts b/test/gpu/native/noAmd/reduction/largeArraysMinMax.compopts similarity index 100% rename from test/gpu/native/reduction/largeArraysMinMax.compopts rename to test/gpu/native/noAmd/reduction/largeArraysMinMax.compopts diff --git a/test/gpu/native/reduction/largeArraysMinMax.execopts b/test/gpu/native/noAmd/reduction/largeArraysMinMax.execopts similarity index 100% rename from test/gpu/native/reduction/largeArraysMinMax.execopts rename to test/gpu/native/noAmd/reduction/largeArraysMinMax.execopts diff --git a/test/gpu/native/reduction/largeArraysMinMax.good b/test/gpu/native/noAmd/reduction/largeArraysMinMax.good similarity index 100% rename from test/gpu/native/reduction/largeArraysMinMax.good rename to test/gpu/native/noAmd/reduction/largeArraysMinMax.good diff --git a/test/gpu/native/reduction/largeArraysMinMax.skipif b/test/gpu/native/noAmd/reduction/largeArraysMinMax.skipif similarity index 100% rename from test/gpu/native/reduction/largeArraysMinMax.skipif rename to test/gpu/native/noAmd/reduction/largeArraysMinMax.skipif diff --git a/test/gpu/native/reduction/reduceThroughput.chpl b/test/gpu/native/noAmd/reduction/reduceThroughput.chpl similarity index 100% rename from test/gpu/native/reduction/reduceThroughput.chpl rename to test/gpu/native/noAmd/reduction/reduceThroughput.chpl diff --git a/test/gpu/native/reduction/reduceThroughput.execopts b/test/gpu/native/noAmd/reduction/reduceThroughput.execopts similarity index 100% rename from test/gpu/native/reduction/reduceThroughput.execopts rename to test/gpu/native/noAmd/reduction/reduceThroughput.execopts diff --git a/test/gpu/native/reduction/reduceThroughput.good b/test/gpu/native/noAmd/reduction/reduceThroughput.good similarity index 100% rename from test/gpu/native/reduction/reduceThroughput.good rename to test/gpu/native/noAmd/reduction/reduceThroughput.good From ab8a348f0032a45ed796d5db369611eaf919b290 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 09:48:24 -0700 Subject: [PATCH 31/42] Add a user facing error message for unknown types and a test to lock the behavior Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 2 ++ test/gpu/native/noAmd/reduction/stringError.chpl | 7 +++++++ test/gpu/native/noAmd/reduction/stringError.good | 5 +++++ test/gpu/native/noAmd/reduction/stringError.prediff | 4 ++++ 4 files changed, 18 insertions(+) create mode 100644 test/gpu/native/noAmd/reduction/stringError.chpl create mode 100644 test/gpu/native/noAmd/reduction/stringError.good create mode 100755 test/gpu/native/noAmd/reduction/stringError.prediff diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index f866c779e814..cfc5b1313b41 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -435,6 +435,8 @@ module GPU when real(32) do return "float"; when real(64) do return "double"; } + + 
compilerError("Arrays with ", t:string, " elements cannot be reduced"); return "unknown"; } diff --git a/test/gpu/native/noAmd/reduction/stringError.chpl b/test/gpu/native/noAmd/reduction/stringError.chpl new file mode 100644 index 000000000000..9045408aed4b --- /dev/null +++ b/test/gpu/native/noAmd/reduction/stringError.chpl @@ -0,0 +1,7 @@ +use GPU; + +on here.gpus[0] { + var Arr: [1..10] string; + + writeln(gpuSumReduce(Arr)); +} diff --git a/test/gpu/native/noAmd/reduction/stringError.good b/test/gpu/native/noAmd/reduction/stringError.good new file mode 100644 index 000000000000..b08203390ef8 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/stringError.good @@ -0,0 +1,5 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly +$CHPL_HOME/modules/standard/GPU.chpl:prediffed: In function 'doGpuReduce': +$CHPL_HOME/modules/standard/GPU.chpl:prediffed: error: Arrays with string elements cannot be reduced + $CHPL_HOME/modules/standard/GPU.chpl:prediffed: called as doGpuReduce(param op = "sum", A: [domain(1,int(64),one)] string) from function 'gpuSumReduce' + stringError.chpl:prediffed: called as gpuSumReduce(A: [domain(1,int(64),one)] string) diff --git a/test/gpu/native/noAmd/reduction/stringError.prediff b/test/gpu/native/noAmd/reduction/stringError.prediff new file mode 100755 index 000000000000..082c5199fbe7 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/stringError.prediff @@ -0,0 +1,4 @@ +#!/bin/sh + +sed -e "s/\.chpl:[0-9]\+:/.chpl:prediffed:/" $2 > $2.tmp +mv $2.tmp $2 From 7725662163bdfceccd6c6b4d07ca0dea9f5ad134 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 09:52:03 -0700 Subject: [PATCH 32/42] Add more fall-through otherwises Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index cfc5b1313b41..e31e217314b3 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -415,6 +415,7 @@ module GPU when "max" do return max reduce A; when "minloc" do return minloc reduce zip (A.domain, A); when "maxloc" do return maxloc reduce zip (A.domain, A); + otherwise do compilerError("Unknown reduction operation"); } } else { @@ -434,9 +435,9 @@ module GPU when uint(64) do return "uint64_t"; when real(32) do return "float"; when real(64) do return "double"; + otherwise do + compilerError("Arrays with ", t:string, " elements cannot be reduced"); } - - compilerError("Arrays with ", t:string, " elements cannot be reduced"); return "unknown"; } @@ -449,6 +450,7 @@ module GPU when "sum" do accum += val; when "min" do accum = min(accum, val); when "max" do accum = max(accum, val); + otherwise do compilerError("Unknown reduction type"); } } @@ -459,6 +461,7 @@ module GPU if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]); when "maxloc" do if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]); + otherwise do compilerError("Unknown reduction type"); } } From 6da77ee8958d42a4e8a9b9f0187cb404383729f1 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 10:08:33 -0700 Subject: [PATCH 33/42] Start adding documentation Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index e31e217314b3..87ddee297b6a 100644 --- 
a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -539,10 +539,48 @@ module GPU } } + /* + Perform sum-reduction on an array. The array must be allocated on + GPU-accessible memory. The function should not be called from inside a + GPU-eligible loop. Only arrays with ``int``, ``uint`` and ``real`` types + are supported. A simple example is as follows. + + .. code-block:: chapel + + on here.gpus[0] { + var Arr: [1..n] int; // will be allocated in GPU-accesible memory + ... + var sum = gpuSumReduce(Arr); + } + */ inline proc gpuSumReduce(const ref A: [] ?t) do return doGpuReduce("sum", A); + + /* + Perform min-reduction on an array. The array must be allocated on + GPU-accessible memory. The function must be called from the host. Only + arrays with ``int``, ``uint`` and ``real`` types are supported. + */ inline proc gpuMinReduce(const ref A: [] ?t) do return doGpuReduce("min", A); + + /* + Perform max-reduction on an array. The array must be allocated on + GPU-accessible memory. The function must be called from the host. Only + arrays with ``int``, ``uint`` and ``real`` types are supported. + */ inline proc gpuMaxReduce(const ref A: [] ?t) do return doGpuReduce("max", A); + + /* + Perform minloc-reduction on an array. The array must be allocated on + GPU-accessible memory. The function must be called from the host. Only + arrays with ``int``, ``uint`` and ``real`` types are supported. + */ inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A); + + /* + Perform maxloc-reduction on an array. The array must be allocated on + GPU-accessible memory. The function must be called from the host. Only + arrays with ``int``, ``uint`` and ``real`` types are supported. + */ inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A); } From 889ef7325b834106b66a811591762d02db92a25e Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 10:12:11 -0700 Subject: [PATCH 34/42] Add one more fallthrough, unify error messages Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 87ddee297b6a..4aa1d6ef7495 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -415,7 +415,7 @@ module GPU when "max" do return max reduce A; when "minloc" do return minloc reduce zip (A.domain, A); when "maxloc" do return maxloc reduce zip (A.domain, A); - otherwise do compilerError("Unknown reduction operation"); + otherwise do compilerError("Unknown reduction operation: ", op); } } else { @@ -450,7 +450,7 @@ module GPU when "sum" do accum += val; when "min" do accum = min(accum, val); when "max" do accum = max(accum, val); - otherwise do compilerError("Unknown reduction type"); + otherwise do compilerError("Unknown reduction operation: ", op); } } @@ -461,7 +461,7 @@ module GPU if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]); when "maxloc" do if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]); - otherwise do compilerError("Unknown reduction type"); + otherwise do compilerError("Unknown reduction operation: ", op); } } @@ -516,7 +516,7 @@ module GPU return val; } - else { + else if op == "minloc" || op == "maxloc" { var ret: (int, t); if op == "minloc" then ret[1] = max(t); @@ -537,6 +537,9 @@ module GPU return ret; } + else { + compilerError("Unknown reduction operation: ", op); + } } /* From 377f97dc60671eaebd5e8c079c337545024ca198 Mon Sep 17 00:00:00 2001 From: 
Engin Kayraklioglu Date: Fri, 3 Nov 2023 10:15:52 -0700 Subject: [PATCH 35/42] Free runtime memory that we were leaking before Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc index 6c9f7a350c34..6e8828cae16d 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc @@ -42,6 +42,7 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ (CUstream)stream); \ CUDA_CALL(cuMemcpyDtoHAsync(val, result, sizeof(data_type),\ (CUstream)stream)); \ + CUDA_CALL(cuMemFree(result)); \ } GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) @@ -69,6 +70,7 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ (CUstream)stream)); \ *val = result_host.value; \ *idx = result_host.key; \ + CUDA_CALL(cuMemFree(result)); \ } GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) From 1675b9e81990fd14172c3746ab65439238f43463 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 11:21:01 -0700 Subject: [PATCH 36/42] A big refactor to reduce code duplication significantly Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 100 ++++++++++---------- runtime/include/chpl-gpu-impl.h | 23 ++--- runtime/include/chpl-gpu.h | 22 ++--- runtime/src/chpl-gpu.c | 37 ++------ runtime/src/gpu/amd/gpu-amd-reduce.cc | 4 +- runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc | 5 +- 6 files changed, 79 insertions(+), 112 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 4aa1d6ef7495..63452e5b95e4 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -445,18 +445,31 @@ module GPU return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); } - inline proc subReduceVal(param op, ref accum: ?t, val: t) { - select op { - when "sum" do accum += val; - when "min" do accum = min(accum, val); - when "max" do accum = max(accum, val); - otherwise do compilerError("Unknown reduction operation: ", op); - } + proc isValReduce(param op) param { + return op=="sum" || op=="min" || op=="max"; } - inline proc subReduceValIdx(param op, const baseOffset, ref accum: ?t, - val: t) { + proc isValIdxReduce(param op) param { + return op=="minloc" || op=="maxloc"; + } + + inline proc subReduceValIdx(param op, const baseOffset, ref accum, val) { + // do some type checking to be safe + compilerAssert(isTupleValue(val)); + if isTupleValue(accum) { + compilerAssert(isValIdxReduce(op)); + compilerAssert(val[1].type == accum[1].type); + + } + else { + compilerAssert(isValReduce(op)); + compilerAssert(val[1].type == accum.type); + } + select op { + when "sum" do accum += val[1]; + when "min" do accum = min(accum, val[1]); + when "max" do accum = max(accum, val[1]); when "minloc" do if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]); when "maxloc" do @@ -494,52 +507,41 @@ module GPU use CTypes; + // find the extern function we'll use param externFunc = getExternFuncName(op, t); - - if op == "sum" || op == "min" || op == "max" { - var val: t; - if op == "min" then - val = max(t); - else if op == "max" then - val = min(t); - - extern externFunc proc reduce_fn(data, size, ref val); - - const basePtr = c_ptrToConst(A); - for (offset,size) in offsetsThatCanFitIn32Bits(A.size) { - var curVal: t; - reduce_fn(basePtr+offset, size, curVal); - if gpuDebugReduce then - writef(" (curVal=%i)\n", curVal); - subReduceVal(op, 
val, curVal); - } - - return val; + extern externFunc proc reduce_fn(data, size, ref val, ref idx); + + // initialize the return value + var ret; + if isValReduce(op) { + var retTmp: t; + if op == "min" then retTmp = max(t); + else if op == "max" then retTmp = min(t); + ret = retTmp; } - else if op == "minloc" || op == "maxloc" { - var ret: (int, t); - if op == "minloc" then - ret[1] = max(t); - else if op == "maxloc" then - ret[1] = min(t); - - extern externFunc proc reduce_fn(data, size, ref val, ref idx); - - const basePtr = c_ptrToConst(A); - for (offset,size) in offsetsThatCanFitIn32Bits(A.size) { - var curIdx: int(32); - var curVal: t; - reduce_fn(basePtr+offset, size, curVal, curIdx); - subReduceValIdx(op, offset, ret, (curIdx, curVal)); - if gpuDebugReduce then - writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret); - } - - return ret; + else if isValIdxReduce(op) { + var retTmp: (int, t); + if op == "minloc" then retTmp[1] = max(t); + else if op == "maxloc" then retTmp[1] = min(t); + ret = retTmp; } else { compilerError("Unknown reduction operation: ", op); + ret = 0; } + + // perform the reduction + const basePtr = c_ptrToConst(A); + for (offset,size) in offsetsThatCanFitIn32Bits(A.size) { + var curIdx: int(32) = -1; // should remain -1 for sum, min, max + var curVal: t; + reduce_fn(basePtr+offset, size, curVal, curIdx); + subReduceValIdx(op, offset, ret, (curIdx, curVal)); + if gpuDebugReduce then + writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret); + } + + return ret; } /* diff --git a/runtime/include/chpl-gpu-impl.h b/runtime/include/chpl-gpu-impl.h index 6c01a07aea1f..972c57a91e9c 100644 --- a/runtime/include/chpl-gpu-impl.h +++ b/runtime/include/chpl-gpu-impl.h @@ -76,26 +76,17 @@ void chpl_gpu_impl_stream_destroy(void* stream); bool chpl_gpu_impl_stream_ready(void* stream); void chpl_gpu_impl_stream_synchronize(void* stream); -#define DECL_ONE_REDUCE_IMPL_RET_VAL(chpl_kind, data_type) \ -void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val,\ - void* stream); - -GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, sum) -GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, min) -GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, max) - -#undef DECL_ONE_REDUCE_RET_VAL - -#define DECL_ONE_REDUCE_IMPL_RET_VAL_IDX(chpl_kind, data_type) \ +#define DECL_ONE_REDUCE_IMPL(chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream); +GPU_REDUCE(DECL_ONE_REDUCE_IMPL, sum) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL, min) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL, max) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL, minloc) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL, maxloc) -GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, minloc) -GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, maxloc) - -#undef DECL_ONE_REDUCE_RET_VAL_IDX +#undef DECL_ONE_REDUCE_IMPL #ifdef __cplusplus } diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index f3b5ae31dc8b..189f77583246 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -150,24 +150,18 @@ size_t chpl_gpu_get_alloc_size(void* ptr); bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); -#define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \ -void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val); - -GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum); -GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, min); -GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, max); - -#undef 
DECL_ONE_REDUCE_RET_VAL - -#define DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, data_type) \ +#define DECL_ONE_REDUCE(chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx); -GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, minloc); -GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, maxloc); +GPU_REDUCE(DECL_ONE_REDUCE, sum); +GPU_REDUCE(DECL_ONE_REDUCE, min); +GPU_REDUCE(DECL_ONE_REDUCE, max); +GPU_REDUCE(DECL_ONE_REDUCE, minloc); +GPU_REDUCE(DECL_ONE_REDUCE, maxloc); + +#undef DECL_ONE_REDUCE -#undef DECL_ONE_REDUCE_RET_VAL_IDX #endif // HAS_GPU_LOCALE diff --git a/runtime/src/chpl-gpu.c b/runtime/src/chpl-gpu.c index c0182682a33b..cdd91a599e3a 100644 --- a/runtime/src/chpl-gpu.c +++ b/runtime/src/chpl-gpu.c @@ -699,32 +699,7 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) { chpl_gpu_impl_set_peer_access(dev1, dev2, enable); } -#define DEF_ONE_REDUCE_RET_VAL(kind, data_type)\ -void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ - data_type* val) { \ - CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \ - \ - int dev = chpl_task_getRequestedSubloc(); \ - chpl_gpu_impl_use_device(dev); \ - void* stream = get_stream(dev); \ - \ - chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, stream); \ - \ - if (chpl_gpu_sync_with_host) { \ - CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \ - wait_stream(stream); \ - } \ - \ - CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \ -} - -GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, sum) -GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, min) -GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, max) - -#undef DEF_ONE_REDUCE_RET_VAL - -#define DEF_ONE_REDUCE_RET_VAL_IDX(kind, data_type)\ +#define DEF_ONE_REDUCE(kind, data_type)\ void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ data_type* val, int* idx) { \ CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \ @@ -743,10 +718,12 @@ void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \ } -GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, minloc); -GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, maxloc); - -#undef DEF_ONE_REDUCE_RET_VAL_IDX +GPU_REDUCE(DEF_ONE_REDUCE, sum) +GPU_REDUCE(DEF_ONE_REDUCE, min) +GPU_REDUCE(DEF_ONE_REDUCE, max) +GPU_REDUCE(DEF_ONE_REDUCE, minloc) +GPU_REDUCE(DEF_ONE_REDUCE, maxloc) +#undef DEF_ONE_REDUCE #endif diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc index e9a39dfbc25d..d8a3f01cbaac 100644 --- a/runtime/src/gpu/amd/gpu-amd-reduce.cc +++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc @@ -37,14 +37,14 @@ #if 1 #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val,\ + data_type* val, int* idx\ void* stream) {\ chpl_internal_error("This function shouldn't have been called. 
Reduction is not supported with AMD GPUs\n");\ } #elif ROCM_VERSION_MAJOR >= 5 #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val,\ + data_type* val, int* idx\ void* stream) {\ data_type* result; \ ROCM_CALL(hipMalloc(&result, sizeof(data_type)));\ diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc index 6e8828cae16d..daae4dce16b6 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc @@ -27,9 +27,12 @@ #include "../common/cuda-utils.h" #include "gpu/chpl-gpu-reduce-util.h" +// this version doesn't do anything with `idx`. Having a unified interface makes +// the implementation in the rest of the runtime and the modules more +// straightforward. #define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val,\ + data_type* val, int* idx,\ void* stream) {\ CUdeviceptr result; \ CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ From dd10162974e3039d29d19cac47f32e6ea1e8cef9 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 11:31:55 -0700 Subject: [PATCH 37/42] Fix a bug for non-zero-based arrays, add test Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 3 +++ .../native/noAmd/reduction/nonZeroBased.chpl | 21 +++++++++++++++++++ .../native/noAmd/reduction/nonZeroBased.good | 10 +++++++++ 3 files changed, 34 insertions(+) create mode 100644 test/gpu/native/noAmd/reduction/nonZeroBased.chpl create mode 100644 test/gpu/native/noAmd/reduction/nonZeroBased.good diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 63452e5b95e4..fa91ea3ea36f 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -541,6 +541,9 @@ module GPU writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret); } + if isValIdxReduce(op) then + ret[0] += A.domain.first; + return ret; } diff --git a/test/gpu/native/noAmd/reduction/nonZeroBased.chpl b/test/gpu/native/noAmd/reduction/nonZeroBased.chpl new file mode 100644 index 000000000000..ae337b07f240 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/nonZeroBased.chpl @@ -0,0 +1,21 @@ +use GPU; +use Random; + +proc test(ref Arr) { + fillRandom(Arr, min=0, max=100, seed=17); + + writeln(Arr); + writeln(gpuMinLocReduce(Arr)); + writeln(gpuMaxLocReduce(Arr)); +} + +on here.gpus[0] { + var A: [-20..#10] int; + test(A); + + var B: [0..#10] int; + test(B); + + var C: [10..#10] int; + test(C); +} diff --git a/test/gpu/native/noAmd/reduction/nonZeroBased.good b/test/gpu/native/noAmd/reduction/nonZeroBased.good new file mode 100644 index 000000000000..7f00b4427b24 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/nonZeroBased.good @@ -0,0 +1,10 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. 
To suppress this warning, compile with --no-checks explicitly +64 6 5 12 14 91 11 91 11 58 +(-18, 5) +(-15, 91) +64 6 5 12 14 91 11 91 11 58 +(2, 5) +(5, 91) +64 6 5 12 14 91 11 91 11 58 +(12, 5) +(15, 91) From 7f007b79c24a39338105f9c2d52c798e6b91e182 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 11:41:38 -0700 Subject: [PATCH 38/42] Remove trailing whitespaces Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 83 +++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 25 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index fa91ea3ea36f..23c2e2436485 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -547,47 +547,80 @@ module GPU return ret; } - /* - Perform sum-reduction on an array. The array must be allocated on - GPU-accessible memory. The function should not be called from inside a - GPU-eligible loop. Only arrays with ``int``, ``uint`` and ``real`` types - are supported. A simple example is as follows. - + /* + Add all elements of an array together on the GPU (that is, perform a + sum-reduction). The array must be in GPU-accessible memory and the function + must be called from outside a GPU-eligible loop. Only arrays with int, uint, + and real types are supported. A simple example is the following: + .. code-block:: chapel on here.gpus[0] { - var Arr: [1..n] int; // will be allocated in GPU-accesible memory - ... - var sum = gpuSumReduce(Arr); + var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible + writeln(gpuSumReduce(Arr)); // 15 } */ inline proc gpuSumReduce(const ref A: [] ?t) do return doGpuReduce("sum", A); - /* - Perform min-reduction on an array. The array must be allocated on - GPU-accessible memory. The function must be called from the host. Only - arrays with ``int``, ``uint`` and ``real`` types are supported. + /* + Return the minimum element of an array on the GPU (that is, perform a + min-reduction). The array must be in GPU-accessible memory and the function + must be called from outside a GPU-eligible loop. Only arrays with int, uint, + and real types are supported. A simple example is the following: + + .. code-block:: chapel + + on here.gpus[0] { + var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible + writeln(gpuMinReduce(Arr)); // 1 + } */ inline proc gpuMinReduce(const ref A: [] ?t) do return doGpuReduce("min", A); - /* - Perform max-reduction on an array. The array must be allocated on - GPU-accessible memory. The function must be called from the host. Only - arrays with ``int``, ``uint`` and ``real`` types are supported. + /* + Return the maximum element of an array on the GPU (that is, perform a + max-reduction). The array must be in GPU-accessible memory and the function + must be called from outside a GPU-eligible loop. Only arrays with int, uint, + and real types are supported. A simple example is the following: + + .. code-block:: chapel + + on here.gpus[0] { + var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible + writeln(gpuMaxReduce(Arr)); // 5 + } */ inline proc gpuMaxReduce(const ref A: [] ?t) do return doGpuReduce("max", A); - /* - Perform minloc-reduction on an array. The array must be allocated on - GPU-accessible memory. The function must be called from the host. Only - arrays with ``int``, ``uint`` and ``real`` types are supported. + /* + For an array on the GPU, return a tuple with the index and the value of the + minimum element (that is, perform a minloc-reduction). 
The array must be in + GPU-accessible memory and the function must be called from outside a + GPU-eligible loop. Only arrays with int, uint, and real types are + supported. A simple example is the following: + + .. code-block:: chapel + + on here.gpus[0] { + var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible + writeln(gpuMinLocReduce(Arr)); // (2, 1) + } */ inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A); - /* - Perform maxloc-reduction on an array. The array must be allocated on - GPU-accessible memory. The function must be called from the host. Only - arrays with ``int``, ``uint`` and ``real`` types are supported. + /* + For an array on the GPU, return a tuple with the index and the value of the + maximum element (that is, perform a maxloc-reduction). The array must be in + GPU-accessible memory and the function must be called from outside a + GPU-eligible loop. Only arrays with int, uint, and real types are + supported. A simple example is the following: + + .. code-block:: chapel + + on here.gpus[0] { + var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible + writeln(gpuMaxLocReduce(Arr)); // (3, 5) + } */ inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A); From 727f44311a6393c4725ae8a7f828e2f2b504752d Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 11:47:19 -0700 Subject: [PATCH 39/42] More clarifications in doc Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 23c2e2436485..36612bf99d58 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -594,32 +594,34 @@ module GPU /* For an array on the GPU, return a tuple with the index and the value of the - minimum element (that is, perform a minloc-reduction). The array must be in - GPU-accessible memory and the function must be called from outside a - GPU-eligible loop. Only arrays with int, uint, and real types are - supported. A simple example is the following: + minimum element (that is, perform a minloc-reduction).If there are multiple + elements with the same minimum value, the index of the first one is + returned. The array must be in GPU-accessible memory and the function must + be called from outside a GPU-eligible loop. Only arrays with int, uint, and + real types are supported. A simple example is the following: .. code-block:: chapel on here.gpus[0] { var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible - writeln(gpuMinLocReduce(Arr)); // (2, 1) + writeln(gpuMinLocReduce(Arr)); // (2, 1). Note that Arr[2]==1. } */ inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A); /* For an array on the GPU, return a tuple with the index and the value of the - maximum element (that is, perform a maxloc-reduction). The array must be in - GPU-accessible memory and the function must be called from outside a - GPU-eligible loop. Only arrays with int, uint, and real types are - supported. A simple example is the following: + maximum element (that is, perform a maxloc-reduction). If there are multiple + elements with the same maximum value, the index of the first one is + returned. The array must be in GPU-accessible memory and the function must + be called from outside a GPU-eligible loop. Only arrays with int, uint, and + real types are supported. A simple example is the following: .. 
code-block:: chapel on here.gpus[0] { var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible - writeln(gpuMaxLocReduce(Arr)); // (3, 5) + writeln(gpuMaxLocReduce(Arr)); // (3, 5). Note that Arr[3]==5. } */ inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A); From 79ea359dcd3b1be75742be7574b32dc4bdf79ff4 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 11:48:37 -0700 Subject: [PATCH 40/42] Add a missing space Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 36612bf99d58..a45d7540ad09 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -594,7 +594,7 @@ module GPU /* For an array on the GPU, return a tuple with the index and the value of the - minimum element (that is, perform a minloc-reduction).If there are multiple + minimum element (that is, perform a minloc-reduction). If there are multiple elements with the same minimum value, the index of the first one is returned. The array must be in GPU-accessible memory and the function must be called from outside a GPU-eligible loop. Only arrays with int, uint, and From cc39ef743307e46fa1c9d39eb0690c24d4ca6637 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 6 Nov 2023 13:54:32 -0800 Subject: [PATCH 41/42] Add missing commas in AMD runtime Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/amd/gpu-amd-reduce.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc index d8a3f01cbaac..638a15451427 100644 --- a/runtime/src/gpu/amd/gpu-amd-reduce.cc +++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc @@ -37,14 +37,14 @@ #if 1 #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val, int* idx\ + data_type* val, int* idx,\ void* stream) {\ chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\ } #elif ROCM_VERSION_MAJOR >= 5 #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val, int* idx\ + data_type* val, int* idx,\ void* stream) {\ data_type* result; \ ROCM_CALL(hipMalloc(&result, sizeof(data_type)));\ From 00ee96799c71392b8f67ecfb0dd3f593a6668d81 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 6 Nov 2023 14:11:50 -0800 Subject: [PATCH 42/42] Move skipif to the parent directory Signed-off-by: Engin Kayraklioglu --- test/gpu/native/{noAmd/SKIPIF => noAmd.skipif} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/gpu/native/{noAmd/SKIPIF => noAmd.skipif} (100%) diff --git a/test/gpu/native/noAmd/SKIPIF b/test/gpu/native/noAmd.skipif similarity index 100% rename from test/gpu/native/noAmd/SKIPIF rename to test/gpu/native/noAmd.skipif
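
For reference, a minimal end-to-end sketch combining the reduction routines introduced by this series. It restates the documented examples and the non-zero-based indexing behavior exercised by nonZeroBased.chpl; the array `B` and the expected values in the comments are illustrative assumptions derived from those documented semantics, not captured program output.

.. code-block:: chapel

   use GPU;

   on here.gpus[0] {
     var Arr = [3, 2, 1, 5, 4];      // allocated in GPU-accessible memory
     writeln(gpuSumReduce(Arr));     // 15
     writeln(gpuMinReduce(Arr));     // 1
     writeln(gpuMaxReduce(Arr));     // 5
     writeln(gpuMinLocReduce(Arr));  // (2, 1): index of the first minimum, then its value
     writeln(gpuMaxLocReduce(Arr));  // (3, 5)

     // minloc/maxloc indices follow the array's own domain, matching the
     // CPU fallback's `minloc reduce zip(A.domain, A)`.
     var B: [10..#5] int = [3, 2, 1, 5, 4];
     writeln(gpuMinLocReduce(B));    // expected (12, 1) given the domain offsetting above
   }

As in the documented examples, the calls are made from host code outside any GPU-eligible loop, and only int, uint, and real element types are supported.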