From 096fdbac820ccaaaad1dda82eb47ef0514893ba5 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Tue, 19 Sep 2023 10:03:19 -0700 Subject: [PATCH 01/42] Snapshot Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu.h | 2 ++ runtime/make/Makefile.runtime.include | 1 + runtime/src/gpu/nvidia/Makefile.include | 3 +++ runtime/src/gpu/nvidia/gpu-nvidia.c | 19 +++++++++++++++++++ 4 files changed, 25 insertions(+) diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index 5fbc5276facc..e4b00361bc87 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -149,6 +149,8 @@ size_t chpl_gpu_get_alloc_size(void* ptr); bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); +int chpl_gpu_aux_sum_reduce(int* data, int n); + #endif // HAS_GPU_LOCALE #ifdef __cplusplus diff --git a/runtime/make/Makefile.runtime.include b/runtime/make/Makefile.runtime.include index b19be2b8ae7e..dd474519c9f7 100644 --- a/runtime/make/Makefile.runtime.include +++ b/runtime/make/Makefile.runtime.include @@ -59,6 +59,7 @@ RUNTIME_INCLS += \ -I. \ -I$(RUNTIME_INCLUDE_ROOT)/localeModels/$(CHPL_MAKE_LOCALE_MODEL) \ -I$(RUNTIME_INCLUDE_ROOT)/localeModels \ + -I$(RUNTIME_INCLUDE_ROOT)/gpu/$(CHPL_MAKE_GPU)/cub-wrappers \ -I$(RUNTIME_INCLUDE_ROOT)/gpu/$(CHPL_MAKE_GPU) \ -I$(RUNTIME_INCLUDE_ROOT)/gpu \ -I$(RUNTIME_INCLUDE_ROOT)/comm/$(CHPL_MAKE_COMM) \ diff --git a/runtime/src/gpu/nvidia/Makefile.include b/runtime/src/gpu/nvidia/Makefile.include index cb4da19051a8..b98d7052dc41 100644 --- a/runtime/src/gpu/nvidia/Makefile.include +++ b/runtime/src/gpu/nvidia/Makefile.include @@ -22,4 +22,7 @@ GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR) ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c +RUNTIME_INCLS += $(CURDIR)/$(GPU_SUBDIR)/cub-wrappers + + include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share diff --git a/runtime/src/gpu/nvidia/gpu-nvidia.c b/runtime/src/gpu/nvidia/gpu-nvidia.c index bc5e2cc787dd..f6774ae7f54c 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia.c +++ b/runtime/src/gpu/nvidia/gpu-nvidia.c @@ -469,5 +469,24 @@ void chpl_gpu_impl_stream_synchronize(void* stream) { CUDA_CALL(cuStreamSynchronize(stream)); } } +/* +#include + +int* chpl_gpu_aux_sum_reduce(int* data, int n) { + + int* result; + void* temp = NULL; + size_t temp_bytes = 0; + cub::DeviceReduce::Sum(temp, temp_bytes, data, result, n); + + // Allocate temporary storage + cuMemAlloc(&(CUdeviceptr)temp, temp_bytes); + + // Run sum-reduction + cub::DeviceReduce::Sum(temp, temp_bytes, d_in, d_out, num_items); + + printf("Result %d\n", *d_out); +} +*/ #endif // HAS_GPU_LOCALE From 9020bf541d43381ebcc5ed39638422ea2b19157e Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 21 Sep 2023 17:16:29 -0700 Subject: [PATCH 02/42] Get things to link in a hacky way Signed-off-by: Engin Kayraklioglu --- runtime/Makefile.help | 3 ++ runtime/src/gpu/nvidia/Makefile | 2 + runtime/src/gpu/nvidia/cub-wrappers/Makefile | 41 +++++++++++++++++++ .../gpu/nvidia/cub-wrappers/Makefile.include | 25 +++++++++++ .../gpu/nvidia/cub-wrappers/Makefile.share | 29 +++++++++++++ .../cub-wrappers/gpu-nvidia-cub-wrappers.cc | 21 ++++++++++ 6 files changed, 121 insertions(+) create mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile create mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile.include create mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile.share create mode 100644 runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc diff --git a/runtime/Makefile.help 
b/runtime/Makefile.help index cf0d29a72fb4..fa98f7761456 100644 --- a/runtime/Makefile.help +++ b/runtime/Makefile.help @@ -78,6 +78,8 @@ include $(call FIND_MAKEFILE,LAUNCHER,launch) include $(call FIND_MAKEFILE,MEM,mem) include $(call FIND_MAKEFILE,TOPO,topo) +include src/gpu/nvidia/cub-wrappers/Makefile.include + include src/qio/Makefile.include @@ -96,6 +98,7 @@ RUNTIME_OBJS = \ $(THREADS_OBJS) \ $(TIMERS_OBJS) \ $(COMM_OBJS) \ + $(CUB_OBJS) \ $(GPU_OBJS) \ $(MEM_COMMON_OBJS) \ $(QIO_OBJS) \ diff --git a/runtime/src/gpu/nvidia/Makefile b/runtime/src/gpu/nvidia/Makefile index 7b82be818066..25026a44bc4c 100644 --- a/runtime/src/gpu/nvidia/Makefile +++ b/runtime/src/gpu/nvidia/Makefile @@ -33,6 +33,8 @@ include Makefile.share TARGETS = $(GPU_OBJS) +SUBDIRS = cub-wrappers + include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules # diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile b/runtime/src/gpu/nvidia/cub-wrappers/Makefile new file mode 100644 index 000000000000..7cbacd864502 --- /dev/null +++ b/runtime/src/gpu/nvidia/cub-wrappers/Makefile @@ -0,0 +1,41 @@ +# Copyright 2020-2023 Hewlett Packard Enterprise Development LP +# Copyright 2004-2019 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +RUNTIME_ROOT = ../../../.. +RUNTIME_SUBDIR = src/gpu/nvidia/cub-wrappers + +ifndef CHPL_MAKE_HOME +export CHPL_MAKE_HOME=$(shell pwd)/$(RUNTIME_ROOT)/.. +endif + +# +# standard header +# +include $(RUNTIME_ROOT)/make/Makefile.runtime.head + +CUB_OBJDIR = $(RUNTIME_OBJDIR) +include Makefile.share + +TARGETS = $(CUB_OBJS) + +include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules + +# +# standard footer +# +include $(RUNTIME_ROOT)/make/Makefile.runtime.foot diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include new file mode 100644 index 000000000000..2b8ca29764af --- /dev/null +++ b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include @@ -0,0 +1,25 @@ +# Copyright 2020-2023 Hewlett Packard Enterprise Development LP +# Copyright 2004-2019 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +CUB_SUBDIR = src/gpu/nvidia/cub-wrappers + +CUB_OBJDIR = $(RUNTIME_BUILD)/$(CUB_SUBDIR) + +ALL_SRCS += $(CURDIR)/$(CUB_SUBDIR)/*.cc + +include $(RUNTIME_ROOT)/$(CUB_SUBDIR)/Makefile.share diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share new file mode 100644 index 000000000000..48f44ffde897 --- /dev/null +++ b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share @@ -0,0 +1,29 @@ +# Copyright 2020-2023 Hewlett Packard Enterprise Development LP +# Copyright 2004-2019 Cray Inc. +# Other additional copyright holders may be indicated within. +# +# The entirety of this work is licensed under the Apache License, +# Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. +# +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +CUB_SRCS = gpu-nvidia-cub-wrappers.cc + +SRCS = gpu-nvidia-cub-wrappers.cc + +CUB_OBJS = $(CUB_SRCS:%.cc=$(CUB_OBJDIR)/%.o) + +RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version + +$(RUNTIME_OBJ_DIR)/gpu-nvidia-cub-wrappers.o: gpu-nvidia-cub-wrappers.cc \ + $(RUNTIME_OBJ_DIR_STAMP) + $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc b/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc new file mode 100644 index 000000000000..dcf5f94fde10 --- /dev/null +++ b/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc @@ -0,0 +1,21 @@ +#include +#include + +#include "chpl-gpu.h" + +int chpl_gpu_aux_sum_reduce(int* data, int n) { + + int result = 0; + void* temp = NULL; + size_t temp_bytes = 0; + cub::DeviceReduce::Sum(temp, temp_bytes, data, &result, n); + + // Allocate temporary storage + cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes); + + // Run sum-reduction + cub::DeviceReduce::Sum(temp, temp_bytes, data, &result, n); + + printf("Result %d\n", result); + return result; +} From df97abbc78cb5e31cdd72274bac90a79d4ded44a Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 21 Sep 2023 21:06:50 -0700 Subject: [PATCH 03/42] Get the initial example working Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu.h | 2 +- .../cub-wrappers/gpu-nvidia-cub-wrappers.cc | 24 +++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index e4b00361bc87..eddd3060c7ab 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -149,7 +149,7 @@ size_t chpl_gpu_get_alloc_size(void* ptr); bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); -int chpl_gpu_aux_sum_reduce(int* data, int n); +int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n); #endif // HAS_GPU_LOCALE diff --git a/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc b/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc index dcf5f94fde10..4fc41364ede4 100644 --- a/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc +++ b/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc @@ -2,20 +2,30 @@ #include #include "chpl-gpu.h" +#include 
"../../common/cuda-utils.h" -int chpl_gpu_aux_sum_reduce(int* data, int n) { +int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n) { + + CUdeviceptr result; + + CUDA_CALL(cuMemAlloc(&result, sizeof(int64_t))); - int result = 0; void* temp = NULL; size_t temp_bytes = 0; - cub::DeviceReduce::Sum(temp, temp_bytes, data, &result, n); + cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n); + + printf("Allocation for scratch %zu\n", temp_bytes); // Allocate temporary storage - cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes); + CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); + + printf("Temporary allocated %p\n", temp); // Run sum-reduction - cub::DeviceReduce::Sum(temp, temp_bytes, data, &result, n); + cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n, 0, true); - printf("Result %d\n", result); - return result; + int64_t result_host; + CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(int64_t))); + printf("Result %ld\n", result_host); + return result_host; } From 01bcf849d34219442c6136467343a1c18e40a4b9 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 21 Sep 2023 21:08:53 -0700 Subject: [PATCH 04/42] Cleanup Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/nvidia/gpu-nvidia.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/runtime/src/gpu/nvidia/gpu-nvidia.c b/runtime/src/gpu/nvidia/gpu-nvidia.c index f6774ae7f54c..bc5e2cc787dd 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia.c +++ b/runtime/src/gpu/nvidia/gpu-nvidia.c @@ -469,24 +469,5 @@ void chpl_gpu_impl_stream_synchronize(void* stream) { CUDA_CALL(cuStreamSynchronize(stream)); } } -/* -#include - -int* chpl_gpu_aux_sum_reduce(int* data, int n) { - - int* result; - void* temp = NULL; - size_t temp_bytes = 0; - cub::DeviceReduce::Sum(temp, temp_bytes, data, result, n); - - // Allocate temporary storage - cuMemAlloc(&(CUdeviceptr)temp, temp_bytes); - - // Run sum-reduction - cub::DeviceReduce::Sum(temp, temp_bytes, d_in, d_out, num_items); - - printf("Result %d\n", *d_out); -} -*/ #endif // HAS_GPU_LOCALE From 38dc53fd6ddf1c5d11ca58123a52eaf3668e71e5 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 6 Oct 2023 20:46:26 -0700 Subject: [PATCH 05/42] Drop additional makefile logic Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/nvidia/Makefile | 2 -- runtime/src/gpu/nvidia/Makefile.include | 3 +-- runtime/src/gpu/nvidia/Makefile.share | 14 +++++++++++--- ...pu-nvidia-cub-wrappers.cc => gpu-nvidia-aux.cc} | 10 +++++----- 4 files changed, 17 insertions(+), 12 deletions(-) rename runtime/src/gpu/nvidia/{cub-wrappers/gpu-nvidia-cub-wrappers.cc => gpu-nvidia-aux.cc} (76%) diff --git a/runtime/src/gpu/nvidia/Makefile b/runtime/src/gpu/nvidia/Makefile index 25026a44bc4c..7b82be818066 100644 --- a/runtime/src/gpu/nvidia/Makefile +++ b/runtime/src/gpu/nvidia/Makefile @@ -33,8 +33,6 @@ include Makefile.share TARGETS = $(GPU_OBJS) -SUBDIRS = cub-wrappers - include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules # diff --git a/runtime/src/gpu/nvidia/Makefile.include b/runtime/src/gpu/nvidia/Makefile.include index b98d7052dc41..9e0f8711b311 100644 --- a/runtime/src/gpu/nvidia/Makefile.include +++ b/runtime/src/gpu/nvidia/Makefile.include @@ -21,8 +21,7 @@ GPU_SUBDIR = src/gpu/nvidia GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR) ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c - -RUNTIME_INCLS += $(CURDIR)/$(GPU_SUBDIR)/cub-wrappers +ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share diff --git 
a/runtime/src/gpu/nvidia/Makefile.share b/runtime/src/gpu/nvidia/Makefile.share index 36d2debdee0d..2716a17c89d7 100644 --- a/runtime/src/gpu/nvidia/Makefile.share +++ b/runtime/src/gpu/nvidia/Makefile.share @@ -15,8 +15,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -SRCS = gpu-nvidia.c +GPU_SRCS = gpu-nvidia.c gpu-nvidia-aux.cc -GPU_SRCS = gpu-nvidia.c +SRCS = $(GPU_SRCS) -GPU_OBJS = $(GPU_SRCS:%.c=$(GPU_OBJDIR)/%.o) +GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) + +$(info $$GPU_OBJS is [${GPU_OBJS}]) + +RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version + +$(RUNTIME_OBJ_DIR)/gpu-nvidia-aux.o: gpu-nvidia-aux.cc \ + $(RUNTIME_OBJ_DIR_STAMP) + $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc similarity index 76% rename from runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc rename to runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 4fc41364ede4..49ead366a6db 100644 --- a/runtime/src/gpu/nvidia/cub-wrappers/gpu-nvidia-cub-wrappers.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -2,7 +2,7 @@ #include #include "chpl-gpu.h" -#include "../../common/cuda-utils.h" +#include "../common/cuda-utils.h" int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n) { @@ -14,18 +14,18 @@ int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n) { size_t temp_bytes = 0; cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n); - printf("Allocation for scratch %zu\n", temp_bytes); + //printf("Allocation for scratch %zu\n", temp_bytes); // Allocate temporary storage CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); - printf("Temporary allocated %p\n", temp); + //printf("Temporary allocated %p\n", temp); // Run sum-reduction - cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n, 0, true); + cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n, 0); int64_t result_host; CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(int64_t))); - printf("Result %ld\n", result_host); + //printf("Result %ld\n", result_host); return result_host; } From 34173011bb681fe7632cc057e62a3186b46442e9 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 6 Oct 2023 21:40:03 -0700 Subject: [PATCH 06/42] Finish implementing basic reductions Signed-off-by: Engin Kayraklioglu --- runtime/Makefile.help | 2 - runtime/include/chpl-gpu.h | 20 ++++++- runtime/make/Makefile.runtime.include | 1 - runtime/src/gpu/nvidia/cub-wrappers/Makefile | 41 --------------- .../gpu/nvidia/cub-wrappers/Makefile.include | 25 --------- .../gpu/nvidia/cub-wrappers/Makefile.share | 29 ----------- runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 52 ++++++++++--------- 7 files changed, 47 insertions(+), 123 deletions(-) delete mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile delete mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile.include delete mode 100644 runtime/src/gpu/nvidia/cub-wrappers/Makefile.share diff --git a/runtime/Makefile.help b/runtime/Makefile.help index fa98f7761456..910277c18dab 100644 --- a/runtime/Makefile.help +++ b/runtime/Makefile.help @@ -78,8 +78,6 @@ include $(call FIND_MAKEFILE,LAUNCHER,launch) include $(call FIND_MAKEFILE,MEM,mem) include $(call FIND_MAKEFILE,TOPO,topo) -include src/gpu/nvidia/cub-wrappers/Makefile.include - include src/qio/Makefile.include diff 
--git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index eddd3060c7ab..1508cf0a87e0 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -149,7 +149,25 @@ size_t chpl_gpu_get_alloc_size(void* ptr); bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); -int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n); + +#define DECL_ONE_BASIC_REDUCE(chpl_kind, data_type) \ +data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n); + +#define DECL_BASIC_REDUCE(chpl_kind) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, int8_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, int16_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, int32_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, int64_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, uint8_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, uint16_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, uint32_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, uint64_t) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, float) \ + DECL_ONE_BASIC_REDUCE(chpl_kind, double) + +DECL_BASIC_REDUCE(sum); +DECL_BASIC_REDUCE(min); +DECL_BASIC_REDUCE(max); #endif // HAS_GPU_LOCALE diff --git a/runtime/make/Makefile.runtime.include b/runtime/make/Makefile.runtime.include index dd474519c9f7..b19be2b8ae7e 100644 --- a/runtime/make/Makefile.runtime.include +++ b/runtime/make/Makefile.runtime.include @@ -59,7 +59,6 @@ RUNTIME_INCLS += \ -I. \ -I$(RUNTIME_INCLUDE_ROOT)/localeModels/$(CHPL_MAKE_LOCALE_MODEL) \ -I$(RUNTIME_INCLUDE_ROOT)/localeModels \ - -I$(RUNTIME_INCLUDE_ROOT)/gpu/$(CHPL_MAKE_GPU)/cub-wrappers \ -I$(RUNTIME_INCLUDE_ROOT)/gpu/$(CHPL_MAKE_GPU) \ -I$(RUNTIME_INCLUDE_ROOT)/gpu \ -I$(RUNTIME_INCLUDE_ROOT)/comm/$(CHPL_MAKE_COMM) \ diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile b/runtime/src/gpu/nvidia/cub-wrappers/Makefile deleted file mode 100644 index 7cbacd864502..000000000000 --- a/runtime/src/gpu/nvidia/cub-wrappers/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright 2020-2023 Hewlett Packard Enterprise Development LP -# Copyright 2004-2019 Cray Inc. -# Other additional copyright holders may be indicated within. -# -# The entirety of this work is licensed under the Apache License, -# Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. -# -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -RUNTIME_ROOT = ../../../.. -RUNTIME_SUBDIR = src/gpu/nvidia/cub-wrappers - -ifndef CHPL_MAKE_HOME -export CHPL_MAKE_HOME=$(shell pwd)/$(RUNTIME_ROOT)/.. -endif - -# -# standard header -# -include $(RUNTIME_ROOT)/make/Makefile.runtime.head - -CUB_OBJDIR = $(RUNTIME_OBJDIR) -include Makefile.share - -TARGETS = $(CUB_OBJS) - -include $(RUNTIME_ROOT)/make/Makefile.runtime.subdirrules - -# -# standard footer -# -include $(RUNTIME_ROOT)/make/Makefile.runtime.foot diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include deleted file mode 100644 index 2b8ca29764af..000000000000 --- a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.include +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright 2020-2023 Hewlett Packard Enterprise Development LP -# Copyright 2004-2019 Cray Inc. 
-# Other additional copyright holders may be indicated within. -# -# The entirety of this work is licensed under the Apache License, -# Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. -# -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -CUB_SUBDIR = src/gpu/nvidia/cub-wrappers - -CUB_OBJDIR = $(RUNTIME_BUILD)/$(CUB_SUBDIR) - -ALL_SRCS += $(CURDIR)/$(CUB_SUBDIR)/*.cc - -include $(RUNTIME_ROOT)/$(CUB_SUBDIR)/Makefile.share diff --git a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share b/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share deleted file mode 100644 index 48f44ffde897..000000000000 --- a/runtime/src/gpu/nvidia/cub-wrappers/Makefile.share +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright 2020-2023 Hewlett Packard Enterprise Development LP -# Copyright 2004-2019 Cray Inc. -# Other additional copyright holders may be indicated within. -# -# The entirety of this work is licensed under the Apache License, -# Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. -# -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -CUB_SRCS = gpu-nvidia-cub-wrappers.cc - -SRCS = gpu-nvidia-cub-wrappers.cc - -CUB_OBJS = $(CUB_SRCS:%.cc=$(CUB_OBJDIR)/%.o) - -RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version - -$(RUNTIME_OBJ_DIR)/gpu-nvidia-cub-wrappers.o: gpu-nvidia-cub-wrappers.cc \ - $(RUNTIME_OBJ_DIR_STAMP) - $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 49ead366a6db..17fda58d9336 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -4,28 +4,32 @@ #include "chpl-gpu.h" #include "../common/cuda-utils.h" -int64_t chpl_gpu_aux_sum_reduce(int64_t* data, int n) { - - CUdeviceptr result; - - CUDA_CALL(cuMemAlloc(&result, sizeof(int64_t))); - - void* temp = NULL; - size_t temp_bytes = 0; - cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n); - - //printf("Allocation for scratch %zu\n", temp_bytes); - - // Allocate temporary storage - CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); - - //printf("Temporary allocated %p\n", temp); - - // Run sum-reduction - cub::DeviceReduce::Sum(temp, temp_bytes, data, (int64_t*)result, n, 0); - - int64_t result_host; - CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(int64_t))); - //printf("Result %ld\n", result_host); - return result_host; +#define DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, data_type) \ +data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n) {\ + CUdeviceptr result; \ + CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ + void* temp = NULL; \ + size_t temp_bytes = 0; \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ + CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ + data_type result_host; \ + CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(data_type))); \ + return result_host; \ } + +#define DEF_BASIC_REDUCE(cub_kind, chpl_kind) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int8_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int16_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int32_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int64_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint8_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint16_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint32_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint64_t) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, float) \ + DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, double); + +DEF_BASIC_REDUCE(Sum, sum) +DEF_BASIC_REDUCE(Min, min) +DEF_BASIC_REDUCE(Max, max) From 17118b35aa4c11d20f6e8fd43670991afe112a9b Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 6 Oct 2023 21:40:43 -0700 Subject: [PATCH 07/42] Add test Signed-off-by: Engin Kayraklioglu --- test/gpu/native/reduction/basic.chpl | 69 ++++++++++++++++++++++++++++ test/gpu/native/reduction/basic.good | 2 + 2 files changed, 71 insertions(+) create mode 100644 test/gpu/native/reduction/basic.chpl create mode 100644 test/gpu/native/reduction/basic.good diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/reduction/basic.chpl new file mode 100644 index 000000000000..7865caa24376 --- /dev/null +++ b/test/gpu/native/reduction/basic.chpl @@ -0,0 +1,69 @@ + +inline proc chpl_reduceHelp(param op: string, ref A: [] ?t): t { + proc chplTypeToCTypeName(type t) param { + select t { + when int(8) do return "int8_t"; + when int(16) do return "int16_t"; + when 
int(32) do return "int32_t"; + when int(64) do return "int64_t"; + when uint(8) do return "uint8_t"; + when uint(16) do return "uint16_t"; + when uint(32) do return "uint32_t"; + when uint(64) do return "uint64_t"; + when real(32) do return "float"; + when real(64) do return "double"; + } + return "unknown"; + } + + proc externFuncName(param op: string, type t) param: string { + return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); + } + + use CTypes; + extern externFuncName(op, t) proc reduce_fn(data, size): t; + return reduce_fn(c_ptrTo(A), A.size); +} + +inline proc gpuSumReduce(ref A: [] ?t): t { return chpl_reduceHelp("sum", A); } +inline proc gpuMinReduce(ref A: [] ?t): t { return chpl_reduceHelp("min", A); } +inline proc gpuMaxReduce(ref A: [] ?t): t { return chpl_reduceHelp("max", A); } + +config const n = 100; + + +proc testType(type t) { + proc test(param op: string, type t) { + on here.gpus[0] { + var Arr: [0..#n] t; + + foreach i in Arr.domain do Arr[i] = i:t; + + var res; + select op { + when "sum" do res=gpuSumReduce(Arr); + when "min" do res=gpuMinReduce(Arr); + when "max" do res=gpuMaxReduce(Arr); + } + + writeln(op, ": ", res); + } + } + + writeln("Testing type ", t:string); + test("sum", t); + test("min", t); + test("max", t); + writeln(); +} + +testType(int(8)); +testType(int(16)); +testType(int(32)); +testType(int(64)); +testType(uint(8)); +testType(uint(16)); +testType(uint(32)); +testType(uint(64)); +testType(real(32)); +testType(real(64)); diff --git a/test/gpu/native/reduction/basic.good b/test/gpu/native/reduction/basic.good new file mode 100644 index 000000000000..053fbaf2a4dd --- /dev/null +++ b/test/gpu/native/reduction/basic.good @@ -0,0 +1,2 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. 
To suppress this warning, compile with --no-checks explicitly +4950 From b5e689befde2993d0c60831b306922a4c5d3dc4d Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Sun, 8 Oct 2023 21:21:58 -0700 Subject: [PATCH 08/42] Add minloc and maxloc reduces Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu.h | 51 +++++++++++------ runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 32 +++++++++++ test/gpu/native/reduction/basic.chpl | 22 +++++++- test/gpu/native/reduction/basic.good | 71 +++++++++++++++++++++++- 4 files changed, 156 insertions(+), 20 deletions(-) diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index 1508cf0a87e0..745b1d2f08b1 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -150,24 +150,43 @@ bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); -#define DECL_ONE_BASIC_REDUCE(chpl_kind, data_type) \ +#define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \ data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n); -#define DECL_BASIC_REDUCE(chpl_kind) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, int8_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, int16_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, int32_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, int64_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, uint8_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, uint16_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, uint32_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, uint64_t) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, float) \ - DECL_ONE_BASIC_REDUCE(chpl_kind, double) - -DECL_BASIC_REDUCE(sum); -DECL_BASIC_REDUCE(min); -DECL_BASIC_REDUCE(max); +#define DECL_REDUCE_RET_VAL(chpl_kind) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, int8_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, int16_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, int32_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, int64_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint8_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint16_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint32_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint64_t) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, float) \ + DECL_ONE_REDUCE_RET_VAL(chpl_kind, double) + +DECL_REDUCE_RET_VAL(sum); +DECL_REDUCE_RET_VAL(min); +DECL_REDUCE_RET_VAL(max); + +#define DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, data_type) \ +void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx); + +#define DECL_REDUCE_RET_VAL_IDX(chpl_kind) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int8_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int16_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int32_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int64_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint8_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint16_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint32_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint64_t) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, float) \ + DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, double) + +DECL_REDUCE_RET_VAL_IDX(minloc); +DECL_REDUCE_RET_VAL_IDX(maxloc); #endif // HAS_GPU_LOCALE diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 17fda58d9336..86190481f58c 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -33,3 +33,35 @@ data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n) {\ DEF_BASIC_REDUCE(Sum, sum) DEF_BASIC_REDUCE(Min, min) DEF_BASIC_REDUCE(Max, max) + +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ 
+void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx) {\ + using kvp = cub::KeyValuePair; \ + CUdeviceptr result; \ + CUDA_CALL(cuMemAlloc(&result, sizeof(kvp))); \ + void* temp = NULL; \ + size_t temp_bytes = 0; \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n); \ + CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n); \ + kvp result_host; \ + CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(kvp))); \ + *val = result_host.value; \ + *idx = result_host.key; \ +} + +#define DEF_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int8_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int16_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int32_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int64_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint8_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint16_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint32_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint64_t) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, float) \ + DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, double); + +DEF_REDUCE_RET_VAL_IDX(ArgMin, minloc) +DEF_REDUCE_RET_VAL_IDX(ArgMax, maxloc) diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/reduction/basic.chpl index 7865caa24376..1bb3d4b1c7be 100644 --- a/test/gpu/native/reduction/basic.chpl +++ b/test/gpu/native/reduction/basic.chpl @@ -1,5 +1,5 @@ -inline proc chpl_reduceHelp(param op: string, ref A: [] ?t): t { +inline proc chpl_reduceHelp(param op: string, ref A: [] ?t) { proc chplTypeToCTypeName(type t) param { select t { when int(8) do return "int8_t"; @@ -21,13 +21,25 @@ inline proc chpl_reduceHelp(param op: string, ref A: [] ?t): t { } use CTypes; - extern externFuncName(op, t) proc reduce_fn(data, size): t; - return reduce_fn(c_ptrTo(A), A.size); + + if op == "sum" || op == "min" || op == "max" { + extern externFuncName(op, t) proc reduce_fn(data, size): t; + return reduce_fn(c_ptrTo(A), A.size); + } + else { + var idx: int(32); + var val: t; + extern externFuncName(op, t) proc reduce_fn(data, size, ref val, ref idx); + reduce_fn(c_ptrTo(A), A.size, val, idx); + return (idx, val); + } } inline proc gpuSumReduce(ref A: [] ?t): t { return chpl_reduceHelp("sum", A); } inline proc gpuMinReduce(ref A: [] ?t): t { return chpl_reduceHelp("min", A); } inline proc gpuMaxReduce(ref A: [] ?t): t { return chpl_reduceHelp("max", A); } +inline proc gpuMinLocReduce(ref A: [] ?t) { return chpl_reduceHelp("minloc", A); } +inline proc gpuMaxLocReduce(ref A: [] ?t) { return chpl_reduceHelp("maxloc", A); } config const n = 100; @@ -44,6 +56,8 @@ proc testType(type t) { when "sum" do res=gpuSumReduce(Arr); when "min" do res=gpuMinReduce(Arr); when "max" do res=gpuMaxReduce(Arr); + when "minloc" do res=gpuMinLocReduce(Arr); + when "maxloc" do res=gpuMaxLocReduce(Arr); } writeln(op, ": ", res); @@ -54,6 +68,8 @@ proc testType(type t) { test("sum", t); test("min", t); test("max", t); + test("minloc", t); + test("maxloc", t); writeln(); } diff --git a/test/gpu/native/reduction/basic.good b/test/gpu/native/reduction/basic.good index 053fbaf2a4dd..4f9386297b60 100644 --- a/test/gpu/native/reduction/basic.good +++ b/test/gpu/native/reduction/basic.good @@ -1,2 +1,71 @@ warning: The prototype GPU support implies --no-checks. This may impact debuggability. 
To suppress this warning, compile with --no-checks explicitly -4950 +Testing type int(8) +sum: 86 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type int(16) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type int(32) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type int(64) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(8) +sum: 86 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(16) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(32) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(64) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type real(32) +sum: 4950.0 +min: 0.0 +max: 99.0 +minloc: (0, 0.0) +maxloc: (99, 99.0) + +Testing type real(64) +sum: 4950.0 +min: 0.0 +max: 99.0 +minloc: (0, 0.0) +maxloc: (99, 99.0) + From f0369318eade2d1828956e11e81156e92dc942c2 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Sun, 8 Oct 2023 21:29:45 -0700 Subject: [PATCH 09/42] Refactor basic reduce runtime interface, too Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu.h | 3 +- runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 37 ++++++++++++------------ test/gpu/native/reduction/basic.chpl | 24 ++++++++------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index 745b1d2f08b1..510b78c1f238 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -151,7 +151,8 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); #define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \ -data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n); +void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val); #define DECL_REDUCE_RET_VAL(chpl_kind) \ DECL_ONE_REDUCE_RET_VAL(chpl_kind, int8_t) \ diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 86190481f58c..2eb127a7c0e0 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -4,8 +4,9 @@ #include "chpl-gpu.h" #include "../common/cuda-utils.h" -#define DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, data_type) \ -data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n) {\ +#define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val) {\ CUdeviceptr result; \ CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ void* temp = NULL; \ @@ -13,26 +14,24 @@ data_type chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n) {\ cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ - data_type result_host; \ - CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(data_type))); \ - return result_host; \ + CUDA_CALL(cuMemcpyDtoH(val, result, sizeof(data_type))); \ } -#define DEF_BASIC_REDUCE(cub_kind, chpl_kind) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int8_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int16_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int32_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, int64_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint8_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint16_t) \ - 
DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint32_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, uint64_t) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, float) \ - DEF_ONE_BASIC_REDUCE(cub_kind, chpl_kind, double); +#define DEF_REDUCE_RET_VAL(cub_kind, chpl_kind) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int8_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int16_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int32_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int64_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint8_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint16_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint32_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint64_t) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, float) \ + DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, double); -DEF_BASIC_REDUCE(Sum, sum) -DEF_BASIC_REDUCE(Min, min) -DEF_BASIC_REDUCE(Max, max) +DEF_REDUCE_RET_VAL(Sum, sum) +DEF_REDUCE_RET_VAL(Min, min) +DEF_REDUCE_RET_VAL(Max, max) #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/reduction/basic.chpl index 1bb3d4b1c7be..0933ac1bfcfd 100644 --- a/test/gpu/native/reduction/basic.chpl +++ b/test/gpu/native/reduction/basic.chpl @@ -1,5 +1,5 @@ -inline proc chpl_reduceHelp(param op: string, ref A: [] ?t) { +inline proc chpl_doGpuReduce(param op: string, ref A: [] ?t) { proc chplTypeToCTypeName(type t) param { select t { when int(8) do return "int8_t"; @@ -16,30 +16,34 @@ inline proc chpl_reduceHelp(param op: string, ref A: [] ?t) { return "unknown"; } - proc externFuncName(param op: string, type t) param: string { + proc getExternFuncName(param op: string, type t) param: string { return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); } use CTypes; + param externFunc = getExternFuncName(op, t); + if op == "sum" || op == "min" || op == "max" { - extern externFuncName(op, t) proc reduce_fn(data, size): t; - return reduce_fn(c_ptrTo(A), A.size); + var val: t; + extern externFunc proc reduce_fn(data, size, ref val); + reduce_fn(c_ptrTo(A), A.size, val); + return val; } else { var idx: int(32); var val: t; - extern externFuncName(op, t) proc reduce_fn(data, size, ref val, ref idx); + extern externFunc proc reduce_fn(data, size, ref val, ref idx); reduce_fn(c_ptrTo(A), A.size, val, idx); return (idx, val); } } -inline proc gpuSumReduce(ref A: [] ?t): t { return chpl_reduceHelp("sum", A); } -inline proc gpuMinReduce(ref A: [] ?t): t { return chpl_reduceHelp("min", A); } -inline proc gpuMaxReduce(ref A: [] ?t): t { return chpl_reduceHelp("max", A); } -inline proc gpuMinLocReduce(ref A: [] ?t) { return chpl_reduceHelp("minloc", A); } -inline proc gpuMaxLocReduce(ref A: [] ?t) { return chpl_reduceHelp("maxloc", A); } +inline proc gpuSumReduce(ref A: [] ?t): t { return chpl_doGpuReduce("sum", A); } +inline proc gpuMinReduce(ref A: [] ?t): t { return chpl_doGpuReduce("min", A); } +inline proc gpuMaxReduce(ref A: [] ?t): t { return chpl_doGpuReduce("max", A); } +inline proc gpuMinLocReduce(ref A: [] ?t) { return chpl_doGpuReduce("minloc", A); } +inline proc gpuMaxLocReduce(ref A: [] ?t) { return chpl_doGpuReduce("maxloc", A); } config const n = 100; From 41e30a9f6885affb16f371bbfff74103ea20ddab Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Sun, 8 Oct 2023 21:32:12 -0700 Subject: [PATCH 10/42] Move the actual reduction functions to the GPU module Signed-off-by: Engin 
Kayraklioglu --- modules/standard/GPU.chpl | 51 ++++++++++++++++++++++++++++ test/gpu/native/reduction/basic.chpl | 48 +------------------------- 2 files changed, 52 insertions(+), 47 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 62b333d6091b..566d9461b473 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -389,4 +389,55 @@ module GPU /* When run on a GPU, atomically compare the value in 'x' and 'cmp', if they are equal store 'val' in 'x'. The operation returns the old value of x. */ inline proc gpuAtomicCAS( ref x : ?T, cmp : T, val : T) : T { return gpuAtomicTernOp("CAS", x, cmp, val); } + + // ============================ + // Reductions + // ============================ + + private inline proc doGpuReduce(param op: string, ref A: [] ?t) { + proc chplTypeToCTypeName(type t) param { + select t { + when int(8) do return "int8_t"; + when int(16) do return "int16_t"; + when int(32) do return "int32_t"; + when int(64) do return "int64_t"; + when uint(8) do return "uint8_t"; + when uint(16) do return "uint16_t"; + when uint(32) do return "uint32_t"; + when uint(64) do return "uint64_t"; + when real(32) do return "float"; + when real(64) do return "double"; + } + return "unknown"; + } + + proc getExternFuncName(param op: string, type t) param: string { + return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); + } + + use CTypes; + + param externFunc = getExternFuncName(op, t); + + if op == "sum" || op == "min" || op == "max" { + var val: t; + extern externFunc proc reduce_fn(data, size, ref val); + reduce_fn(c_ptrTo(A), A.size, val); + return val; + } + else { + var idx: int(32); + var val: t; + extern externFunc proc reduce_fn(data, size, ref val, ref idx); + reduce_fn(c_ptrTo(A), A.size, val, idx); + return (idx, val); + } + } + + inline proc gpuSumReduce(ref A: [] ?t) do return doGpuReduce("sum", A); + inline proc gpuMinReduce(ref A: [] ?t) do return doGpuReduce("min", A); + inline proc gpuMaxReduce(ref A: [] ?t) do return doGpuReduce("max", A); + inline proc gpuMinLocReduce(ref A: [] ?t) do return doGpuReduce("minloc", A); + inline proc gpuMaxLocReduce(ref A: [] ?t) do return doGpuReduce("maxloc", A); + } diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/reduction/basic.chpl index 0933ac1bfcfd..5a1ceef66952 100644 --- a/test/gpu/native/reduction/basic.chpl +++ b/test/gpu/native/reduction/basic.chpl @@ -1,53 +1,7 @@ - -inline proc chpl_doGpuReduce(param op: string, ref A: [] ?t) { - proc chplTypeToCTypeName(type t) param { - select t { - when int(8) do return "int8_t"; - when int(16) do return "int16_t"; - when int(32) do return "int32_t"; - when int(64) do return "int64_t"; - when uint(8) do return "uint8_t"; - when uint(16) do return "uint16_t"; - when uint(32) do return "uint32_t"; - when uint(64) do return "uint64_t"; - when real(32) do return "float"; - when real(64) do return "double"; - } - return "unknown"; - } - - proc getExternFuncName(param op: string, type t) param: string { - return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); - } - - use CTypes; - - param externFunc = getExternFuncName(op, t); - - if op == "sum" || op == "min" || op == "max" { - var val: t; - extern externFunc proc reduce_fn(data, size, ref val); - reduce_fn(c_ptrTo(A), A.size, val); - return val; - } - else { - var idx: int(32); - var val: t; - extern externFunc proc reduce_fn(data, size, ref val, ref idx); - reduce_fn(c_ptrTo(A), A.size, val, idx); - return (idx, val); - } -} - -inline proc gpuSumReduce(ref A: [] ?t): t { 
return chpl_doGpuReduce("sum", A); } -inline proc gpuMinReduce(ref A: [] ?t): t { return chpl_doGpuReduce("min", A); } -inline proc gpuMaxReduce(ref A: [] ?t): t { return chpl_doGpuReduce("max", A); } -inline proc gpuMinLocReduce(ref A: [] ?t) { return chpl_doGpuReduce("minloc", A); } -inline proc gpuMaxLocReduce(ref A: [] ?t) { return chpl_doGpuReduce("maxloc", A); } +use GPU; config const n = 100; - proc testType(type t) { proc test(param op: string, type t) { on here.gpus[0] { From 8d0e3e07011f58e2e79f9ae82b434c4d96e74640 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Sun, 8 Oct 2023 21:39:50 -0700 Subject: [PATCH 11/42] Simplify runtime macros Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu.h | 46 +++++++++--------------- runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 45 +++++++++-------------- 2 files changed, 34 insertions(+), 57 deletions(-) diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index 510b78c1f238..a52d40f53fab 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -150,44 +150,32 @@ bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); +#define DECL_REDUCE(MACRO, chpl_kind) \ + MACRO(chpl_kind, int8_t) \ + MACRO(chpl_kind, int16_t) \ + MACRO(chpl_kind, int32_t) \ + MACRO(chpl_kind, int64_t) \ + MACRO(chpl_kind, uint8_t) \ + MACRO(chpl_kind, uint16_t) \ + MACRO(chpl_kind, uint32_t) \ + MACRO(chpl_kind, uint64_t) \ + MACRO(chpl_kind, float) \ + MACRO(chpl_kind, double) + #define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val); -#define DECL_REDUCE_RET_VAL(chpl_kind) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, int8_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, int16_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, int32_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, int64_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint8_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint16_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint32_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, uint64_t) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, float) \ - DECL_ONE_REDUCE_RET_VAL(chpl_kind, double) - -DECL_REDUCE_RET_VAL(sum); -DECL_REDUCE_RET_VAL(min); -DECL_REDUCE_RET_VAL(max); +DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum); +DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, min); +DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, max); #define DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx); -#define DECL_REDUCE_RET_VAL_IDX(chpl_kind) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int8_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int16_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int32_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, int64_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint8_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint16_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint32_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, uint64_t) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, float) \ - DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, double) - -DECL_REDUCE_RET_VAL_IDX(minloc); -DECL_REDUCE_RET_VAL_IDX(maxloc); +DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, minloc); +DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, maxloc); #endif // HAS_GPU_LOCALE diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 2eb127a7c0e0..16f719c72b44 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ 
b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -4,6 +4,18 @@ #include "chpl-gpu.h" #include "../common/cuda-utils.h" +#define DEF_REDUCE(MACRO, cub_kind, chpl_kind) \ + MACRO(cub_kind, chpl_kind, int8_t) \ + MACRO(cub_kind, chpl_kind, int16_t) \ + MACRO(cub_kind, chpl_kind, int32_t) \ + MACRO(cub_kind, chpl_kind, int64_t) \ + MACRO(cub_kind, chpl_kind, uint8_t) \ + MACRO(cub_kind, chpl_kind, uint16_t) \ + MACRO(cub_kind, chpl_kind, uint32_t) \ + MACRO(cub_kind, chpl_kind, uint64_t) \ + MACRO(cub_kind, chpl_kind, float) \ + MACRO(cub_kind, chpl_kind, double); + #define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val) {\ @@ -17,21 +29,9 @@ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ CUDA_CALL(cuMemcpyDtoH(val, result, sizeof(data_type))); \ } -#define DEF_REDUCE_RET_VAL(cub_kind, chpl_kind) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int8_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int16_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int32_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, int64_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint8_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint16_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint32_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, uint64_t) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, float) \ - DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, double); - -DEF_REDUCE_RET_VAL(Sum, sum) -DEF_REDUCE_RET_VAL(Min, min) -DEF_REDUCE_RET_VAL(Max, max) +DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) +DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) +DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ @@ -50,17 +50,6 @@ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ *idx = result_host.key; \ } -#define DEF_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int8_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int16_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int32_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, int64_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint8_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint16_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint32_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, uint64_t) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, float) \ - DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, double); +DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) +DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) -DEF_REDUCE_RET_VAL_IDX(ArgMin, minloc) -DEF_REDUCE_RET_VAL_IDX(ArgMax, maxloc) From c56df6bed883e104a5a3c9196439bd68cfa78f2c Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 13 Oct 2023 15:34:48 -0700 Subject: [PATCH 12/42] Start separating impl implementations Signed-off-by: Engin Kayraklioglu --- runtime/src/chpl-gpu.c | 36 ++++++++++++++++++++++++ runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 5 ++-- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/runtime/src/chpl-gpu.c b/runtime/src/chpl-gpu.c index bded55034def..bf3f6d9ae955 100644 --- a/runtime/src/chpl-gpu.c +++ b/runtime/src/chpl-gpu.c @@ -697,4 +697,40 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) { chpl_gpu_impl_set_peer_access(dev1, dev2, enable); } +void 
chpl_gpu_sum_reduce_int8_t(int8_t *data, int n, int* val) { + CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t called\n"); + + int dev = chpl_task_getRequestedSubloc(); + chpl_gpu_impl_use_device(dev); + void* stream = get_stream(dev); + + chpl_gpu_impl_sum_reduce_int8_t(data, n, val, stream); + + if (chpl_gpu_sync_with_host) { + CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); + wait_stream(stream); + } + + CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); +} + +#define DEF_ONE_REDUCE_RET_VAL(kind, data_type)\ +void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ + data_type* val) { \ + CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \ + \ + int dev = chpl_task_getRequestedSubloc(); \ + chpl_gpu_impl_use_device(dev); \ + void* stream = get_stream(dev); \ + \ + chpl_gpu_impl_sum_reduce_int8_t(data, n, val, stream); + + if (chpl_gpu_sync_with_host) { + CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); + wait_stream(stream); + } + + CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); +} + #endif diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 16f719c72b44..75f3fd7bb986 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -28,10 +28,10 @@ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ CUDA_CALL(cuMemcpyDtoH(val, result, sizeof(data_type))); \ } - DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) +#undef DEF_ONE_REDUCE_RET_VAL #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ @@ -49,7 +49,8 @@ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ *val = result_host.value; \ *idx = result_host.key; \ } - DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) +#undef DEF_ONE_REDUCE_RET_VAL_IDX +#undef DEF_REDUCE From 7aa3642bb1578e74efeb1dfcf46de4477e4e87bd Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 16 Oct 2023 16:07:19 -0700 Subject: [PATCH 13/42] Seperate runtime interface from the implementation interface Signed-off-by: Engin Kayraklioglu --- runtime/include/chpl-gpu-impl.h | 21 ++++++++ runtime/include/chpl-gpu.h | 28 ++++------- runtime/src/chpl-gpu.c | 62 +++++++++++++++--------- runtime/src/gpu/nvidia/Makefile.share | 2 - runtime/src/gpu/nvidia/gpu-nvidia-aux.cc | 56 ++++++++++----------- 5 files changed, 99 insertions(+), 70 deletions(-) diff --git a/runtime/include/chpl-gpu-impl.h b/runtime/include/chpl-gpu-impl.h index 1dbd1a4563b3..6c01a07aea1f 100644 --- a/runtime/include/chpl-gpu-impl.h +++ b/runtime/include/chpl-gpu-impl.h @@ -76,6 +76,27 @@ void chpl_gpu_impl_stream_destroy(void* stream); bool chpl_gpu_impl_stream_ready(void* stream); void chpl_gpu_impl_stream_synchronize(void* stream); +#define DECL_ONE_REDUCE_IMPL_RET_VAL(chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream); + +GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, sum) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, min) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, max) + +#undef DECL_ONE_REDUCE_RET_VAL + +#define DECL_ONE_REDUCE_IMPL_RET_VAL_IDX(chpl_kind, data_type) \ +void 
chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream); + +GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, minloc) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, maxloc) + +#undef DECL_ONE_REDUCE_RET_VAL_IDX + #ifdef __cplusplus } #endif diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index a52d40f53fab..f3b5ae31dc8b 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -24,6 +24,7 @@ #include #include "chpl-tasks.h" #include "chpl-mem-desc.h" +#include "gpu/chpl-gpu-reduce-util.h" #ifdef __cplusplus extern "C" { @@ -149,33 +150,24 @@ size_t chpl_gpu_get_alloc_size(void* ptr); bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); - -#define DECL_REDUCE(MACRO, chpl_kind) \ - MACRO(chpl_kind, int8_t) \ - MACRO(chpl_kind, int16_t) \ - MACRO(chpl_kind, int32_t) \ - MACRO(chpl_kind, int64_t) \ - MACRO(chpl_kind, uint8_t) \ - MACRO(chpl_kind, uint16_t) \ - MACRO(chpl_kind, uint32_t) \ - MACRO(chpl_kind, uint64_t) \ - MACRO(chpl_kind, float) \ - MACRO(chpl_kind, double) - #define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val); -DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum); -DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, min); -DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL, max); +GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum); +GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, min); +GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, max); + +#undef DECL_ONE_REDUCE_RET_VAL #define DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx); -DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, minloc); -DECL_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, maxloc); +GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, minloc); +GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, maxloc); + +#undef DECL_ONE_REDUCE_RET_VAL_IDX #endif // HAS_GPU_LOCALE diff --git a/runtime/src/chpl-gpu.c b/runtime/src/chpl-gpu.c index bf3f6d9ae955..7a5508e7e0e7 100644 --- a/runtime/src/chpl-gpu.c +++ b/runtime/src/chpl-gpu.c @@ -43,6 +43,8 @@ bool chpl_gpu_use_stream_per_task = true; #include "chpl-env.h" #include "chpl-comm-compiler-macros.h" +#include "gpu/chpl-gpu-reduce-util.h" + void chpl_gpu_init(void) { chpl_gpu_impl_init(&chpl_gpu_num_devices); @@ -697,23 +699,6 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) { chpl_gpu_impl_set_peer_access(dev1, dev2, enable); } -void chpl_gpu_sum_reduce_int8_t(int8_t *data, int n, int* val) { - CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t called\n"); - - int dev = chpl_task_getRequestedSubloc(); - chpl_gpu_impl_use_device(dev); - void* stream = get_stream(dev); - - chpl_gpu_impl_sum_reduce_int8_t(data, n, val, stream); - - if (chpl_gpu_sync_with_host) { - CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); - wait_stream(stream); - } - - CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); -} - #define DEF_ONE_REDUCE_RET_VAL(kind, data_type)\ void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ data_type* val) { \ @@ -723,14 +708,45 @@ void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ chpl_gpu_impl_use_device(dev); \ void* stream = get_stream(dev); \ \ - chpl_gpu_impl_sum_reduce_int8_t(data, n, val, stream); + chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, stream); \ + \ + if (chpl_gpu_sync_with_host) { \ + CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \ + 
wait_stream(stream); \ + } \ + \ + CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); \ +} - if (chpl_gpu_sync_with_host) { - CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); - wait_stream(stream); - } +GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, sum) +GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, min) +GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, max) + +#undef DEF_ONE_REDUCE_RET_VAL - CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); +#define DEF_ONE_REDUCE_RET_VAL_IDX(kind, data_type)\ +void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ + data_type* val, int* idx) { \ + CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \ + \ + int dev = chpl_task_getRequestedSubloc(); \ + chpl_gpu_impl_use_device(dev); \ + void* stream = get_stream(dev); \ + \ + chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, idx, stream); \ + \ + if (chpl_gpu_sync_with_host) { \ + CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \ + wait_stream(stream); \ + } \ + \ + CHPL_GPU_DEBUG("chpl_gpu_sum_reduce_int8_t returned\n"); \ } +GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, minloc); +GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, maxloc); + +#undef DEF_ONE_REDUCE_RET_VAL_IDX + + #endif diff --git a/runtime/src/gpu/nvidia/Makefile.share b/runtime/src/gpu/nvidia/Makefile.share index 2716a17c89d7..41930a17bc01 100644 --- a/runtime/src/gpu/nvidia/Makefile.share +++ b/runtime/src/gpu/nvidia/Makefile.share @@ -21,8 +21,6 @@ SRCS = $(GPU_SRCS) GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) -$(info $$GPU_OBJS is [${GPU_OBJS}]) - RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version $(RUNTIME_OBJ_DIR)/gpu-nvidia-aux.o: gpu-nvidia-aux.cc \ diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc index 75f3fd7bb986..89fc8789e5a9 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc @@ -2,55 +2,57 @@ #include #include "chpl-gpu.h" +#include "chpl-gpu-impl.h" #include "../common/cuda-utils.h" - -#define DEF_REDUCE(MACRO, cub_kind, chpl_kind) \ - MACRO(cub_kind, chpl_kind, int8_t) \ - MACRO(cub_kind, chpl_kind, int16_t) \ - MACRO(cub_kind, chpl_kind, int32_t) \ - MACRO(cub_kind, chpl_kind, int64_t) \ - MACRO(cub_kind, chpl_kind, uint8_t) \ - MACRO(cub_kind, chpl_kind, uint16_t) \ - MACRO(cub_kind, chpl_kind, uint32_t) \ - MACRO(cub_kind, chpl_kind, uint64_t) \ - MACRO(cub_kind, chpl_kind, float) \ - MACRO(cub_kind, chpl_kind, double); +#include "gpu/chpl-gpu-reduce-util.h" #define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ -void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val) {\ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ CUdeviceptr result; \ CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ void* temp = NULL; \ size_t temp_bytes = 0; \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n); \ - CUDA_CALL(cuMemcpyDtoH(val, result, sizeof(data_type))); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ + CUDA_CALL(cuMemcpyDtoHAsync(val, result, sizeof(data_type),\ + (CUstream)stream)); \ } -DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) 
-DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) -DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) + #undef DEF_ONE_REDUCE_RET_VAL #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ -void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val, int* idx) {\ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ using kvp = cub::KeyValuePair; \ CUdeviceptr result; \ CUDA_CALL(cuMemAlloc(&result, sizeof(kvp))); \ void* temp = NULL; \ size_t temp_bytes = 0; \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ + (CUstream)stream);\ CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ - cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ + (CUstream)stream);\ kvp result_host; \ - CUDA_CALL(cuMemcpyDtoH(&result_host, result, sizeof(kvp))); \ + CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp),\ + (CUstream)stream)); \ *val = result_host.value; \ *idx = result_host.key; \ } -DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) -DEF_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) + #undef DEF_ONE_REDUCE_RET_VAL_IDX #undef DEF_REDUCE From cf1e526a0b016cddc6a9ef4b792c3e3a6a202fed Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 16 Oct 2023 16:14:56 -0700 Subject: [PATCH 14/42] Runtime cleanup Signed-off-by: Engin Kayraklioglu --- runtime/Makefile.help | 1 - runtime/src/gpu/nvidia/Makefile.share | 4 ++-- ...gpu-nvidia-aux.cc => gpu-nvidia-reduce.cc} | 24 +++++++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) rename runtime/src/gpu/nvidia/{gpu-nvidia-aux.cc => gpu-nvidia-reduce.cc} (74%) diff --git a/runtime/Makefile.help b/runtime/Makefile.help index 910277c18dab..cf0d29a72fb4 100644 --- a/runtime/Makefile.help +++ b/runtime/Makefile.help @@ -96,7 +96,6 @@ RUNTIME_OBJS = \ $(THREADS_OBJS) \ $(TIMERS_OBJS) \ $(COMM_OBJS) \ - $(CUB_OBJS) \ $(GPU_OBJS) \ $(MEM_COMMON_OBJS) \ $(QIO_OBJS) \ diff --git a/runtime/src/gpu/nvidia/Makefile.share b/runtime/src/gpu/nvidia/Makefile.share index 41930a17bc01..a0c109e22f0d 100644 --- a/runtime/src/gpu/nvidia/Makefile.share +++ b/runtime/src/gpu/nvidia/Makefile.share @@ -15,7 +15,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
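
For reference, every per-type function generated above follows CUB's two-phase convention: the first DeviceReduce call is made with a NULL temporary buffer and only reports how many scratch bytes are needed; the second call performs the actual reduction on the given stream. The following is a minimal standalone sketch of that pattern for a plain int array, not the runtime's code: it uses the CUDA runtime API and illustrative names (sumOnDevice, argMinOnDevice), whereas the wrappers above use the driver API, CUDA_CALL error checking, and per-task streams.

    // Sketch only: two-phase cub::DeviceReduce on a stream, CUDA runtime API.
    // Compile as CUDA C++ (the runtime achieves this by passing -x cuda).
    #include <cub/cub.cuh>

    int sumOnDevice(const int* d_in, int num_items, cudaStream_t stream) {
      int* d_out = nullptr;
      cudaMalloc((void**)&d_out, sizeof(int));

      void* d_temp = nullptr;
      size_t temp_bytes = 0;
      // pass 1: d_temp == NULL, so CUB only computes temp_bytes
      cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items, stream);
      cudaMalloc(&d_temp, temp_bytes);
      // pass 2: the real reduction, enqueued on `stream`
      cub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items, stream);

      int result = 0;
      cudaMemcpyAsync(&result, d_out, sizeof(int), cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);
      cudaFree(d_temp);
      cudaFree(d_out);
      return result;
    }

    // ArgMin/ArgMax write a cub::KeyValuePair<int, T>, which is why the
    // minloc/maxloc wrappers above copy back a struct and then unpack
    // .key (index) and .value separately.
    void argMinOnDevice(const int* d_in, int num_items, cudaStream_t stream,
                        int* idx, int* val) {
      using kvp = cub::KeyValuePair<int, int>;
      kvp* d_out = nullptr;
      cudaMalloc((void**)&d_out, sizeof(kvp));

      void* d_temp = nullptr;
      size_t temp_bytes = 0;
      cub::DeviceReduce::ArgMin(d_temp, temp_bytes, d_in, d_out, num_items, stream);
      cudaMalloc(&d_temp, temp_bytes);
      cub::DeviceReduce::ArgMin(d_temp, temp_bytes, d_in, d_out, num_items, stream);

      kvp host;
      cudaMemcpyAsync(&host, d_out, sizeof(kvp), cudaMemcpyDeviceToHost, stream);
      cudaStreamSynchronize(stream);
      *idx = host.key;
      *val = host.value;
      cudaFree(d_temp);
      cudaFree(d_out);
    }

The double call looks odd at first, but it is how CUB lets the caller own the scratch allocation: CUB never allocates device memory itself, it only tells the caller how much it needs.
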
-GPU_SRCS = gpu-nvidia.c gpu-nvidia-aux.cc +GPU_SRCS = gpu-nvidia.c gpu-nvidia-reduce.cc SRCS = $(GPU_SRCS) @@ -23,6 +23,6 @@ GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version -$(RUNTIME_OBJ_DIR)/gpu-nvidia-aux.o: gpu-nvidia-aux.cc \ +$(RUNTIME_OBJ_DIR)/gpu-nvidia-reduce.o: gpu-nvidia-reduce.cc \ $(RUNTIME_OBJ_DIR_STAMP) $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc similarity index 74% rename from runtime/src/gpu/nvidia/gpu-nvidia-aux.cc rename to runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc index 89fc8789e5a9..6c9f7a350c34 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-aux.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc @@ -1,3 +1,24 @@ +/* + * Copyright 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright 2004-2019 Cray Inc. + * Other additional copyright holders may be indicated within. * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAS_GPU_LOCALE + #include #include @@ -56,3 +77,6 @@ GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) #undef DEF_ONE_REDUCE_RET_VAL_IDX #undef DEF_REDUCE + +#endif // HAS_GPU_LOCALE + From 1489028227f050a0ec1837395518204c9dcd2938 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 16 Oct 2023 16:46:18 -0700 Subject: [PATCH 15/42] Add a common header Signed-off-by: Engin Kayraklioglu --- runtime/include/gpu/chpl-gpu-reduce-util.h | 47 ++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 runtime/include/gpu/chpl-gpu-reduce-util.h diff --git a/runtime/include/gpu/chpl-gpu-reduce-util.h b/runtime/include/gpu/chpl-gpu-reduce-util.h new file mode 100644 index 000000000000..6076fb0b98a5 --- /dev/null +++ b/runtime/include/gpu/chpl-gpu-reduce-util.h @@ -0,0 +1,47 @@ +/* + * Copyright 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright 2004-2019 Cray Inc. + * Other additional copyright holders may be indicated within. * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifdef HAS_GPU_LOCALE + +#define GPU_IMPL_REDUCE(MACRO, impl_kind, chpl_kind) \ + MACRO(impl_kind, chpl_kind, int8_t) \ + MACRO(impl_kind, chpl_kind, int16_t) \ + MACRO(impl_kind, chpl_kind, int32_t) \ + MACRO(impl_kind, chpl_kind, int64_t) \ + MACRO(impl_kind, chpl_kind, uint8_t) \ + MACRO(impl_kind, chpl_kind, uint16_t) \ + MACRO(impl_kind, chpl_kind, uint32_t) \ + MACRO(impl_kind, chpl_kind, uint64_t) \ + MACRO(impl_kind, chpl_kind, float) \ + MACRO(impl_kind, chpl_kind, double); + +#define GPU_REDUCE(MACRO, chpl_kind) \ + MACRO(chpl_kind, int8_t) \ + MACRO(chpl_kind, int16_t) \ + MACRO(chpl_kind, int32_t) \ + MACRO(chpl_kind, int64_t) \ + MACRO(chpl_kind, uint8_t) \ + MACRO(chpl_kind, uint16_t) \ + MACRO(chpl_kind, uint32_t) \ + MACRO(chpl_kind, uint64_t) \ + MACRO(chpl_kind, float) \ + MACRO(chpl_kind, double); + +#endif // HAS_GPU_LOCALE + From 60f921c918e13875c41f920a3d135332d2206146 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 16 Oct 2023 17:06:59 -0700 Subject: [PATCH 16/42] Blind implementation of reductions on AMD Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/amd/Makefile.include | 1 + runtime/src/gpu/amd/Makefile.share | 12 +++-- runtime/src/gpu/amd/gpu-amd-reduce.cc | 78 +++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 runtime/src/gpu/amd/gpu-amd-reduce.cc diff --git a/runtime/src/gpu/amd/Makefile.include b/runtime/src/gpu/amd/Makefile.include index 7ba37d695b24..7bce5c886202 100644 --- a/runtime/src/gpu/amd/Makefile.include +++ b/runtime/src/gpu/amd/Makefile.include @@ -21,5 +21,6 @@ GPU_SUBDIR = src/gpu/amd GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR) ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c +ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share diff --git a/runtime/src/gpu/amd/Makefile.share b/runtime/src/gpu/amd/Makefile.share index e8ad87a160ab..e8b6db7040d9 100644 --- a/runtime/src/gpu/amd/Makefile.share +++ b/runtime/src/gpu/amd/Makefile.share @@ -15,8 +15,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -SRCS = gpu-amd.c +GPU_SRCS = gpu-amd.c gpu-amd-reduce.cc -GPU_SRCS = gpu-amd.c +SRCS = $(GPU_SRCS) -GPU_OBJS = $(GPU_SRCS:%.c=$(GPU_OBJDIR)/%.o) +GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) + +RUNTIME_CXXFLAGS += -x hip + +$(RUNTIME_OBJ_DIR)/gpu-amd-reduce.o: gpu-amd-reduce.cc \ + $(RUNTIME_OBJ_DIR_STAMP) + $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc new file mode 100644 index 000000000000..3cbad5b629b4 --- /dev/null +++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc @@ -0,0 +1,78 @@ +/* + * Copyright 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright 2004-2019 Cray Inc. + * Other additional copyright holders may be indicated within. * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
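
To make the indirection above concrete: GPU_REDUCE and GPU_IMPL_REDUCE are X-macro lists, so each use stamps out one declaration or definition per supported element type. For example, GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum) in chpl-gpu.h and GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, sum) in chpl-gpu-impl.h expand to roughly the following prototypes (only two of the ten element types shown here):

    #include <stdint.h>

    /* user-facing runtime interface, no stream argument */
    void chpl_gpu_sum_reduce_int8_t(int8_t* data, int n, int8_t* val);
    void chpl_gpu_sum_reduce_int16_t(int16_t* data, int n, int16_t* val);
    /* ... int32_t through uint64_t, float, double ... */

    /* per-vendor implementation interface, takes the stream to run on */
    void chpl_gpu_impl_sum_reduce_int8_t(int8_t* data, int n, int8_t* val,
                                         void* stream);
    void chpl_gpu_impl_sum_reduce_int16_t(int16_t* data, int n, int16_t* val,
                                          void* stream);

These are exactly the names that the module code's getExternFuncName helper assembles ("chpl_gpu_" + op + "_reduce_" + C type name), which is how the Chapel-side gpu*Reduce wrappers bind to the generated C symbols.
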
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAS_GPU_LOCALE + +#include +#include + +#include "chpl-gpu.h" +#include "chpl-gpu-impl.h" +#include "gpu/chpl-gpu-reduce-util.h" + +#if ROCM_VERSION_MAJOR >= 5 +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ + hipDeviceptr_t result; \ + ROCM_CALL(hipMalloc(&result, sizeof(data_type))); \ + void* temp = NULL; \ + size_t temp_bytes = 0; \ + rocmprim::reduce(temp, temp_bytes, data, (data_type*)result, n,\ + rocmprim::impl_kind,\ + (hipStream_t)stream); \ + ROCM_CALL(hipMalloc(((CUdeviceptr*)&temp), temp_bytes)); \ + rocmprim::reduce(temp, temp_bytes, data, (data_type*)result, n,\ + rocmprim::impl_kind,\ + (hipStream_t)stream); \ + ROCM_CALL(hipMemcpyDtoHAsync(val, result, sizeof(data_type),\ + (CUstream)stream)); \ +} +#else +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ + chpl_internal_error("Reduction is not supported with AMD GPUs using ROCm version <5\n");\ +} +#endif + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, plus, sum) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, minimum, min) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, maximum, max) + +#undef DEF_ONE_REDUCE_RET_VAL + +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + chpl_internal_error(chpl_kind # " reduction is not supported with AMD GPUs\n");\ +} + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, unknown, minloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, unknown, maxloc) + +#undef DEF_ONE_REDUCE_RET_VAL_IDX + +#undef DEF_REDUCE + +#endif // HAS_GPU_LOCALE + From ab9b47519307db27b7d053538060a27816f217bf Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 16 Oct 2023 17:08:44 -0700 Subject: [PATCH 17/42] Add a compilerError in the module code Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 566d9461b473..5d81627b1f3b 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -426,6 +426,9 @@ module GPU return val; } else { + if CHPL_GPU == "amd" { + compilerError(op + " reduction is not supported on AMD GPUs"); + } var idx: int(32); var val: t; extern externFunc proc reduce_fn(data, size, ref val, ref idx); From 5ab3e5e55714565a01fa8e156ddf7987cce730c0 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 18 Oct 2023 17:39:39 -0500 Subject: [PATCH 18/42] Test/limit usage on AMD Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 6 +-- runtime/src/gpu/amd/Makefile.share | 5 +- runtime/src/gpu/amd/gpu-amd-reduce.cc | 64 +++++++++++++++++-------- runtime/src/gpu/amd/gpu-amd.c | 7 +-- test/gpu/native/reduction/basic.chpl | 7 ++- util/chplenv/compile_link_args_utils.py | 8 ++++ 6 files changed, 66 insertions(+), 31 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 5d81627b1f3b..e09d7f7fbac7 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -395,6 +395,9 @@ module GPU // ============================ private inline proc doGpuReduce(param op: string, ref A: [] ?t) { + if CHPL_GPU == "amd" 
then + compilerError("gpu*Reduce functions are not supported on AMD GPUs"); + proc chplTypeToCTypeName(type t) param { select t { when int(8) do return "int8_t"; @@ -426,9 +429,6 @@ module GPU return val; } else { - if CHPL_GPU == "amd" { - compilerError(op + " reduction is not supported on AMD GPUs"); - } var idx: int(32); var val: t; extern externFunc proc reduce_fn(data, size, ref val, ref idx); diff --git a/runtime/src/gpu/amd/Makefile.share b/runtime/src/gpu/amd/Makefile.share index e8b6db7040d9..79f63b4497f0 100644 --- a/runtime/src/gpu/amd/Makefile.share +++ b/runtime/src/gpu/amd/Makefile.share @@ -15,7 +15,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -GPU_SRCS = gpu-amd.c gpu-amd-reduce.cc +#GPU_SRCS = gpu-amd.c gpu-amd-reduce.cc +GPU_SRCS = gpu-amd-reduce.cc gpu-amd.c SRCS = $(GPU_SRCS) @@ -25,4 +26,4 @@ RUNTIME_CXXFLAGS += -x hip $(RUNTIME_OBJ_DIR)/gpu-amd-reduce.o: gpu-amd-reduce.cc \ $(RUNTIME_OBJ_DIR_STAMP) - $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< + $(CXX) -c -std=c++14 $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc index 3cbad5b629b4..68aa83f7619b 100644 --- a/runtime/src/gpu/amd/gpu-amd-reduce.cc +++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc @@ -19,31 +19,45 @@ #ifdef HAS_GPU_LOCALE +/* TODO uncomment these when the implementations are in #include -#include +#include +#include +*/ #include "chpl-gpu.h" #include "chpl-gpu-impl.h" #include "gpu/chpl-gpu-reduce-util.h" +#include "gpu/amd/util.h" -#if ROCM_VERSION_MAJOR >= 5 +// Engin: I can't get neither hipCUB nor rocprim to work. (hipCUB is a light +// wrapper around rocprim anyways). I filed +// https://github.com/ROCmSoftwarePlatform/hipCUB/issues/304, but I don't know +// if/when I'll hear back something. For now, I am merging the code that's +// supposed to work but doesn't instead of removing them from my branch. +#if 1 #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val,\ void* stream) {\ - hipDeviceptr_t result; \ - ROCM_CALL(hipMalloc(&result, sizeof(data_type))); \ + chpl_internal_error("This function shouldn't have been called. 
Reduction is not supported with AMD GPUs\n");\ +} +#elif ROCM_VERSION_MAJOR >= 5 +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ + data_type* result; \ + ROCM_CALL(hipMalloc(&result, sizeof(data_type)));\ void* temp = NULL; \ size_t temp_bytes = 0; \ - rocmprim::reduce(temp, temp_bytes, data, (data_type*)result, n,\ - rocmprim::impl_kind,\ - (hipStream_t)stream); \ - ROCM_CALL(hipMalloc(((CUdeviceptr*)&temp), temp_bytes)); \ - rocmprim::reduce(temp, temp_bytes, data, (data_type*)result, n,\ - rocmprim::impl_kind,\ - (hipStream_t)stream); \ + ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\ + 0, true));\ + ROCM_CALL(hipMalloc(((hipDeviceptr_t*)&temp), temp_bytes)); \ + ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\ + 0, true));\ ROCM_CALL(hipMemcpyDtoHAsync(val, result, sizeof(data_type),\ - (CUstream)stream)); \ + (hipStream_t)stream)); \ } #else #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ @@ -52,23 +66,35 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ void* stream) {\ chpl_internal_error("Reduction is not supported with AMD GPUs using ROCm version <5\n");\ } -#endif +#endif // 1 -GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, plus, sum) -GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, minimum, min) -GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, maximum, max) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) #undef DEF_ONE_REDUCE_RET_VAL +#if 1 +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\ +} +#else #define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream) {\ - chpl_internal_error(chpl_kind # " reduction is not supported with AMD GPUs\n");\ + // TODO I don't know any other specific issues with these versions. Should be + // able to whip up the implementation quickly once we figure out what's going + // wrong here. 
+ chpl_internal_error("Unimplemented"); } +#endif // 1 -GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, unknown, minloc) -GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, unknown, maxloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) #undef DEF_ONE_REDUCE_RET_VAL_IDX diff --git a/runtime/src/gpu/amd/gpu-amd.c b/runtime/src/gpu/amd/gpu-amd.c index b3b54752d5b7..475e8ccaaeb2 100644 --- a/runtime/src/gpu/amd/gpu-amd.c +++ b/runtime/src/gpu/amd/gpu-amd.c @@ -29,6 +29,7 @@ #include "chplcgfns.h" #include "chpl-env-gen.h" #include "chpl-linefile-support.h" +#include "gpu/amd/util.h" #include @@ -37,7 +38,7 @@ #include #include -static void chpl_gpu_rocm_check(int err, const char* file, int line) { +void chpl_gpu_rocm_check(int err, const char* file, int line) { if(err == hipErrorContextAlreadyInUse) { return; } if(err != hipSuccess) { const int msg_len = 256; @@ -51,10 +52,6 @@ static void chpl_gpu_rocm_check(int err, const char* file, int line) { } } -#define ROCM_CALL(call) do {\ - chpl_gpu_rocm_check((int)call, __FILE__, __LINE__);\ -} while(0); - static inline void* chpl_gpu_load_module(const char* fatbin_data) { hipModule_t rocm_module; diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/reduction/basic.chpl index 5a1ceef66952..21fe1bb56d50 100644 --- a/test/gpu/native/reduction/basic.chpl +++ b/test/gpu/native/reduction/basic.chpl @@ -1,4 +1,5 @@ use GPU; +use ChplConfig; config const n = 100; @@ -26,8 +27,10 @@ proc testType(type t) { test("sum", t); test("min", t); test("max", t); - test("minloc", t); - test("maxloc", t); + if CHPL_GPU != "amd" { + test("minloc", t); + test("maxloc", t); + } writeln(); } diff --git a/util/chplenv/compile_link_args_utils.py b/util/chplenv/compile_link_args_utils.py index fc9b325da3df..b1b46bd05d26 100644 --- a/util/chplenv/compile_link_args_utils.py +++ b/util/chplenv/compile_link_args_utils.py @@ -69,6 +69,14 @@ def get_runtime_includes_and_defines(): system.append("-isystem" + os.path.join(sdk_path, "hip", "include")) system.append("-isystem" + os.path.join(sdk_path, "hsa", "include")) + + # We need runtime to use the lld that ships with the rocm + # installation so that we can properly link hipCUB. 
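
For context on what the currently-disabled hipCUB branch is aiming for once the linked issue is resolved: hipCUB advertises a CUB-compatible DeviceReduce interface, so the enabled path should end up mirroring the NVIDIA wrapper almost line for line. The following is a rough, unverified sketch only (HIP runtime API, simplified error handling, illustrative function name); as the comments above note, this is the code path that does not yet build and link in the Chapel runtime environment.

    #include <hip/hip_runtime.h>
    #include <hipcub/hipcub.hpp>

    int sumOnDeviceHip(const int* d_in, int num_items, hipStream_t stream) {
      int* d_out = nullptr;
      hipMalloc((void**)&d_out, sizeof(int));

      void* d_temp = nullptr;
      size_t temp_bytes = 0;
      // same two-phase protocol as CUB: size query first, then the reduction
      hipcub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items, stream);
      hipMalloc(&d_temp, temp_bytes);
      hipcub::DeviceReduce::Sum(d_temp, temp_bytes, d_in, d_out, num_items, stream);

      int result = 0;
      hipMemcpyAsync(&result, d_out, sizeof(int), hipMemcpyDeviceToHost, stream);
      hipStreamSynchronize(stream);
      hipFree(d_temp);
      hipFree(d_out);
      return result;
    }
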
I believe this + # requirement is coming from using "-x hip" with clang + lld_path = os.path.join(sdk_path, "llvm/bin") + system.append("-B " + lld_path) + bundled.append("-B " + lld_path) + if mem == "jemalloc": # set -DCHPL_JEMALLOC_PREFIX=chpl_je_ # this is needed since it affects code inside of headers From 0819e3110d8f7b013956723d1981f0d39a3cf68a Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 18 Oct 2023 16:42:18 -0700 Subject: [PATCH 19/42] Fix an issue, add perf test Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 16 ++++---- .../native/reduction/reduceThroughput.chpl | 39 +++++++++++++++++++ 2 files changed, 47 insertions(+), 8 deletions(-) create mode 100644 test/gpu/native/reduction/reduceThroughput.chpl diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index e09d7f7fbac7..20a299904176 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -394,7 +394,7 @@ module GPU // Reductions // ============================ - private inline proc doGpuReduce(param op: string, ref A: [] ?t) { + private inline proc doGpuReduce(param op: string, const ref A: [] ?t) { if CHPL_GPU == "amd" then compilerError("gpu*Reduce functions are not supported on AMD GPUs"); @@ -425,22 +425,22 @@ module GPU if op == "sum" || op == "min" || op == "max" { var val: t; extern externFunc proc reduce_fn(data, size, ref val); - reduce_fn(c_ptrTo(A), A.size, val); + reduce_fn(c_ptrToConst(A), A.size, val); return val; } else { var idx: int(32); var val: t; extern externFunc proc reduce_fn(data, size, ref val, ref idx); - reduce_fn(c_ptrTo(A), A.size, val, idx); + reduce_fn(c_ptrToConst(A), A.size, val, idx); return (idx, val); } } - inline proc gpuSumReduce(ref A: [] ?t) do return doGpuReduce("sum", A); - inline proc gpuMinReduce(ref A: [] ?t) do return doGpuReduce("min", A); - inline proc gpuMaxReduce(ref A: [] ?t) do return doGpuReduce("max", A); - inline proc gpuMinLocReduce(ref A: [] ?t) do return doGpuReduce("minloc", A); - inline proc gpuMaxLocReduce(ref A: [] ?t) do return doGpuReduce("maxloc", A); + inline proc gpuSumReduce(const ref A: [] ?t) do return doGpuReduce("sum", A); + inline proc gpuMinReduce(const ref A: [] ?t) do return doGpuReduce("min", A); + inline proc gpuMaxReduce(const ref A: [] ?t) do return doGpuReduce("max", A); + inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A); + inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A); } diff --git a/test/gpu/native/reduction/reduceThroughput.chpl b/test/gpu/native/reduction/reduceThroughput.chpl new file mode 100644 index 000000000000..2fe0a196dd65 --- /dev/null +++ b/test/gpu/native/reduction/reduceThroughput.chpl @@ -0,0 +1,39 @@ +use GPU; +use Time; +use ChplConfig; + +config type dataType = int; +config param useGpu = CHPL_LOCALE_MODEL == "gpu"; + +config const n = 100; +config const reportPerf = true; + +var t: stopwatch; + +inline proc doSumReduce(const ref Arr) { + if useGpu then + return gpuSumReduce(Arr); + else { + return + reduce Arr; + } +} + +writeln("Using ", if useGpu then "gpu" else "cpu"); + +on if useGpu then here.gpus[0] else here { + var Arr: [0.. 
Date: Wed, 25 Oct 2023 14:46:10 -0700 Subject: [PATCH 20/42] Initial attempt to do multi-chunk reduction Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 78 ++++++++++++++++++++++++++-- runtime/src/chpl-gpu.c | 4 +- test/gpu/native/reduction/basic.chpl | 6 +-- 3 files changed, 77 insertions(+), 11 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 20a299904176..ec0bec0b8f9e 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -394,6 +394,9 @@ module GPU // Reductions // ============================ + @chpldoc.nodoc + config param gpuDebugReduce = false; + private inline proc doGpuReduce(param op: string, const ref A: [] ?t) { if CHPL_GPU == "amd" then compilerError("gpu*Reduce functions are not supported on AMD GPUs"); @@ -418,22 +421,87 @@ module GPU return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); } + inline proc subReduceVal(param op, ref accum: ?t, val: t) { + select op { + when "sum" do accum += val; + when "min" do accum = min(accum, val); + when "max" do accum = max(accum, val); + } + } + + inline proc subReduceValIdx(param op, const baseOffset, ref accum: ?t, + val: t) { + select op { + when "minloc" do + if accum[1] > val[1] then accum = val; + when "maxloc" do + if accum[1] < val[1] then accum = val; + } + + accum[0] += baseOffset; + } + + iter offsetsThatCanFitIn32Bits(size: int) { + use Math only divCeil; + const numChunks = divCeil(size, max(int(32))); + const standardChunkSize = divCeil(size, numChunks); + + if gpuDebugReduce then + writeln("Will use ", numChunks, " chunks of size ", standardChunkSize); + + foreach chunk in 0.. Date: Wed, 25 Oct 2023 16:14:39 -0700 Subject: [PATCH 21/42] Add a new test and fix an issue exposed by it Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 16 +++++++++++++--- test/gpu/native/reduction/largeArrays.chpl | 19 +++++++++++++++++++ .../gpu/native/reduction/largeArrays.execopts | 6 ++++++ test/gpu/native/reduction/largeArrays.good | 1 + 4 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 test/gpu/native/reduction/largeArrays.chpl create mode 100644 test/gpu/native/reduction/largeArrays.execopts create mode 100644 test/gpu/native/reduction/largeArrays.good diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index ec0bec0b8f9e..27ab172bdefa 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -442,8 +442,14 @@ module GPU } iter offsetsThatCanFitIn32Bits(size: int) { + // Engin: I've tried to get max(int(32)) to work as this bug is about CUB + // using `int` as the size in the interface. However, getting close to + // max(int(32)) also triggers the bug. So, I am choosing this as a + // round/safe value for the time being. 
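
To put numbers on the chunk-size choice discussed here: with chunkSize = 2,000,000,000, an array of 4,294,967,295 elements is split into divCeil(size, chunkSize) = 3 chunks of divCeil(size, 3) = 1,431,655,765 elements each, so every per-chunk CUB call stays well below the 32-bit int limit on its length parameter. A small host-side sketch of the same offset/length computation follows; it is C++ purely for illustration (the real logic is the Chapel iterator in this patch), and chunkOffsets is an illustrative name.

    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // (start, length) pairs whose lengths always fit in a 32-bit int
    std::vector<std::pair<int64_t, int64_t>>
    chunkOffsets(int64_t size, int64_t chunkSize = 2000000000) {
      const int64_t numChunks = (size + chunkSize - 1) / chunkSize;  // divCeil
      const int64_t standard  = (size + numChunks - 1) / numChunks;  // divCeil
      std::vector<std::pair<int64_t, int64_t>> out;
      for (int64_t c = 0; c < numChunks; c++) {
        const int64_t start = c * standard;
        // every chunk has the standard size except possibly the last,
        // whose length is size - start
        out.push_back({start, std::min(standard, size - start)});
      }
      return out;
    }
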
+ param chunkSize = 2_000_000_000; + use Math only divCeil; - const numChunks = divCeil(size, max(int(32))); + const numChunks = divCeil(size, chunkSize); const standardChunkSize = divCeil(size, numChunks); if gpuDebugReduce then @@ -453,9 +459,9 @@ module GPU const start = chunk*standardChunkSize; const curChunkSize = if start+standardChunkSize <= size then standardChunkSize - else size-standardChunkSize; + else size-start; if gpuDebugReduce then - writef("Chunk %i: (start=%i, curChunkSize=%i)", chunk, start, + writef("Chunk %i: (start=%i, curChunkSize=%i) ", chunk, start, curChunkSize); yield (start, curChunkSize); @@ -479,6 +485,8 @@ module GPU for (offset,size) in offsetsThatCanFitIn32Bits(A.size) { var curVal: t; reduce_fn(basePtr+offset, size, curVal); + if gpuDebugReduce then + writef(" (curVal=%i)\n", curVal); subReduceVal(op, val, curVal); } @@ -499,6 +507,8 @@ module GPU var curVal: t; reduce_fn(basePtr+offset, size, curVal, curIdx); subReduceValIdx(op, offset, ret, (curIdx, curVal)); + if gpuDebugReduce then + writef(" (curIdx=%i curVal=%i)\n", curIdx, curVal); } return ret; diff --git a/test/gpu/native/reduction/largeArrays.chpl b/test/gpu/native/reduction/largeArrays.chpl new file mode 100644 index 000000000000..5a7924d2961f --- /dev/null +++ b/test/gpu/native/reduction/largeArrays.chpl @@ -0,0 +1,19 @@ +use GPU; + +config const printResult = false; +config const n = 100; + +var result: uint(8); +on here.gpus[0] { + var Arr: [0..#n] uint(8) = 1; + + result = gpuSumReduce(Arr); +} + +if printResult then writeln("Result: ", result); + +// it is all 1's initially. It'll certainly overflow and the remainder will be +// the expected return value +const expected = n%(max(uint(8))+1); +if result != expected then + writef("Invalid result. Expected %u, actual %u\n", expected, result); diff --git a/test/gpu/native/reduction/largeArrays.execopts b/test/gpu/native/reduction/largeArrays.execopts new file mode 100644 index 000000000000..0af28add7fca --- /dev/null +++ b/test/gpu/native/reduction/largeArrays.execopts @@ -0,0 +1,6 @@ +--n=2_000_000_000 +--n=2_000_000_001 +--n=2_147_483_647 +--n=4_294_967_293 +--n=4_294_967_294 +--n=4_294_967_295 diff --git a/test/gpu/native/reduction/largeArrays.good b/test/gpu/native/reduction/largeArrays.good new file mode 100644 index 000000000000..abf41f07fd48 --- /dev/null +++ b/test/gpu/native/reduction/largeArrays.good @@ -0,0 +1 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. 
To suppress this warning, compile with --no-checks explicitly From 06be1a80cff3aaf55fdf93b09ed48a9916705168 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 25 Oct 2023 16:31:01 -0700 Subject: [PATCH 22/42] Add new test Signed-off-by: Engin Kayraklioglu --- .../native/reduction/largeArraysMinMax.chpl | 27 +++++++++++++++++++ .../reduction/largeArraysMinMax.execopts | 7 +++++ .../native/reduction/largeArraysMinMax.good | 1 + 3 files changed, 35 insertions(+) create mode 100644 test/gpu/native/reduction/largeArraysMinMax.chpl create mode 100644 test/gpu/native/reduction/largeArraysMinMax.execopts create mode 100644 test/gpu/native/reduction/largeArraysMinMax.good diff --git a/test/gpu/native/reduction/largeArraysMinMax.chpl b/test/gpu/native/reduction/largeArraysMinMax.chpl new file mode 100644 index 000000000000..99566d86e3bc --- /dev/null +++ b/test/gpu/native/reduction/largeArraysMinMax.chpl @@ -0,0 +1,27 @@ + +use GPU; + +config const kind = "min"; +const isMin = kind=="min"; +assert(isMin || kind=="max"); + +config const n = 2*max(int(32)); +config const setIdx = n-1; +assert(n>setIdx); + +config const printResult = false; +const expected = if isMin then 7:uint(8) else 13:uint(8); + +var result: uint(8); +on here.gpus[0] { + var Arr: [0..#n] uint(8) = 10; + + Arr[setIdx] = expected; + + result = if isMin then gpuMinReduce(Arr) else gpuMaxReduce(Arr); +} + +if printResult then writeln("Result: ", result); + +if result != expected then + writef("Invalid result. Expected %u, actual %u\n", expected, result); diff --git a/test/gpu/native/reduction/largeArraysMinMax.execopts b/test/gpu/native/reduction/largeArraysMinMax.execopts new file mode 100644 index 000000000000..2742bc714a67 --- /dev/null +++ b/test/gpu/native/reduction/largeArraysMinMax.execopts @@ -0,0 +1,7 @@ + #largeArraysMinMax +--setIdx=0 +--setIdx=1_999_999_999 +--setIdx=2_000_000_000 +--setIdx=2_000_000_001 +--setIdx=2_147_483_647 + diff --git a/test/gpu/native/reduction/largeArraysMinMax.good b/test/gpu/native/reduction/largeArraysMinMax.good new file mode 100644 index 000000000000..abf41f07fd48 --- /dev/null +++ b/test/gpu/native/reduction/largeArraysMinMax.good @@ -0,0 +1 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly From d012a9b466896ca1f9c4b20fa3fe08073daf28cd Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 25 Oct 2023 17:02:36 -0700 Subject: [PATCH 23/42] Expand test to minloc,maxloc. 
Fix a bug Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 8 +++---- .../native/reduction/largeArraysMinMax.chpl | 23 +++++++++++++------ .../reduction/largeArraysMinMax.compopts | 2 ++ 3 files changed, 21 insertions(+), 12 deletions(-) create mode 100644 test/gpu/native/reduction/largeArraysMinMax.compopts diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 27ab172bdefa..ba2b01b2300b 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -433,12 +433,10 @@ module GPU val: t) { select op { when "minloc" do - if accum[1] > val[1] then accum = val; + if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]); when "maxloc" do - if accum[1] < val[1] then accum = val; + if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]); } - - accum[0] += baseOffset; } iter offsetsThatCanFitIn32Bits(size: int) { @@ -508,7 +506,7 @@ module GPU reduce_fn(basePtr+offset, size, curVal, curIdx); subReduceValIdx(op, offset, ret, (curIdx, curVal)); if gpuDebugReduce then - writef(" (curIdx=%i curVal=%i)\n", curIdx, curVal); + writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret); } return ret; diff --git a/test/gpu/native/reduction/largeArraysMinMax.chpl b/test/gpu/native/reduction/largeArraysMinMax.chpl index 99566d86e3bc..dc660bc1d0f7 100644 --- a/test/gpu/native/reduction/largeArraysMinMax.chpl +++ b/test/gpu/native/reduction/largeArraysMinMax.chpl @@ -1,6 +1,7 @@ - use GPU; +config param withLoc = false; + config const kind = "min"; const isMin = kind=="min"; assert(isMin || kind=="max"); @@ -10,18 +11,26 @@ config const setIdx = n-1; assert(n>setIdx); config const printResult = false; -const expected = if isMin then 7:uint(8) else 13:uint(8); -var result: uint(8); +const expectedVal = if isMin then 7:uint(8) else 13:uint(8); +const expected = if withLoc then (setIdx, expectedVal) else expectedVal; + +inline proc doReduce(Arr) { + if withLoc then + return if isMin then gpuMinLocReduce(Arr) else gpuMaxLocReduce(Arr); + else + return if isMin then gpuMinReduce(Arr) else gpuMaxReduce(Arr); +} + +var result: if withLoc then (int, uint(8)) else uint(8); on here.gpus[0] { var Arr: [0..#n] uint(8) = 10; + Arr[setIdx] = expectedVal; - Arr[setIdx] = expected; - - result = if isMin then gpuMinReduce(Arr) else gpuMaxReduce(Arr); + result = doReduce(Arr); } if printResult then writeln("Result: ", result); if result != expected then - writef("Invalid result. Expected %u, actual %u\n", expected, result); + writef("Invalid result. Expected %?, actual %?\n", expected, result); diff --git a/test/gpu/native/reduction/largeArraysMinMax.compopts b/test/gpu/native/reduction/largeArraysMinMax.compopts new file mode 100644 index 000000000000..087ba6a454e9 --- /dev/null +++ b/test/gpu/native/reduction/largeArraysMinMax.compopts @@ -0,0 +1,2 @@ +-swithLoc=false +-swithLoc=true From e85a00ceb4f994d255e6c1d34d83f92ed46d21c4 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 25 Oct 2023 17:28:18 -0700 Subject: [PATCH 24/42] Make the new functions work with cpu-as-device. 
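
The bug fix in this patch is easiest to see from the host side: each per-chunk call returns an index relative to the start of its chunk, so the chunk's base offset must be folded in when (and only when) that chunk's candidate wins the comparison, rather than being added to the running accumulator unconditionally. Below is a simplified sketch of that combination step (non-empty array assumed, fixed-size chunks for brevity), with a plain host loop standing in for the generated chpl_gpu_minloc_reduce_* call; argMinChunk and argMinAll are illustrative names, not runtime symbols.

    #include <algorithm>
    #include <cstdint>
    #include <utility>

    // stand-in for the per-chunk GPU reduction (cub::DeviceReduce::ArgMin in
    // the real runtime): returns (chunk-local index, value) of the minimum
    static std::pair<int, int64_t> argMinChunk(const int64_t* data, int n) {
      std::pair<int, int64_t> best{0, data[0]};            // assumes n >= 1
      for (int i = 1; i < n; i++)
        if (data[i] < best.second) best = {i, data[i]};
      return best;
    }

    // chunkSize is assumed <= INT32_MAX so each length fits the int parameter
    std::pair<int64_t, int64_t> argMinAll(const int64_t* data, int64_t size,
                                          int64_t chunkSize) {
      std::pair<int64_t, int64_t> best{0, INT64_MAX};
      for (int64_t start = 0; start < size; start += chunkSize) {
        const int64_t len = std::min(chunkSize, size - start);
        auto [localIdx, val] = argMinChunk(data + start, (int)len);
        if (val < best.second)
          best = {start + localIdx, val};  // shift by the chunk's base offset here
      }
      return best;
    }
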
Add skipifs Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 23 ++++++++++++++- runtime/src/gpu/cpu/gpu-cpu.c | 29 +++++++++++++++++++ test/gpu/native/reduction/SKIPIF | 2 ++ test/gpu/native/reduction/largeArrays.skipif | 4 +++ .../native/reduction/largeArraysMinMax.skipif | 4 +++ 5 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 test/gpu/native/reduction/SKIPIF create mode 100644 test/gpu/native/reduction/largeArrays.skipif create mode 100644 test/gpu/native/reduction/largeArraysMinMax.skipif diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index ba2b01b2300b..f866c779e814 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -398,8 +398,29 @@ module GPU config param gpuDebugReduce = false; private inline proc doGpuReduce(param op: string, const ref A: [] ?t) { - if CHPL_GPU == "amd" then + if op != "sum" && op != "min" && op != "max" && + op != "minloc" && op != "maxloc" { + + compilerError("Unexpected reduction kind in doGpuReduce: ", op); + } + + + if CHPL_GPU == "amd" { compilerError("gpu*Reduce functions are not supported on AMD GPUs"); + } + else if CHPL_GPU == "cpu" { + select op { + when "sum" do return + reduce A; + when "min" do return min reduce A; + when "max" do return max reduce A; + when "minloc" do return minloc reduce zip (A.domain, A); + when "maxloc" do return maxloc reduce zip (A.domain, A); + } + } + else { + compilerAssert(CHPL_GPU=="nvidia"); + } + proc chplTypeToCTypeName(type t) param { select t { diff --git a/runtime/src/gpu/cpu/gpu-cpu.c b/runtime/src/gpu/cpu/gpu-cpu.c index e76f0dcb359d..a3ae1754e57c 100644 --- a/runtime/src/gpu/cpu/gpu-cpu.c +++ b/runtime/src/gpu/cpu/gpu-cpu.c @@ -157,4 +157,33 @@ bool chpl_gpu_impl_stream_ready(void* stream) { void chpl_gpu_impl_stream_synchronize(void* stream) { } +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ + chpl_internal_error("This function shouldn't have been called. "\ + "cpu-as-device mode should handle reductions in "\ + "the module code\n");\ +} + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) + +#undef DEF_ONE_REDUCE_RET_VAL + +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + chpl_internal_error("This function shouldn't have been called. "\ + "cpu-as-device mode should handle reductions in "\ + "the module code\n");\ +} + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) + +#undef DEF_ONE_REDUCE_RET_VAL_IDX + #endif // HAS_GPU_LOCALE diff --git a/test/gpu/native/reduction/SKIPIF b/test/gpu/native/reduction/SKIPIF new file mode 100644 index 000000000000..75268eb04f42 --- /dev/null +++ b/test/gpu/native/reduction/SKIPIF @@ -0,0 +1,2 @@ +# Reductions are not supported with AMD gpus yet. +CHPL_GPU==amd diff --git a/test/gpu/native/reduction/largeArrays.skipif b/test/gpu/native/reduction/largeArrays.skipif new file mode 100644 index 000000000000..dc81184aa7f6 --- /dev/null +++ b/test/gpu/native/reduction/largeArrays.skipif @@ -0,0 +1,4 @@ +# this test is to exercise a code path that will never be run with +# cpu-as-device. Moreover, the large size makes this mode take too long to +# finish. 
So, I am skipping this test. +CHPL_GPU==cpu diff --git a/test/gpu/native/reduction/largeArraysMinMax.skipif b/test/gpu/native/reduction/largeArraysMinMax.skipif new file mode 100644 index 000000000000..dc81184aa7f6 --- /dev/null +++ b/test/gpu/native/reduction/largeArraysMinMax.skipif @@ -0,0 +1,4 @@ +# this test is to exercise a code path that will never be run with +# cpu-as-device. Moreover, the large size makes this mode take too long to +# finish. So, I am skipping this test. +CHPL_GPU==cpu From 186ad540ef148c5867d231dbadbad22845aee170 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 25 Oct 2023 20:44:30 -0700 Subject: [PATCH 25/42] Add the missing good file Signed-off-by: Engin Kayraklioglu --- test/gpu/native/reduction/reduceThroughput.good | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 test/gpu/native/reduction/reduceThroughput.good diff --git a/test/gpu/native/reduction/reduceThroughput.good b/test/gpu/native/reduction/reduceThroughput.good new file mode 100644 index 000000000000..4b12519e9850 --- /dev/null +++ b/test/gpu/native/reduction/reduceThroughput.good @@ -0,0 +1,3 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly +Using gpu +4950 From 4b6953f2116af377adb81b9c1e27294027693ba4 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Wed, 25 Oct 2023 21:15:14 -0700 Subject: [PATCH 26/42] Remove a trailing whitespace Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/amd/Makefile.share | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/runtime/src/gpu/amd/Makefile.share b/runtime/src/gpu/amd/Makefile.share index 79f63b4497f0..410b4f724e82 100644 --- a/runtime/src/gpu/amd/Makefile.share +++ b/runtime/src/gpu/amd/Makefile.share @@ -15,8 +15,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-#GPU_SRCS = gpu-amd.c gpu-amd-reduce.cc -GPU_SRCS = gpu-amd-reduce.cc gpu-amd.c +GPU_SRCS = gpu-amd-reduce.cc gpu-amd.c SRCS = $(GPU_SRCS) From aca38c46ab9dd3c0052c62e47007bf06318fc1db Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 26 Oct 2023 09:52:14 -0700 Subject: [PATCH 27/42] Revert some of the AMD changes Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/amd/gpu-amd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/runtime/src/gpu/amd/gpu-amd.c b/runtime/src/gpu/amd/gpu-amd.c index 475e8ccaaeb2..b3b54752d5b7 100644 --- a/runtime/src/gpu/amd/gpu-amd.c +++ b/runtime/src/gpu/amd/gpu-amd.c @@ -29,7 +29,6 @@ #include "chplcgfns.h" #include "chpl-env-gen.h" #include "chpl-linefile-support.h" -#include "gpu/amd/util.h" #include @@ -38,7 +37,7 @@ #include #include -void chpl_gpu_rocm_check(int err, const char* file, int line) { +static void chpl_gpu_rocm_check(int err, const char* file, int line) { if(err == hipErrorContextAlreadyInUse) { return; } if(err != hipSuccess) { const int msg_len = 256; @@ -52,6 +51,10 @@ void chpl_gpu_rocm_check(int err, const char* file, int line) { } } +#define ROCM_CALL(call) do {\ + chpl_gpu_rocm_check((int)call, __FILE__, __LINE__);\ +} while(0); + static inline void* chpl_gpu_load_module(const char* fatbin_data) { hipModule_t rocm_module; From 257a6aea6afb32f858a52862bbc49f3d8be547d2 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 26 Oct 2023 09:52:42 -0700 Subject: [PATCH 28/42] Add the missing execopts Signed-off-by: Engin Kayraklioglu --- test/gpu/native/reduction/reduceThroughput.execopts | 1 + 1 file changed, 1 insertion(+) create mode 100644 test/gpu/native/reduction/reduceThroughput.execopts diff --git a/test/gpu/native/reduction/reduceThroughput.execopts b/test/gpu/native/reduction/reduceThroughput.execopts new file mode 100644 index 000000000000..367868626392 --- /dev/null +++ b/test/gpu/native/reduction/reduceThroughput.execopts @@ -0,0 +1 @@ +--reportPerf=false From 9c49498f4064c36d3d99df0dc102035564a6b041 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Thu, 26 Oct 2023 15:49:56 -0700 Subject: [PATCH 29/42] Remove an include Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/amd/gpu-amd-reduce.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc index 68aa83f7619b..e9a39dfbc25d 100644 --- a/runtime/src/gpu/amd/gpu-amd-reduce.cc +++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc @@ -28,7 +28,6 @@ #include "chpl-gpu.h" #include "chpl-gpu-impl.h" #include "gpu/chpl-gpu-reduce-util.h" -#include "gpu/amd/util.h" // Engin: I can't get neither hipCUB nor rocprim to work. (hipCUB is a light // wrapper around rocprim anyways). 
I filed From 1627385447572afa4ba93bce6413b8fb844b0be7 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 09:21:17 -0700 Subject: [PATCH 30/42] Relocate tests into a new noAmd directory Signed-off-by: Engin Kayraklioglu --- test/gpu/native/{reduction => noAmd}/SKIPIF | 0 test/gpu/native/{ => noAmd}/reduction/basic.chpl | 0 test/gpu/native/{ => noAmd}/reduction/basic.good | 0 test/gpu/native/{ => noAmd}/reduction/largeArrays.chpl | 0 test/gpu/native/{ => noAmd}/reduction/largeArrays.execopts | 0 test/gpu/native/{ => noAmd}/reduction/largeArrays.good | 0 test/gpu/native/{ => noAmd}/reduction/largeArrays.skipif | 0 test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.chpl | 0 test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.compopts | 0 test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.execopts | 0 test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.good | 0 test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.skipif | 0 test/gpu/native/{ => noAmd}/reduction/reduceThroughput.chpl | 0 test/gpu/native/{ => noAmd}/reduction/reduceThroughput.execopts | 0 test/gpu/native/{ => noAmd}/reduction/reduceThroughput.good | 0 15 files changed, 0 insertions(+), 0 deletions(-) rename test/gpu/native/{reduction => noAmd}/SKIPIF (100%) rename test/gpu/native/{ => noAmd}/reduction/basic.chpl (100%) rename test/gpu/native/{ => noAmd}/reduction/basic.good (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArrays.chpl (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArrays.execopts (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArrays.good (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArrays.skipif (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.chpl (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.compopts (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.execopts (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.good (100%) rename test/gpu/native/{ => noAmd}/reduction/largeArraysMinMax.skipif (100%) rename test/gpu/native/{ => noAmd}/reduction/reduceThroughput.chpl (100%) rename test/gpu/native/{ => noAmd}/reduction/reduceThroughput.execopts (100%) rename test/gpu/native/{ => noAmd}/reduction/reduceThroughput.good (100%) diff --git a/test/gpu/native/reduction/SKIPIF b/test/gpu/native/noAmd/SKIPIF similarity index 100% rename from test/gpu/native/reduction/SKIPIF rename to test/gpu/native/noAmd/SKIPIF diff --git a/test/gpu/native/reduction/basic.chpl b/test/gpu/native/noAmd/reduction/basic.chpl similarity index 100% rename from test/gpu/native/reduction/basic.chpl rename to test/gpu/native/noAmd/reduction/basic.chpl diff --git a/test/gpu/native/reduction/basic.good b/test/gpu/native/noAmd/reduction/basic.good similarity index 100% rename from test/gpu/native/reduction/basic.good rename to test/gpu/native/noAmd/reduction/basic.good diff --git a/test/gpu/native/reduction/largeArrays.chpl b/test/gpu/native/noAmd/reduction/largeArrays.chpl similarity index 100% rename from test/gpu/native/reduction/largeArrays.chpl rename to test/gpu/native/noAmd/reduction/largeArrays.chpl diff --git a/test/gpu/native/reduction/largeArrays.execopts b/test/gpu/native/noAmd/reduction/largeArrays.execopts similarity index 100% rename from test/gpu/native/reduction/largeArrays.execopts rename to test/gpu/native/noAmd/reduction/largeArrays.execopts diff --git a/test/gpu/native/reduction/largeArrays.good b/test/gpu/native/noAmd/reduction/largeArrays.good similarity index 100% 
rename from test/gpu/native/reduction/largeArrays.good rename to test/gpu/native/noAmd/reduction/largeArrays.good diff --git a/test/gpu/native/reduction/largeArrays.skipif b/test/gpu/native/noAmd/reduction/largeArrays.skipif similarity index 100% rename from test/gpu/native/reduction/largeArrays.skipif rename to test/gpu/native/noAmd/reduction/largeArrays.skipif diff --git a/test/gpu/native/reduction/largeArraysMinMax.chpl b/test/gpu/native/noAmd/reduction/largeArraysMinMax.chpl similarity index 100% rename from test/gpu/native/reduction/largeArraysMinMax.chpl rename to test/gpu/native/noAmd/reduction/largeArraysMinMax.chpl diff --git a/test/gpu/native/reduction/largeArraysMinMax.compopts b/test/gpu/native/noAmd/reduction/largeArraysMinMax.compopts similarity index 100% rename from test/gpu/native/reduction/largeArraysMinMax.compopts rename to test/gpu/native/noAmd/reduction/largeArraysMinMax.compopts diff --git a/test/gpu/native/reduction/largeArraysMinMax.execopts b/test/gpu/native/noAmd/reduction/largeArraysMinMax.execopts similarity index 100% rename from test/gpu/native/reduction/largeArraysMinMax.execopts rename to test/gpu/native/noAmd/reduction/largeArraysMinMax.execopts diff --git a/test/gpu/native/reduction/largeArraysMinMax.good b/test/gpu/native/noAmd/reduction/largeArraysMinMax.good similarity index 100% rename from test/gpu/native/reduction/largeArraysMinMax.good rename to test/gpu/native/noAmd/reduction/largeArraysMinMax.good diff --git a/test/gpu/native/reduction/largeArraysMinMax.skipif b/test/gpu/native/noAmd/reduction/largeArraysMinMax.skipif similarity index 100% rename from test/gpu/native/reduction/largeArraysMinMax.skipif rename to test/gpu/native/noAmd/reduction/largeArraysMinMax.skipif diff --git a/test/gpu/native/reduction/reduceThroughput.chpl b/test/gpu/native/noAmd/reduction/reduceThroughput.chpl similarity index 100% rename from test/gpu/native/reduction/reduceThroughput.chpl rename to test/gpu/native/noAmd/reduction/reduceThroughput.chpl diff --git a/test/gpu/native/reduction/reduceThroughput.execopts b/test/gpu/native/noAmd/reduction/reduceThroughput.execopts similarity index 100% rename from test/gpu/native/reduction/reduceThroughput.execopts rename to test/gpu/native/noAmd/reduction/reduceThroughput.execopts diff --git a/test/gpu/native/reduction/reduceThroughput.good b/test/gpu/native/noAmd/reduction/reduceThroughput.good similarity index 100% rename from test/gpu/native/reduction/reduceThroughput.good rename to test/gpu/native/noAmd/reduction/reduceThroughput.good From ab8a348f0032a45ed796d5db369611eaf919b290 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 09:48:24 -0700 Subject: [PATCH 31/42] Add a user facing error message for unknown types and a test to lock the behavior Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 2 ++ test/gpu/native/noAmd/reduction/stringError.chpl | 7 +++++++ test/gpu/native/noAmd/reduction/stringError.good | 5 +++++ test/gpu/native/noAmd/reduction/stringError.prediff | 4 ++++ 4 files changed, 18 insertions(+) create mode 100644 test/gpu/native/noAmd/reduction/stringError.chpl create mode 100644 test/gpu/native/noAmd/reduction/stringError.good create mode 100755 test/gpu/native/noAmd/reduction/stringError.prediff diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index f866c779e814..cfc5b1313b41 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -435,6 +435,8 @@ module GPU when real(32) do return "float"; when real(64) do return "double"; } + + 
compilerError("Arrays with ", t:string, " elements cannot be reduced"); return "unknown"; } diff --git a/test/gpu/native/noAmd/reduction/stringError.chpl b/test/gpu/native/noAmd/reduction/stringError.chpl new file mode 100644 index 000000000000..9045408aed4b --- /dev/null +++ b/test/gpu/native/noAmd/reduction/stringError.chpl @@ -0,0 +1,7 @@ +use GPU; + +on here.gpus[0] { + var Arr: [1..10] string; + + writeln(gpuSumReduce(Arr)); +} diff --git a/test/gpu/native/noAmd/reduction/stringError.good b/test/gpu/native/noAmd/reduction/stringError.good new file mode 100644 index 000000000000..b08203390ef8 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/stringError.good @@ -0,0 +1,5 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly +$CHPL_HOME/modules/standard/GPU.chpl:prediffed: In function 'doGpuReduce': +$CHPL_HOME/modules/standard/GPU.chpl:prediffed: error: Arrays with string elements cannot be reduced + $CHPL_HOME/modules/standard/GPU.chpl:prediffed: called as doGpuReduce(param op = "sum", A: [domain(1,int(64),one)] string) from function 'gpuSumReduce' + stringError.chpl:prediffed: called as gpuSumReduce(A: [domain(1,int(64),one)] string) diff --git a/test/gpu/native/noAmd/reduction/stringError.prediff b/test/gpu/native/noAmd/reduction/stringError.prediff new file mode 100755 index 000000000000..082c5199fbe7 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/stringError.prediff @@ -0,0 +1,4 @@ +#!/bin/sh + +sed -e "s/\.chpl:[0-9]\+:/.chpl:prediffed:/" $2 > $2.tmp +mv $2.tmp $2 From 7725662163bdfceccd6c6b4d07ca0dea9f5ad134 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 09:52:03 -0700 Subject: [PATCH 32/42] Add more fall-through otherwises Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index cfc5b1313b41..e31e217314b3 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -415,6 +415,7 @@ module GPU when "max" do return max reduce A; when "minloc" do return minloc reduce zip (A.domain, A); when "maxloc" do return maxloc reduce zip (A.domain, A); + otherwise do compilerError("Unknown reduction operation"); } } else { @@ -434,9 +435,9 @@ module GPU when uint(64) do return "uint64_t"; when real(32) do return "float"; when real(64) do return "double"; + otherwise do + compilerError("Arrays with ", t:string, " elements cannot be reduced"); } - - compilerError("Arrays with ", t:string, " elements cannot be reduced"); return "unknown"; } @@ -449,6 +450,7 @@ module GPU when "sum" do accum += val; when "min" do accum = min(accum, val); when "max" do accum = max(accum, val); + otherwise do compilerError("Unknown reduction type"); } } @@ -459,6 +461,7 @@ module GPU if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]); when "maxloc" do if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]); + otherwise do compilerError("Unknown reduction type"); } } From 6da77ee8958d42a4e8a9b9f0187cb404383729f1 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 10:08:33 -0700 Subject: [PATCH 33/42] Start adding documentation Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index e31e217314b3..87ddee297b6a 100644 --- 
a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -539,10 +539,48 @@ module GPU } } + /* + Perform sum-reduction on an array. The array must be allocated on + GPU-accessible memory. The function should not be called from inside a + GPU-eligible loop. Only arrays with ``int``, ``uint`` and ``real`` types + are supported. A simple example is as follows. + + .. code-block:: chapel + + on here.gpus[0] { + var Arr: [1..n] int; // will be allocated in GPU-accesible memory + ... + var sum = gpuSumReduce(Arr); + } + */ inline proc gpuSumReduce(const ref A: [] ?t) do return doGpuReduce("sum", A); + + /* + Perform min-reduction on an array. The array must be allocated on + GPU-accessible memory. The function must be called from the host. Only + arrays with ``int``, ``uint`` and ``real`` types are supported. + */ inline proc gpuMinReduce(const ref A: [] ?t) do return doGpuReduce("min", A); + + /* + Perform max-reduction on an array. The array must be allocated on + GPU-accessible memory. The function must be called from the host. Only + arrays with ``int``, ``uint`` and ``real`` types are supported. + */ inline proc gpuMaxReduce(const ref A: [] ?t) do return doGpuReduce("max", A); + + /* + Perform minloc-reduction on an array. The array must be allocated on + GPU-accessible memory. The function must be called from the host. Only + arrays with ``int``, ``uint`` and ``real`` types are supported. + */ inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A); + + /* + Perform maxloc-reduction on an array. The array must be allocated on + GPU-accessible memory. The function must be called from the host. Only + arrays with ``int``, ``uint`` and ``real`` types are supported. + */ inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A); } From 889ef7325b834106b66a811591762d02db92a25e Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 10:12:11 -0700 Subject: [PATCH 34/42] Add one more fallthrough, unify error messages Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 87ddee297b6a..4aa1d6ef7495 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -415,7 +415,7 @@ module GPU when "max" do return max reduce A; when "minloc" do return minloc reduce zip (A.domain, A); when "maxloc" do return maxloc reduce zip (A.domain, A); - otherwise do compilerError("Unknown reduction operation"); + otherwise do compilerError("Unknown reduction operation: ", op); } } else { @@ -450,7 +450,7 @@ module GPU when "sum" do accum += val; when "min" do accum = min(accum, val); when "max" do accum = max(accum, val); - otherwise do compilerError("Unknown reduction type"); + otherwise do compilerError("Unknown reduction operation: ", op); } } @@ -461,7 +461,7 @@ module GPU if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]); when "maxloc" do if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]); - otherwise do compilerError("Unknown reduction type"); + otherwise do compilerError("Unknown reduction operation: ", op); } } @@ -516,7 +516,7 @@ module GPU return val; } - else { + else if op == "minloc" || op == "maxloc" { var ret: (int, t); if op == "minloc" then ret[1] = max(t); @@ -537,6 +537,9 @@ module GPU return ret; } + else { + compilerError("Unknown reduction operation: ", op); + } } /* From 377f97dc60671eaebd5e8c079c337545024ca198 Mon Sep 17 00:00:00 2001 From: 
Engin Kayraklioglu Date: Fri, 3 Nov 2023 10:15:52 -0700 Subject: [PATCH 35/42] Free runtime memory that we were leaking before Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc index 6c9f7a350c34..6e8828cae16d 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc @@ -42,6 +42,7 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ (CUstream)stream); \ CUDA_CALL(cuMemcpyDtoHAsync(val, result, sizeof(data_type),\ (CUstream)stream)); \ + CUDA_CALL(cuMemFree(result)); \ } GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) @@ -69,6 +70,7 @@ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ (CUstream)stream)); \ *val = result_host.value; \ *idx = result_host.key; \ + CUDA_CALL(cuMemFree(result)); \ } GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) From 1675b9e81990fd14172c3746ab65439238f43463 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 11:21:01 -0700 Subject: [PATCH 36/42] A big refactor to reduce code duplication significantly Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 100 ++++++++++---------- runtime/include/chpl-gpu-impl.h | 23 ++--- runtime/include/chpl-gpu.h | 22 ++--- runtime/src/chpl-gpu.c | 37 ++------ runtime/src/gpu/amd/gpu-amd-reduce.cc | 4 +- runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc | 5 +- 6 files changed, 79 insertions(+), 112 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 4aa1d6ef7495..63452e5b95e4 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -445,18 +445,31 @@ module GPU return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); } - inline proc subReduceVal(param op, ref accum: ?t, val: t) { - select op { - when "sum" do accum += val; - when "min" do accum = min(accum, val); - when "max" do accum = max(accum, val); - otherwise do compilerError("Unknown reduction operation: ", op); - } + proc isValReduce(param op) param { + return op=="sum" || op=="min" || op=="max"; } - inline proc subReduceValIdx(param op, const baseOffset, ref accum: ?t, - val: t) { + proc isValIdxReduce(param op) param { + return op=="minloc" || op=="maxloc"; + } + + inline proc subReduceValIdx(param op, const baseOffset, ref accum, val) { + // do some type checking to be safe + compilerAssert(isTupleValue(val)); + if isTupleValue(accum) { + compilerAssert(isValIdxReduce(op)); + compilerAssert(val[1].type == accum[1].type); + + } + else { + compilerAssert(isValReduce(op)); + compilerAssert(val[1].type == accum.type); + } + select op { + when "sum" do accum += val[1]; + when "min" do accum = min(accum, val[1]); + when "max" do accum = max(accum, val[1]); when "minloc" do if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]); when "maxloc" do @@ -494,52 +507,41 @@ module GPU use CTypes; + // find the extern function we'll use param externFunc = getExternFuncName(op, t); - - if op == "sum" || op == "min" || op == "max" { - var val: t; - if op == "min" then - val = max(t); - else if op == "max" then - val = min(t); - - extern externFunc proc reduce_fn(data, size, ref val); - - const basePtr = c_ptrToConst(A); - for (offset,size) in offsetsThatCanFitIn32Bits(A.size) { - var curVal: t; - reduce_fn(basePtr+offset, size, curVal); - if gpuDebugReduce then - writef(" (curVal=%i)\n", curVal); - subReduceVal(op, 
val, curVal); - } - - return val; + extern externFunc proc reduce_fn(data, size, ref val, ref idx); + + // initialize the return value + var ret; + if isValReduce(op) { + var retTmp: t; + if op == "min" then retTmp = max(t); + else if op == "max" then retTmp = min(t); + ret = retTmp; } - else if op == "minloc" || op == "maxloc" { - var ret: (int, t); - if op == "minloc" then - ret[1] = max(t); - else if op == "maxloc" then - ret[1] = min(t); - - extern externFunc proc reduce_fn(data, size, ref val, ref idx); - - const basePtr = c_ptrToConst(A); - for (offset,size) in offsetsThatCanFitIn32Bits(A.size) { - var curIdx: int(32); - var curVal: t; - reduce_fn(basePtr+offset, size, curVal, curIdx); - subReduceValIdx(op, offset, ret, (curIdx, curVal)); - if gpuDebugReduce then - writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret); - } - - return ret; + else if isValIdxReduce(op) { + var retTmp: (int, t); + if op == "minloc" then retTmp[1] = max(t); + else if op == "maxloc" then retTmp[1] = min(t); + ret = retTmp; } else { compilerError("Unknown reduction operation: ", op); + ret = 0; } + + // perform the reduction + const basePtr = c_ptrToConst(A); + for (offset,size) in offsetsThatCanFitIn32Bits(A.size) { + var curIdx: int(32) = -1; // should remain -1 for sum, min, max + var curVal: t; + reduce_fn(basePtr+offset, size, curVal, curIdx); + subReduceValIdx(op, offset, ret, (curIdx, curVal)); + if gpuDebugReduce then + writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret); + } + + return ret; } /* diff --git a/runtime/include/chpl-gpu-impl.h b/runtime/include/chpl-gpu-impl.h index 6c01a07aea1f..972c57a91e9c 100644 --- a/runtime/include/chpl-gpu-impl.h +++ b/runtime/include/chpl-gpu-impl.h @@ -76,26 +76,17 @@ void chpl_gpu_impl_stream_destroy(void* stream); bool chpl_gpu_impl_stream_ready(void* stream); void chpl_gpu_impl_stream_synchronize(void* stream); -#define DECL_ONE_REDUCE_IMPL_RET_VAL(chpl_kind, data_type) \ -void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val,\ - void* stream); - -GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, sum) -GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, min) -GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL, max) - -#undef DECL_ONE_REDUCE_RET_VAL - -#define DECL_ONE_REDUCE_IMPL_RET_VAL_IDX(chpl_kind, data_type) \ +#define DECL_ONE_REDUCE_IMPL(chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx,\ void* stream); +GPU_REDUCE(DECL_ONE_REDUCE_IMPL, sum) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL, min) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL, max) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL, minloc) +GPU_REDUCE(DECL_ONE_REDUCE_IMPL, maxloc) -GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, minloc) -GPU_REDUCE(DECL_ONE_REDUCE_IMPL_RET_VAL_IDX, maxloc) - -#undef DECL_ONE_REDUCE_RET_VAL_IDX +#undef DECL_ONE_REDUCE_IMPL #ifdef __cplusplus } diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h index f3b5ae31dc8b..189f77583246 100644 --- a/runtime/include/chpl-gpu.h +++ b/runtime/include/chpl-gpu.h @@ -150,24 +150,18 @@ size_t chpl_gpu_get_alloc_size(void* ptr); bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); -#define DECL_ONE_REDUCE_RET_VAL(chpl_kind, data_type) \ -void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val); - -GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, sum); -GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, min); -GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL, max); - -#undef 
DECL_ONE_REDUCE_RET_VAL - -#define DECL_ONE_REDUCE_RET_VAL_IDX(chpl_kind, data_type) \ +#define DECL_ONE_REDUCE(chpl_kind, data_type) \ void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ data_type* val, int* idx); -GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, minloc); -GPU_REDUCE(DECL_ONE_REDUCE_RET_VAL_IDX, maxloc); +GPU_REDUCE(DECL_ONE_REDUCE, sum); +GPU_REDUCE(DECL_ONE_REDUCE, min); +GPU_REDUCE(DECL_ONE_REDUCE, max); +GPU_REDUCE(DECL_ONE_REDUCE, minloc); +GPU_REDUCE(DECL_ONE_REDUCE, maxloc); + +#undef DECL_ONE_REDUCE -#undef DECL_ONE_REDUCE_RET_VAL_IDX #endif // HAS_GPU_LOCALE diff --git a/runtime/src/chpl-gpu.c b/runtime/src/chpl-gpu.c index c0182682a33b..cdd91a599e3a 100644 --- a/runtime/src/chpl-gpu.c +++ b/runtime/src/chpl-gpu.c @@ -699,32 +699,7 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) { chpl_gpu_impl_set_peer_access(dev1, dev2, enable); } -#define DEF_ONE_REDUCE_RET_VAL(kind, data_type)\ -void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ - data_type* val) { \ - CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \ - \ - int dev = chpl_task_getRequestedSubloc(); \ - chpl_gpu_impl_use_device(dev); \ - void* stream = get_stream(dev); \ - \ - chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, stream); \ - \ - if (chpl_gpu_sync_with_host) { \ - CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \ - wait_stream(stream); \ - } \ - \ - CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \ -} - -GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, sum) -GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, min) -GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL, max) - -#undef DEF_ONE_REDUCE_RET_VAL - -#define DEF_ONE_REDUCE_RET_VAL_IDX(kind, data_type)\ +#define DEF_ONE_REDUCE(kind, data_type)\ void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ data_type* val, int* idx) { \ CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \ @@ -743,10 +718,12 @@ void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \ } -GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, minloc); -GPU_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, maxloc); - -#undef DEF_ONE_REDUCE_RET_VAL_IDX +GPU_REDUCE(DEF_ONE_REDUCE, sum) +GPU_REDUCE(DEF_ONE_REDUCE, min) +GPU_REDUCE(DEF_ONE_REDUCE, max) +GPU_REDUCE(DEF_ONE_REDUCE, minloc) +GPU_REDUCE(DEF_ONE_REDUCE, maxloc) +#undef DEF_ONE_REDUCE #endif diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc index e9a39dfbc25d..d8a3f01cbaac 100644 --- a/runtime/src/gpu/amd/gpu-amd-reduce.cc +++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc @@ -37,14 +37,14 @@ #if 1 #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val,\ + data_type* val, int* idx\ void* stream) {\ chpl_internal_error("This function shouldn't have been called. 
Reduction is not supported with AMD GPUs\n");\ } #elif ROCM_VERSION_MAJOR >= 5 #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val,\ + data_type* val, int* idx\ void* stream) {\ data_type* result; \ ROCM_CALL(hipMalloc(&result, sizeof(data_type)));\ diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc index 6e8828cae16d..daae4dce16b6 100644 --- a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc +++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc @@ -27,9 +27,12 @@ #include "../common/cuda-utils.h" #include "gpu/chpl-gpu-reduce-util.h" +// this version doesn't do anything with `idx`. Having a unified interface makes +// the implementation in the rest of the runtime and the modules more +// straightforward. #define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val,\ + data_type* val, int* idx,\ void* stream) {\ CUdeviceptr result; \ CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ From dd10162974e3039d29d19cac47f32e6ea1e8cef9 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 11:31:55 -0700 Subject: [PATCH 37/42] Fix a bug for non-zero-based arrays, add test Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 3 +++ .../native/noAmd/reduction/nonZeroBased.chpl | 21 +++++++++++++++++++ .../native/noAmd/reduction/nonZeroBased.good | 10 +++++++++ 3 files changed, 34 insertions(+) create mode 100644 test/gpu/native/noAmd/reduction/nonZeroBased.chpl create mode 100644 test/gpu/native/noAmd/reduction/nonZeroBased.good diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 63452e5b95e4..fa91ea3ea36f 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -541,6 +541,9 @@ module GPU writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret); } + if isValIdxReduce(op) then + ret[0] += A.domain.first; + return ret; } diff --git a/test/gpu/native/noAmd/reduction/nonZeroBased.chpl b/test/gpu/native/noAmd/reduction/nonZeroBased.chpl new file mode 100644 index 000000000000..ae337b07f240 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/nonZeroBased.chpl @@ -0,0 +1,21 @@ +use GPU; +use Random; + +proc test(ref Arr) { + fillRandom(Arr, min=0, max=100, seed=17); + + writeln(Arr); + writeln(gpuMinLocReduce(Arr)); + writeln(gpuMaxLocReduce(Arr)); +} + +on here.gpus[0] { + var A: [-20..#10] int; + test(A); + + var B: [0..#10] int; + test(B); + + var C: [10..#10] int; + test(C); +} diff --git a/test/gpu/native/noAmd/reduction/nonZeroBased.good b/test/gpu/native/noAmd/reduction/nonZeroBased.good new file mode 100644 index 000000000000..7f00b4427b24 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/nonZeroBased.good @@ -0,0 +1,10 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. 
To suppress this warning, compile with --no-checks explicitly +64 6 5 12 14 91 11 91 11 58 +(-18, 5) +(-15, 91) +64 6 5 12 14 91 11 91 11 58 +(2, 5) +(5, 91) +64 6 5 12 14 91 11 91 11 58 +(12, 5) +(15, 91) From 7f007b79c24a39338105f9c2d52c798e6b91e182 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 11:41:38 -0700 Subject: [PATCH 38/42] Remove trailing whitespaces Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 83 +++++++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 25 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index fa91ea3ea36f..23c2e2436485 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -547,47 +547,80 @@ module GPU return ret; } - /* - Perform sum-reduction on an array. The array must be allocated on - GPU-accessible memory. The function should not be called from inside a - GPU-eligible loop. Only arrays with ``int``, ``uint`` and ``real`` types - are supported. A simple example is as follows. - + /* + Add all elements of an array together on the GPU (that is, perform a + sum-reduction). The array must be in GPU-accessible memory and the function + must be called from outside a GPU-eligible loop. Only arrays with int, uint, + and real types are supported. A simple example is the following: + .. code-block:: chapel on here.gpus[0] { - var Arr: [1..n] int; // will be allocated in GPU-accesible memory - ... - var sum = gpuSumReduce(Arr); + var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible + writeln(gpuSumReduce(Arr)); // 15 } */ inline proc gpuSumReduce(const ref A: [] ?t) do return doGpuReduce("sum", A); - /* - Perform min-reduction on an array. The array must be allocated on - GPU-accessible memory. The function must be called from the host. Only - arrays with ``int``, ``uint`` and ``real`` types are supported. + /* + Return the minimum element of an array on the GPU (that is, perform a + min-reduction). The array must be in GPU-accessible memory and the function + must be called from outside a GPU-eligible loop. Only arrays with int, uint, + and real types are supported. A simple example is the following: + + .. code-block:: chapel + + on here.gpus[0] { + var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible + writeln(gpuMinReduce(Arr)); // 1 + } */ inline proc gpuMinReduce(const ref A: [] ?t) do return doGpuReduce("min", A); - /* - Perform max-reduction on an array. The array must be allocated on - GPU-accessible memory. The function must be called from the host. Only - arrays with ``int``, ``uint`` and ``real`` types are supported. + /* + Return the maximum element of an array on the GPU (that is, perform a + max-reduction). The array must be in GPU-accessible memory and the function + must be called from outside a GPU-eligible loop. Only arrays with int, uint, + and real types are supported. A simple example is the following: + + .. code-block:: chapel + + on here.gpus[0] { + var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible + writeln(gpuMaxReduce(Arr)); // 5 + } */ inline proc gpuMaxReduce(const ref A: [] ?t) do return doGpuReduce("max", A); - /* - Perform minloc-reduction on an array. The array must be allocated on - GPU-accessible memory. The function must be called from the host. Only - arrays with ``int``, ``uint`` and ``real`` types are supported. + /* + For an array on the GPU, return a tuple with the index and the value of the + minimum element (that is, perform a minloc-reduction). 
The array must be in + GPU-accessible memory and the function must be called from outside a + GPU-eligible loop. Only arrays with int, uint, and real types are + supported. A simple example is the following: + + .. code-block:: chapel + + on here.gpus[0] { + var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible + writeln(gpuMinLocReduce(Arr)); // (2, 1) + } */ inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A); - /* - Perform maxloc-reduction on an array. The array must be allocated on - GPU-accessible memory. The function must be called from the host. Only - arrays with ``int``, ``uint`` and ``real`` types are supported. + /* + For an array on the GPU, return a tuple with the index and the value of the + maximum element (that is, perform a maxloc-reduction). The array must be in + GPU-accessible memory and the function must be called from outside a + GPU-eligible loop. Only arrays with int, uint, and real types are + supported. A simple example is the following: + + .. code-block:: chapel + + on here.gpus[0] { + var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible + writeln(gpuMaxLocReduce(Arr)); // (3, 5) + } */ inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A); From 727f44311a6393c4725ae8a7f828e2f2b504752d Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 11:47:19 -0700 Subject: [PATCH 39/42] More clarifications in doc Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 23c2e2436485..36612bf99d58 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -594,32 +594,34 @@ module GPU /* For an array on the GPU, return a tuple with the index and the value of the - minimum element (that is, perform a minloc-reduction). The array must be in - GPU-accessible memory and the function must be called from outside a - GPU-eligible loop. Only arrays with int, uint, and real types are - supported. A simple example is the following: + minimum element (that is, perform a minloc-reduction).If there are multiple + elements with the same minimum value, the index of the first one is + returned. The array must be in GPU-accessible memory and the function must + be called from outside a GPU-eligible loop. Only arrays with int, uint, and + real types are supported. A simple example is the following: .. code-block:: chapel on here.gpus[0] { var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible - writeln(gpuMinLocReduce(Arr)); // (2, 1) + writeln(gpuMinLocReduce(Arr)); // (2, 1). Note that Arr[2]==1. } */ inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A); /* For an array on the GPU, return a tuple with the index and the value of the - maximum element (that is, perform a maxloc-reduction). The array must be in - GPU-accessible memory and the function must be called from outside a - GPU-eligible loop. Only arrays with int, uint, and real types are - supported. A simple example is the following: + maximum element (that is, perform a maxloc-reduction). If there are multiple + elements with the same maximum value, the index of the first one is + returned. The array must be in GPU-accessible memory and the function must + be called from outside a GPU-eligible loop. Only arrays with int, uint, and + real types are supported. A simple example is the following: .. 
code-block:: chapel on here.gpus[0] { var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible - writeln(gpuMaxLocReduce(Arr)); // (3, 5) + writeln(gpuMaxLocReduce(Arr)); // (3, 5). Note that Arr[3]==5. } */ inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A); From 79ea359dcd3b1be75742be7574b32dc4bdf79ff4 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Fri, 3 Nov 2023 11:48:37 -0700 Subject: [PATCH 40/42] Add a missing space Signed-off-by: Engin Kayraklioglu --- modules/standard/GPU.chpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 36612bf99d58..a45d7540ad09 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -594,7 +594,7 @@ module GPU /* For an array on the GPU, return a tuple with the index and the value of the - minimum element (that is, perform a minloc-reduction).If there are multiple + minimum element (that is, perform a minloc-reduction). If there are multiple elements with the same minimum value, the index of the first one is returned. The array must be in GPU-accessible memory and the function must be called from outside a GPU-eligible loop. Only arrays with int, uint, and From cc39ef743307e46fa1c9d39eb0690c24d4ca6637 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 6 Nov 2023 13:54:32 -0800 Subject: [PATCH 41/42] Add missing commas in AMD runtime Signed-off-by: Engin Kayraklioglu --- runtime/src/gpu/amd/gpu-amd-reduce.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc index d8a3f01cbaac..638a15451427 100644 --- a/runtime/src/gpu/amd/gpu-amd-reduce.cc +++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc @@ -37,14 +37,14 @@ #if 1 #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val, int* idx\ + data_type* val, int* idx,\ void* stream) {\ chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\ } #elif ROCM_VERSION_MAJOR >= 5 #define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ - data_type* val, int* idx\ + data_type* val, int* idx,\ void* stream) {\ data_type* result; \ ROCM_CALL(hipMalloc(&result, sizeof(data_type)));\ From 00ee96799c71392b8f67ecfb0dd3f593a6668d81 Mon Sep 17 00:00:00 2001 From: Engin Kayraklioglu Date: Mon, 6 Nov 2023 14:11:50 -0800 Subject: [PATCH 42/42] Move skipif to the parent directory Signed-off-by: Engin Kayraklioglu --- test/gpu/native/{noAmd/SKIPIF => noAmd.skipif} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/gpu/native/{noAmd/SKIPIF => noAmd.skipif} (100%) diff --git a/test/gpu/native/noAmd/SKIPIF b/test/gpu/native/noAmd.skipif similarity index 100% rename from test/gpu/native/noAmd/SKIPIF rename to test/gpu/native/noAmd.skipif
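
For reference, a minimal end-to-end sketch combining the reduction routines introduced by this series. It restates the documented examples and the non-zero-based indexing behavior exercised by nonZeroBased.chpl; the array `B` and the expected values in the comments are illustrative assumptions derived from those documented semantics, not captured program output.

.. code-block:: chapel

   use GPU;

   on here.gpus[0] {
     var Arr = [3, 2, 1, 5, 4];      // allocated in GPU-accessible memory
     writeln(gpuSumReduce(Arr));     // 15
     writeln(gpuMinReduce(Arr));     // 1
     writeln(gpuMaxReduce(Arr));     // 5
     writeln(gpuMinLocReduce(Arr));  // (2, 1): index of the first minimum, then its value
     writeln(gpuMaxLocReduce(Arr));  // (3, 5)

     // minloc/maxloc indices follow the array's own domain, matching the
     // CPU fallback's `minloc reduce zip(A.domain, A)`.
     var B: [10..#5] int = [3, 2, 1, 5, 4];
     writeln(gpuMinLocReduce(B));    // expected (12, 1) given the domain offsetting above
   }

As in the documented examples, the calls are made from host code outside any GPU-eligible loop, and only int, uint, and real element types are supported.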