diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl index 738725b29aa4..d5930427c665 100644 --- a/modules/standard/GPU.chpl +++ b/modules/standard/GPU.chpl @@ -389,4 +389,241 @@ module GPU /* When run on a GPU, atomically compare the value in 'x' and 'cmp', if they are equal store 'val' in 'x'. The operation returns the old value of x. */ inline proc gpuAtomicCAS( ref x : ?T, cmp : T, val : T) : T { return gpuAtomicTernOp("CAS", x, cmp, val); } + + // ============================ + // Reductions + // ============================ + + @chpldoc.nodoc + config param gpuDebugReduce = false; + + private inline proc doGpuReduce(param op: string, const ref A: [] ?t) { + if op != "sum" && op != "min" && op != "max" && + op != "minloc" && op != "maxloc" { + + compilerError("Unexpected reduction kind in doGpuReduce: ", op); + } + + + if CHPL_GPU == "amd" { + compilerError("gpu*Reduce functions are not supported on AMD GPUs"); + } + else if CHPL_GPU == "cpu" { + select op { + when "sum" do return + reduce A; + when "min" do return min reduce A; + when "max" do return max reduce A; + when "minloc" do return minloc reduce zip (A.domain, A); + when "maxloc" do return maxloc reduce zip (A.domain, A); + otherwise do compilerError("Unknown reduction operation: ", op); + } + } + else { + compilerAssert(CHPL_GPU=="nvidia"); + } + + + proc chplTypeToCTypeName(type t) param { + select t { + when int(8) do return "int8_t"; + when int(16) do return "int16_t"; + when int(32) do return "int32_t"; + when int(64) do return "int64_t"; + when uint(8) do return "uint8_t"; + when uint(16) do return "uint16_t"; + when uint(32) do return "uint32_t"; + when uint(64) do return "uint64_t"; + when real(32) do return "float"; + when real(64) do return "double"; + otherwise do + compilerError("Arrays with ", t:string, " elements cannot be reduced"); + } + return "unknown"; + } + + proc getExternFuncName(param op: string, type t) param: string { + return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t); + } + + proc isValReduce(param op) param { + return op=="sum" || op=="min" || op=="max"; + } + + proc isValIdxReduce(param op) param { + return op=="minloc" || op=="maxloc"; + } + + inline proc subReduceValIdx(param op, const baseOffset, ref accum, val) { + // do some type checking to be safe + compilerAssert(isTupleValue(val)); + if isTupleValue(accum) { + compilerAssert(isValIdxReduce(op)); + compilerAssert(val[1].type == accum[1].type); + + } + else { + compilerAssert(isValReduce(op)); + compilerAssert(val[1].type == accum.type); + } + + select op { + when "sum" do accum += val[1]; + when "min" do accum = min(accum, val[1]); + when "max" do accum = max(accum, val[1]); + when "minloc" do + if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]); + when "maxloc" do + if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]); + otherwise do compilerError("Unknown reduction operation: ", op); + } + } + + iter offsetsThatCanFitIn32Bits(size: int) { + // Engin: I've tried to get max(int(32)) to work as this bug is about CUB + // using `int` as the size in the interface. However, getting close to + // max(int(32)) also triggers the bug. So, I am choosing this as a + // round/safe value for the time being. + param chunkSize = 2_000_000_000; + + use Math only divCeil; + const numChunks = divCeil(size, chunkSize); + const standardChunkSize = divCeil(size, numChunks); + + if gpuDebugReduce then + writeln("Will use ", numChunks, " chunks of size ", standardChunkSize); + + foreach chunk in 0.. #include "chpl-tasks.h" #include "chpl-mem-desc.h" +#include "gpu/chpl-gpu-reduce-util.h" #ifdef __cplusplus extern "C" { @@ -149,6 +150,19 @@ size_t chpl_gpu_get_alloc_size(void* ptr); bool chpl_gpu_can_access_peer(int dev1, int dev2); void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable); +#define DECL_ONE_REDUCE(chpl_kind, data_type) \ +void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx); + +GPU_REDUCE(DECL_ONE_REDUCE, sum); +GPU_REDUCE(DECL_ONE_REDUCE, min); +GPU_REDUCE(DECL_ONE_REDUCE, max); +GPU_REDUCE(DECL_ONE_REDUCE, minloc); +GPU_REDUCE(DECL_ONE_REDUCE, maxloc); + +#undef DECL_ONE_REDUCE + + #endif // HAS_GPU_LOCALE #ifdef __cplusplus diff --git a/runtime/include/gpu/chpl-gpu-reduce-util.h b/runtime/include/gpu/chpl-gpu-reduce-util.h new file mode 100644 index 000000000000..6076fb0b98a5 --- /dev/null +++ b/runtime/include/gpu/chpl-gpu-reduce-util.h @@ -0,0 +1,47 @@ +/* + * Copyright 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright 2004-2019 Cray Inc. + * Other additional copyright holders may be indicated within. * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAS_GPU_LOCALE + +#define GPU_IMPL_REDUCE(MACRO, impl_kind, chpl_kind) \ + MACRO(impl_kind, chpl_kind, int8_t) \ + MACRO(impl_kind, chpl_kind, int16_t) \ + MACRO(impl_kind, chpl_kind, int32_t) \ + MACRO(impl_kind, chpl_kind, int64_t) \ + MACRO(impl_kind, chpl_kind, uint8_t) \ + MACRO(impl_kind, chpl_kind, uint16_t) \ + MACRO(impl_kind, chpl_kind, uint32_t) \ + MACRO(impl_kind, chpl_kind, uint64_t) \ + MACRO(impl_kind, chpl_kind, float) \ + MACRO(impl_kind, chpl_kind, double); + +#define GPU_REDUCE(MACRO, chpl_kind) \ + MACRO(chpl_kind, int8_t) \ + MACRO(chpl_kind, int16_t) \ + MACRO(chpl_kind, int32_t) \ + MACRO(chpl_kind, int64_t) \ + MACRO(chpl_kind, uint8_t) \ + MACRO(chpl_kind, uint16_t) \ + MACRO(chpl_kind, uint32_t) \ + MACRO(chpl_kind, uint64_t) \ + MACRO(chpl_kind, float) \ + MACRO(chpl_kind, double); + +#endif // HAS_GPU_LOCALE + diff --git a/runtime/src/chpl-gpu.c b/runtime/src/chpl-gpu.c index 6879eb43cce4..f8f808c07299 100644 --- a/runtime/src/chpl-gpu.c +++ b/runtime/src/chpl-gpu.c @@ -43,6 +43,8 @@ bool chpl_gpu_use_stream_per_task = true; #include "chpl-env.h" #include "chpl-comm-compiler-macros.h" +#include "gpu/chpl-gpu-reduce-util.h" + void chpl_gpu_init(void) { chpl_gpu_impl_init(&chpl_gpu_num_devices); @@ -700,4 +702,31 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) { chpl_gpu_impl_set_peer_access(dev1, dev2, enable); } +#define DEF_ONE_REDUCE(kind, data_type)\ +void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \ + data_type* val, int* idx) { \ + CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \ + \ + int dev = chpl_task_getRequestedSubloc(); \ + chpl_gpu_impl_use_device(dev); \ + void* stream = get_stream(dev); \ + \ + chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, idx, stream); \ + \ + if (chpl_gpu_sync_with_host) { \ + CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \ + wait_stream(stream); \ + } \ + \ + CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \ +} + +GPU_REDUCE(DEF_ONE_REDUCE, sum) +GPU_REDUCE(DEF_ONE_REDUCE, min) +GPU_REDUCE(DEF_ONE_REDUCE, max) +GPU_REDUCE(DEF_ONE_REDUCE, minloc) +GPU_REDUCE(DEF_ONE_REDUCE, maxloc) + +#undef DEF_ONE_REDUCE + #endif diff --git a/runtime/src/gpu/amd/Makefile.include b/runtime/src/gpu/amd/Makefile.include index 7ba37d695b24..7bce5c886202 100644 --- a/runtime/src/gpu/amd/Makefile.include +++ b/runtime/src/gpu/amd/Makefile.include @@ -21,5 +21,6 @@ GPU_SUBDIR = src/gpu/amd GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR) ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c +ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share diff --git a/runtime/src/gpu/amd/Makefile.share b/runtime/src/gpu/amd/Makefile.share index e8ad87a160ab..410b4f724e82 100644 --- a/runtime/src/gpu/amd/Makefile.share +++ b/runtime/src/gpu/amd/Makefile.share @@ -15,8 +15,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -SRCS = gpu-amd.c +GPU_SRCS = gpu-amd-reduce.cc gpu-amd.c -GPU_SRCS = gpu-amd.c +SRCS = $(GPU_SRCS) -GPU_OBJS = $(GPU_SRCS:%.c=$(GPU_OBJDIR)/%.o) +GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) + +RUNTIME_CXXFLAGS += -x hip + +$(RUNTIME_OBJ_DIR)/gpu-amd-reduce.o: gpu-amd-reduce.cc \ + $(RUNTIME_OBJ_DIR_STAMP) + $(CXX) -c -std=c++14 $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc new file mode 100644 index 000000000000..638a15451427 --- /dev/null +++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc @@ -0,0 +1,103 @@ +/* + * Copyright 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright 2004-2019 Cray Inc. + * Other additional copyright holders may be indicated within. * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAS_GPU_LOCALE + +/* TODO uncomment these when the implementations are in +#include +#include +#include +*/ + +#include "chpl-gpu.h" +#include "chpl-gpu-impl.h" +#include "gpu/chpl-gpu-reduce-util.h" + +// Engin: I can't get neither hipCUB nor rocprim to work. (hipCUB is a light +// wrapper around rocprim anyways). I filed +// https://github.com/ROCmSoftwarePlatform/hipCUB/issues/304, but I don't know +// if/when I'll hear back something. For now, I am merging the code that's +// supposed to work but doesn't instead of removing them from my branch. +#if 1 +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\ +} +#elif ROCM_VERSION_MAJOR >= 5 +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + data_type* result; \ + ROCM_CALL(hipMalloc(&result, sizeof(data_type)));\ + void* temp = NULL; \ + size_t temp_bytes = 0; \ + ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\ + 0, true));\ + ROCM_CALL(hipMalloc(((hipDeviceptr_t*)&temp), temp_bytes)); \ + ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\ + 0, true));\ + ROCM_CALL(hipMemcpyDtoHAsync(val, result, sizeof(data_type),\ + (hipStream_t)stream)); \ +} +#else +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ + chpl_internal_error("Reduction is not supported with AMD GPUs using ROCm version <5\n");\ +} +#endif // 1 + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) + +#undef DEF_ONE_REDUCE_RET_VAL + +#if 1 +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\ +} +#else +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + // TODO I don't know any other specific issues with these versions. Should be + // able to whip up the implementation quickly once we figure out what's going + // wrong here. + chpl_internal_error("Unimplemented"); +} +#endif // 1 + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) + +#undef DEF_ONE_REDUCE_RET_VAL_IDX + +#undef DEF_REDUCE + +#endif // HAS_GPU_LOCALE + diff --git a/runtime/src/gpu/cpu/gpu-cpu.c b/runtime/src/gpu/cpu/gpu-cpu.c index e76f0dcb359d..a3ae1754e57c 100644 --- a/runtime/src/gpu/cpu/gpu-cpu.c +++ b/runtime/src/gpu/cpu/gpu-cpu.c @@ -157,4 +157,33 @@ bool chpl_gpu_impl_stream_ready(void* stream) { void chpl_gpu_impl_stream_synchronize(void* stream) { } +#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val,\ + void* stream) {\ + chpl_internal_error("This function shouldn't have been called. "\ + "cpu-as-device mode should handle reductions in "\ + "the module code\n");\ +} + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) + +#undef DEF_ONE_REDUCE_RET_VAL + +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + chpl_internal_error("This function shouldn't have been called. "\ + "cpu-as-device mode should handle reductions in "\ + "the module code\n");\ +} + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) + +#undef DEF_ONE_REDUCE_RET_VAL_IDX + #endif // HAS_GPU_LOCALE diff --git a/runtime/src/gpu/nvidia/Makefile.include b/runtime/src/gpu/nvidia/Makefile.include index cb4da19051a8..9e0f8711b311 100644 --- a/runtime/src/gpu/nvidia/Makefile.include +++ b/runtime/src/gpu/nvidia/Makefile.include @@ -21,5 +21,7 @@ GPU_SUBDIR = src/gpu/nvidia GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR) ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c +ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc + include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share diff --git a/runtime/src/gpu/nvidia/Makefile.share b/runtime/src/gpu/nvidia/Makefile.share index 36d2debdee0d..a0c109e22f0d 100644 --- a/runtime/src/gpu/nvidia/Makefile.share +++ b/runtime/src/gpu/nvidia/Makefile.share @@ -15,8 +15,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -SRCS = gpu-nvidia.c +GPU_SRCS = gpu-nvidia.c gpu-nvidia-reduce.cc -GPU_SRCS = gpu-nvidia.c +SRCS = $(GPU_SRCS) -GPU_OBJS = $(GPU_SRCS:%.c=$(GPU_OBJDIR)/%.o) +GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS)))) + +RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version + +$(RUNTIME_OBJ_DIR)/gpu-nvidia-reduce.o: gpu-nvidia-reduce.cc \ + $(RUNTIME_OBJ_DIR_STAMP) + $(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $< diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc new file mode 100644 index 000000000000..daae4dce16b6 --- /dev/null +++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc @@ -0,0 +1,87 @@ +/* + * Copyright 2020-2023 Hewlett Packard Enterprise Development LP + * Copyright 2004-2019 Cray Inc. + * Other additional copyright holders may be indicated within. * + * The entirety of this work is licensed under the Apache License, + * Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. + * + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef HAS_GPU_LOCALE + +#include +#include + +#include "chpl-gpu.h" +#include "chpl-gpu-impl.h" +#include "../common/cuda-utils.h" +#include "gpu/chpl-gpu-reduce-util.h" + +// this version doesn't do anything with `idx`. Having a unified interface makes +// the implementation in the rest of the runtime and the modules more +// straightforward. +#define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + CUdeviceptr result; \ + CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \ + void* temp = NULL; \ + size_t temp_bytes = 0; \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ + CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\ + (CUstream)stream); \ + CUDA_CALL(cuMemcpyDtoHAsync(val, result, sizeof(data_type),\ + (CUstream)stream)); \ + CUDA_CALL(cuMemFree(result)); \ +} + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max) + +#undef DEF_ONE_REDUCE_RET_VAL + +#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \ +void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\ + data_type* val, int* idx,\ + void* stream) {\ + using kvp = cub::KeyValuePair; \ + CUdeviceptr result; \ + CUDA_CALL(cuMemAlloc(&result, sizeof(kvp))); \ + void* temp = NULL; \ + size_t temp_bytes = 0; \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ + (CUstream)stream);\ + CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \ + cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\ + (CUstream)stream);\ + kvp result_host; \ + CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp),\ + (CUstream)stream)); \ + *val = result_host.value; \ + *idx = result_host.key; \ + CUDA_CALL(cuMemFree(result)); \ +} + +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc) +GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc) + +#undef DEF_ONE_REDUCE_RET_VAL_IDX + +#undef DEF_REDUCE + +#endif // HAS_GPU_LOCALE + diff --git a/test/gpu/native/noAmd.skipif b/test/gpu/native/noAmd.skipif new file mode 100644 index 000000000000..75268eb04f42 --- /dev/null +++ b/test/gpu/native/noAmd.skipif @@ -0,0 +1,2 @@ +# Reductions are not supported with AMD gpus yet. +CHPL_GPU==amd diff --git a/test/gpu/native/noAmd/reduction/basic.chpl b/test/gpu/native/noAmd/reduction/basic.chpl new file mode 100644 index 000000000000..67a2d4d3251d --- /dev/null +++ b/test/gpu/native/noAmd/reduction/basic.chpl @@ -0,0 +1,44 @@ +use GPU; +use ChplConfig; + +config const n = 100; + +proc testType(type t) { + proc test(param op: string, type t) { + on here.gpus[0] { + var Arr: [0..#n] t; + + foreach i in Arr.domain do Arr[i] = i:t; + + var res; + select op { + when "sum" do res=gpuSumReduce(Arr); + when "min" do res=gpuMinReduce(Arr); + when "max" do res=gpuMaxReduce(Arr); + when "minloc" do res=gpuMinLocReduce(Arr); + when "maxloc" do res=gpuMaxLocReduce(Arr); + } + + writeln(op, ": ", res); + } + } + + writeln("Testing type ", t:string); + test("sum", t); + test("min", t); + test("max", t); + test("minloc", t); + test("maxloc", t); + writeln(); +} + +testType(int(8)); +testType(int(16)); +testType(int(32)); +testType(int(64)); +testType(uint(8)); +testType(uint(16)); +testType(uint(32)); +testType(uint(64)); +testType(real(32)); +testType(real(64)); diff --git a/test/gpu/native/noAmd/reduction/basic.good b/test/gpu/native/noAmd/reduction/basic.good new file mode 100644 index 000000000000..4f9386297b60 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/basic.good @@ -0,0 +1,71 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly +Testing type int(8) +sum: 86 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type int(16) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type int(32) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type int(64) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(8) +sum: 86 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(16) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(32) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type uint(64) +sum: 4950 +min: 0 +max: 99 +minloc: (0, 0) +maxloc: (99, 99) + +Testing type real(32) +sum: 4950.0 +min: 0.0 +max: 99.0 +minloc: (0, 0.0) +maxloc: (99, 99.0) + +Testing type real(64) +sum: 4950.0 +min: 0.0 +max: 99.0 +minloc: (0, 0.0) +maxloc: (99, 99.0) + diff --git a/test/gpu/native/noAmd/reduction/largeArrays.chpl b/test/gpu/native/noAmd/reduction/largeArrays.chpl new file mode 100644 index 000000000000..5a7924d2961f --- /dev/null +++ b/test/gpu/native/noAmd/reduction/largeArrays.chpl @@ -0,0 +1,19 @@ +use GPU; + +config const printResult = false; +config const n = 100; + +var result: uint(8); +on here.gpus[0] { + var Arr: [0..#n] uint(8) = 1; + + result = gpuSumReduce(Arr); +} + +if printResult then writeln("Result: ", result); + +// it is all 1's initially. It'll certainly overflow and the remainder will be +// the expected return value +const expected = n%(max(uint(8))+1); +if result != expected then + writef("Invalid result. Expected %u, actual %u\n", expected, result); diff --git a/test/gpu/native/noAmd/reduction/largeArrays.execopts b/test/gpu/native/noAmd/reduction/largeArrays.execopts new file mode 100644 index 000000000000..0af28add7fca --- /dev/null +++ b/test/gpu/native/noAmd/reduction/largeArrays.execopts @@ -0,0 +1,6 @@ +--n=2_000_000_000 +--n=2_000_000_001 +--n=2_147_483_647 +--n=4_294_967_293 +--n=4_294_967_294 +--n=4_294_967_295 diff --git a/test/gpu/native/noAmd/reduction/largeArrays.good b/test/gpu/native/noAmd/reduction/largeArrays.good new file mode 100644 index 000000000000..abf41f07fd48 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/largeArrays.good @@ -0,0 +1 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly diff --git a/test/gpu/native/noAmd/reduction/largeArrays.skipif b/test/gpu/native/noAmd/reduction/largeArrays.skipif new file mode 100644 index 000000000000..dc81184aa7f6 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/largeArrays.skipif @@ -0,0 +1,4 @@ +# this test is to exercise a code path that will never be run with +# cpu-as-device. Moreover, the large size makes this mode take too long to +# finish. So, I am skipping this test. +CHPL_GPU==cpu diff --git a/test/gpu/native/noAmd/reduction/largeArraysMinMax.chpl b/test/gpu/native/noAmd/reduction/largeArraysMinMax.chpl new file mode 100644 index 000000000000..dc660bc1d0f7 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/largeArraysMinMax.chpl @@ -0,0 +1,36 @@ +use GPU; + +config param withLoc = false; + +config const kind = "min"; +const isMin = kind=="min"; +assert(isMin || kind=="max"); + +config const n = 2*max(int(32)); +config const setIdx = n-1; +assert(n>setIdx); + +config const printResult = false; + +const expectedVal = if isMin then 7:uint(8) else 13:uint(8); +const expected = if withLoc then (setIdx, expectedVal) else expectedVal; + +inline proc doReduce(Arr) { + if withLoc then + return if isMin then gpuMinLocReduce(Arr) else gpuMaxLocReduce(Arr); + else + return if isMin then gpuMinReduce(Arr) else gpuMaxReduce(Arr); +} + +var result: if withLoc then (int, uint(8)) else uint(8); +on here.gpus[0] { + var Arr: [0..#n] uint(8) = 10; + Arr[setIdx] = expectedVal; + + result = doReduce(Arr); +} + +if printResult then writeln("Result: ", result); + +if result != expected then + writef("Invalid result. Expected %?, actual %?\n", expected, result); diff --git a/test/gpu/native/noAmd/reduction/largeArraysMinMax.compopts b/test/gpu/native/noAmd/reduction/largeArraysMinMax.compopts new file mode 100644 index 000000000000..087ba6a454e9 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/largeArraysMinMax.compopts @@ -0,0 +1,2 @@ +-swithLoc=false +-swithLoc=true diff --git a/test/gpu/native/noAmd/reduction/largeArraysMinMax.execopts b/test/gpu/native/noAmd/reduction/largeArraysMinMax.execopts new file mode 100644 index 000000000000..2742bc714a67 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/largeArraysMinMax.execopts @@ -0,0 +1,7 @@ + #largeArraysMinMax +--setIdx=0 +--setIdx=1_999_999_999 +--setIdx=2_000_000_000 +--setIdx=2_000_000_001 +--setIdx=2_147_483_647 + diff --git a/test/gpu/native/noAmd/reduction/largeArraysMinMax.good b/test/gpu/native/noAmd/reduction/largeArraysMinMax.good new file mode 100644 index 000000000000..abf41f07fd48 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/largeArraysMinMax.good @@ -0,0 +1 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly diff --git a/test/gpu/native/noAmd/reduction/largeArraysMinMax.skipif b/test/gpu/native/noAmd/reduction/largeArraysMinMax.skipif new file mode 100644 index 000000000000..dc81184aa7f6 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/largeArraysMinMax.skipif @@ -0,0 +1,4 @@ +# this test is to exercise a code path that will never be run with +# cpu-as-device. Moreover, the large size makes this mode take too long to +# finish. So, I am skipping this test. +CHPL_GPU==cpu diff --git a/test/gpu/native/noAmd/reduction/nonZeroBased.chpl b/test/gpu/native/noAmd/reduction/nonZeroBased.chpl new file mode 100644 index 000000000000..ae337b07f240 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/nonZeroBased.chpl @@ -0,0 +1,21 @@ +use GPU; +use Random; + +proc test(ref Arr) { + fillRandom(Arr, min=0, max=100, seed=17); + + writeln(Arr); + writeln(gpuMinLocReduce(Arr)); + writeln(gpuMaxLocReduce(Arr)); +} + +on here.gpus[0] { + var A: [-20..#10] int; + test(A); + + var B: [0..#10] int; + test(B); + + var C: [10..#10] int; + test(C); +} diff --git a/test/gpu/native/noAmd/reduction/nonZeroBased.good b/test/gpu/native/noAmd/reduction/nonZeroBased.good new file mode 100644 index 000000000000..7f00b4427b24 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/nonZeroBased.good @@ -0,0 +1,10 @@ +warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly +64 6 5 12 14 91 11 91 11 58 +(-18, 5) +(-15, 91) +64 6 5 12 14 91 11 91 11 58 +(2, 5) +(5, 91) +64 6 5 12 14 91 11 91 11 58 +(12, 5) +(15, 91) diff --git a/test/gpu/native/noAmd/reduction/reduceThroughput.chpl b/test/gpu/native/noAmd/reduction/reduceThroughput.chpl new file mode 100644 index 000000000000..2fe0a196dd65 --- /dev/null +++ b/test/gpu/native/noAmd/reduction/reduceThroughput.chpl @@ -0,0 +1,39 @@ +use GPU; +use Time; +use ChplConfig; + +config type dataType = int; +config param useGpu = CHPL_LOCALE_MODEL == "gpu"; + +config const n = 100; +config const reportPerf = true; + +var t: stopwatch; + +inline proc doSumReduce(const ref Arr) { + if useGpu then + return gpuSumReduce(Arr); + else { + return + reduce Arr; + } +} + +writeln("Using ", if useGpu then "gpu" else "cpu"); + +on if useGpu then here.gpus[0] else here { + var Arr: [0.. $2.tmp +mv $2.tmp $2 diff --git a/util/chplenv/compile_link_args_utils.py b/util/chplenv/compile_link_args_utils.py index fc9b325da3df..b1b46bd05d26 100644 --- a/util/chplenv/compile_link_args_utils.py +++ b/util/chplenv/compile_link_args_utils.py @@ -69,6 +69,14 @@ def get_runtime_includes_and_defines(): system.append("-isystem" + os.path.join(sdk_path, "hip", "include")) system.append("-isystem" + os.path.join(sdk_path, "hsa", "include")) + + # We need runtime to use the lld that ships with the rocm + # installation so that we can properly link hipCUB. I believe this + # requirement is coming from using "-x hip" with clang + lld_path = os.path.join(sdk_path, "llvm/bin") + system.append("-B " + lld_path) + bundled.append("-B " + lld_path) + if mem == "jemalloc": # set -DCHPL_JEMALLOC_PREFIX=chpl_je_ # this is needed since it affects code inside of headers