diff --git a/modules/standard/GPU.chpl b/modules/standard/GPU.chpl
index 738725b29aa4..d5930427c665 100644
--- a/modules/standard/GPU.chpl
+++ b/modules/standard/GPU.chpl
@@ -389,4 +389,241 @@ module GPU
   /* When run on a GPU, atomically compare the value in 'x' and 'cmp', if they
      are equal store 'val' in 'x'. The operation returns the old value of x. */
   inline proc gpuAtomicCAS(  ref x : ?T, cmp : T, val : T) : T { return gpuAtomicTernOp("CAS", x, cmp, val); }
+
+  // ============================
+  // Reductions
+  // ============================
+
+  @chpldoc.nodoc
+  config param gpuDebugReduce = false;
+
+  private inline proc doGpuReduce(param op: string, const ref A: [] ?t) {
+    if op != "sum" && op != "min" && op != "max" &&
+       op != "minloc" && op != "maxloc" {
+
+      compilerError("Unexpected reduction kind in doGpuReduce: ", op);
+    }
+
+
+    if CHPL_GPU == "amd" {
+      compilerError("gpu*Reduce functions are not supported on AMD GPUs");
+    }
+    else if CHPL_GPU == "cpu" {
+      select op {
+        when "sum" do return + reduce A;
+        when "min" do return min reduce A;
+        when "max" do return max reduce A;
+        when "minloc" do return minloc reduce zip (A.domain, A);
+        when "maxloc" do return maxloc reduce zip (A.domain, A);
+        otherwise do compilerError("Unknown reduction operation: ", op);
+      }
+    }
+    else {
+      compilerAssert(CHPL_GPU=="nvidia");
+    }
+
+
+    proc chplTypeToCTypeName(type t) param {
+      select t {
+        when int(8)   do return "int8_t";
+        when int(16)  do return "int16_t";
+        when int(32)  do return "int32_t";
+        when int(64)  do return "int64_t";
+        when uint(8)  do return "uint8_t";
+        when uint(16) do return "uint16_t";
+        when uint(32) do return "uint32_t";
+        when uint(64) do return "uint64_t";
+        when real(32) do return "float";
+        when real(64) do return "double";
+        otherwise do
+          compilerError("Arrays with ", t:string, " elements cannot be reduced");
+      }
+      return "unknown";
+    }
+
+    proc getExternFuncName(param op: string, type t) param: string {
+      return "chpl_gpu_"+op+"_reduce_"+chplTypeToCTypeName(t);
+    }
+
+    proc isValReduce(param op) param {
+      return op=="sum" || op=="min" || op=="max";
+    }
+
+    proc isValIdxReduce(param op) param {
+      return op=="minloc" || op=="maxloc";
+    }
+
+    inline proc subReduceValIdx(param op, const baseOffset, ref accum, val) {
+      // do some type checking to be safe
+      compilerAssert(isTupleValue(val));
+      if isTupleValue(accum) {
+        compilerAssert(isValIdxReduce(op));
+        compilerAssert(val[1].type == accum[1].type);
+
+      }
+      else {
+        compilerAssert(isValReduce(op));
+        compilerAssert(val[1].type == accum.type);
+      }
+
+      select op {
+        when "sum" do accum += val[1];
+        when "min" do accum = min(accum, val[1]);
+        when "max" do accum = max(accum, val[1]);
+        when "minloc" do
+          if accum[1] > val[1] then accum = (val[0]+baseOffset, val[1]);
+        when "maxloc" do
+          if accum[1] < val[1] then accum = (val[0]+baseOffset, val[1]);
+        otherwise do compilerError("Unknown reduction operation: ", op);
+      }
+    }
+
+    iter offsetsThatCanFitIn32Bits(size: int) {
+      // Engin: I've tried to get max(int(32)) to work as this bug is about CUB
+      // using `int` as the size in the interface. However, getting close to
+      // max(int(32)) also triggers the bug. So, I am choosing this as a
+      // round/safe value for the time being.
+      param chunkSize = 2_000_000_000;
+
+      use Math only divCeil;
+      const numChunks = divCeil(size, chunkSize);
+      const standardChunkSize = divCeil(size, numChunks);
+
+      if gpuDebugReduce then
+        writeln("Will use ", numChunks, " chunks of size ", standardChunkSize);
+
+      foreach chunk in 0..<numChunks {
+        const start = chunk*standardChunkSize;
+        const curChunkSize = if start+standardChunkSize <= size
+                               then standardChunkSize
+                               else size-start;
+        if gpuDebugReduce then
+          writef("Chunk %i: (start=%i, curChunkSize=%i) ", chunk, start,
+                 curChunkSize);
+
+        yield (start, curChunkSize);
+      }
+    }
+
+    use CTypes;
+
+    // find the extern function we'll use
+    param externFunc = getExternFuncName(op, t);
+    extern externFunc proc reduce_fn(data, size, ref val, ref idx);
+
+    // initialize the return value
+    var ret;
+    if isValReduce(op) {
+      var retTmp: t;
+      if op == "min" then retTmp = max(t);
+      else if op == "max" then retTmp = min(t);
+      ret = retTmp;
+    }
+    else if isValIdxReduce(op) {
+      var retTmp: (int, t);
+      if op == "minloc" then retTmp[1] = max(t);
+      else if op == "maxloc" then retTmp[1] = min(t);
+      ret = retTmp;
+    }
+    else {
+      compilerError("Unknown reduction operation: ", op);
+      ret = 0;
+    }
+
+    // perform the reduction
+    const basePtr = c_ptrToConst(A);
+    for (offset,size) in offsetsThatCanFitIn32Bits(A.size) {
+      var curIdx: int(32) = -1; // should remain -1 for sum, min, max
+      var curVal: t;
+      reduce_fn(basePtr+offset, size, curVal, curIdx);
+      subReduceValIdx(op, offset, ret, (curIdx, curVal));
+      if gpuDebugReduce then
+        writef(" (curIdx=%i curVal=%i ret=%?)\n", curIdx, curVal, ret);
+    }
+
+    if isValIdxReduce(op) then
+      ret[0] += A.domain.first;
+
+    return ret;
+  }
+
+  /*
+    Add all elements of an array together on the GPU (that is, perform a
+    sum-reduction). The array must be in GPU-accessible memory and the function
+    must be called from outside a GPU-eligible loop. Only arrays with int, uint,
+    and real types are supported. A simple example is the following:
+
+     .. code-block:: chapel
+
+       on here.gpus[0] {
+         var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
+         writeln(gpuSumReduce(Arr)); // 15
+       }
+  */
+  inline proc gpuSumReduce(const ref A: [] ?t) do return doGpuReduce("sum", A);
+
+  /*
+    Return the minimum element of an array on the GPU (that is, perform a
+    min-reduction). The array must be in GPU-accessible memory and the function
+    must be called from outside a GPU-eligible loop. Only arrays with int, uint,
+    and real types are supported. A simple example is the following:
+
+     .. code-block:: chapel
+
+       on here.gpus[0] {
+         var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
+         writeln(gpuMinReduce(Arr)); // 1
+       }
+  */
+  inline proc gpuMinReduce(const ref A: [] ?t) do return doGpuReduce("min", A);
+
+  /*
+    Return the maximum element of an array on the GPU (that is, perform a
+    max-reduction). The array must be in GPU-accessible memory and the function
+    must be called from outside a GPU-eligible loop. Only arrays with int, uint,
+    and real types are supported. A simple example is the following:
+
+     .. code-block:: chapel
+
+       on here.gpus[0] {
+         var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
+         writeln(gpuMaxReduce(Arr)); // 5
+       }
+  */
+  inline proc gpuMaxReduce(const ref A: [] ?t) do return doGpuReduce("max", A);
+
+  /*
+    For an array on the GPU, return a tuple with the index and the value of the
+    minimum element (that is, perform a minloc-reduction). If there are multiple
+    elements with the same minimum value, the index of the first one is
+    returned. The array must be in GPU-accessible memory and the function must
+    be called from outside a GPU-eligible loop.  Only arrays with int, uint, and
+    real types are supported. A simple example is the following:
+
+     .. code-block:: chapel
+
+       on here.gpus[0] {
+         var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
+         writeln(gpuMinLocReduce(Arr)); // (2, 1). Note that Arr[2]==1.
+       }
+  */
+  inline proc gpuMinLocReduce(const ref A: [] ?t) do return doGpuReduce("minloc", A);
+
+  /*
+    For an array on the GPU, return a tuple with the index and the value of the
+    maximum element (that is, perform a maxloc-reduction). If there are multiple
+    elements with the same maximum value, the index of the first one is
+    returned. The array must be in GPU-accessible memory and the function must
+    be called from outside a GPU-eligible loop.  Only arrays with int, uint, and
+    real types are supported. A simple example is the following:
+
+     .. code-block:: chapel
+
+       on here.gpus[0] {
+         var Arr = [3, 2, 1, 5, 4]; // will be GPU-accessible
+         writeln(gpuMaxLocReduce(Arr)); // (3, 5). Note that Arr[3]==5.
+       }
+  */
+  inline proc gpuMaxLocReduce(const ref A: [] ?t) do return doGpuReduce("maxloc", A);
+
 }
diff --git a/runtime/include/chpl-gpu-impl.h b/runtime/include/chpl-gpu-impl.h
index 1dbd1a4563b3..972c57a91e9c 100644
--- a/runtime/include/chpl-gpu-impl.h
+++ b/runtime/include/chpl-gpu-impl.h
@@ -76,6 +76,18 @@ void chpl_gpu_impl_stream_destroy(void* stream);
 bool chpl_gpu_impl_stream_ready(void* stream);
 void chpl_gpu_impl_stream_synchronize(void* stream);
 
+#define DECL_ONE_REDUCE_IMPL(chpl_kind, data_type) \
+void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
+                                                    data_type* val, int* idx,\
+                                                    void* stream);
+GPU_REDUCE(DECL_ONE_REDUCE_IMPL, sum)
+GPU_REDUCE(DECL_ONE_REDUCE_IMPL, min)
+GPU_REDUCE(DECL_ONE_REDUCE_IMPL, max)
+GPU_REDUCE(DECL_ONE_REDUCE_IMPL, minloc)
+GPU_REDUCE(DECL_ONE_REDUCE_IMPL, maxloc)
+
+#undef DECL_ONE_REDUCE_IMPL
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/runtime/include/chpl-gpu.h b/runtime/include/chpl-gpu.h
index 5fbc5276facc..189f77583246 100644
--- a/runtime/include/chpl-gpu.h
+++ b/runtime/include/chpl-gpu.h
@@ -24,6 +24,7 @@
 #include <stdbool.h>
 #include "chpl-tasks.h"
 #include "chpl-mem-desc.h"
+#include "gpu/chpl-gpu-reduce-util.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -149,6 +150,19 @@ size_t chpl_gpu_get_alloc_size(void* ptr);
 bool chpl_gpu_can_access_peer(int dev1, int dev2);
 void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable);
 
+#define DECL_ONE_REDUCE(chpl_kind, data_type) \
+void chpl_gpu_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
+                                               data_type* val, int* idx);
+
+GPU_REDUCE(DECL_ONE_REDUCE, sum);
+GPU_REDUCE(DECL_ONE_REDUCE, min);
+GPU_REDUCE(DECL_ONE_REDUCE, max);
+GPU_REDUCE(DECL_ONE_REDUCE, minloc);
+GPU_REDUCE(DECL_ONE_REDUCE, maxloc);
+
+#undef DECL_ONE_REDUCE
+
+
 #endif // HAS_GPU_LOCALE
 
 #ifdef __cplusplus
diff --git a/runtime/include/gpu/chpl-gpu-reduce-util.h b/runtime/include/gpu/chpl-gpu-reduce-util.h
new file mode 100644
index 000000000000..6076fb0b98a5
--- /dev/null
+++ b/runtime/include/gpu/chpl-gpu-reduce-util.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2020-2023 Hewlett Packard Enterprise Development LP
+ * Copyright 2004-2019 Cray Inc.
+ * Other additional copyright holders may be indicated within.  *
+ * The entirety of this work is licensed under the Apache License,
+ * Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAS_GPU_LOCALE
+
+#define GPU_IMPL_REDUCE(MACRO, impl_kind, chpl_kind) \
+  MACRO(impl_kind, chpl_kind, int8_t)  \
+  MACRO(impl_kind, chpl_kind, int16_t)  \
+  MACRO(impl_kind, chpl_kind, int32_t)  \
+  MACRO(impl_kind, chpl_kind, int64_t)  \
+  MACRO(impl_kind, chpl_kind, uint8_t)  \
+  MACRO(impl_kind, chpl_kind, uint16_t)  \
+  MACRO(impl_kind, chpl_kind, uint32_t)  \
+  MACRO(impl_kind, chpl_kind, uint64_t)  \
+  MACRO(impl_kind, chpl_kind, float)   \
+  MACRO(impl_kind, chpl_kind, double);
+
+#define GPU_REDUCE(MACRO, chpl_kind) \
+  MACRO(chpl_kind, int8_t)  \
+  MACRO(chpl_kind, int16_t)  \
+  MACRO(chpl_kind, int32_t)  \
+  MACRO(chpl_kind, int64_t)  \
+  MACRO(chpl_kind, uint8_t)  \
+  MACRO(chpl_kind, uint16_t)  \
+  MACRO(chpl_kind, uint32_t)  \
+  MACRO(chpl_kind, uint64_t)  \
+  MACRO(chpl_kind, float)   \
+  MACRO(chpl_kind, double);
+
+#endif // HAS_GPU_LOCALE
+
diff --git a/runtime/src/chpl-gpu.c b/runtime/src/chpl-gpu.c
index 6879eb43cce4..f8f808c07299 100644
--- a/runtime/src/chpl-gpu.c
+++ b/runtime/src/chpl-gpu.c
@@ -43,6 +43,8 @@ bool chpl_gpu_use_stream_per_task = true;
 #include "chpl-env.h"
 #include "chpl-comm-compiler-macros.h"
 
+#include "gpu/chpl-gpu-reduce-util.h"
+
 void chpl_gpu_init(void) {
   chpl_gpu_impl_init(&chpl_gpu_num_devices);
 
@@ -700,4 +702,31 @@ void chpl_gpu_set_peer_access(int dev1, int dev2, bool enable) {
   chpl_gpu_impl_set_peer_access(dev1, dev2, enable);
 }
 
+#define DEF_ONE_REDUCE(kind, data_type)\
+void chpl_gpu_##kind##_reduce_##data_type(data_type *data, int n, \
+                                          data_type* val, int* idx) { \
+  CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " called\n"); \
+  \
+  int dev = chpl_task_getRequestedSubloc(); \
+  chpl_gpu_impl_use_device(dev); \
+  void* stream = get_stream(dev); \
+  \
+  chpl_gpu_impl_##kind##_reduce_##data_type(data, n, val, idx, stream); \
+  \
+  if (chpl_gpu_sync_with_host) { \
+    CHPL_GPU_DEBUG("Eagerly synchronizing stream %p\n", stream); \
+    wait_stream(stream); \
+  } \
+  \
+  CHPL_GPU_DEBUG("chpl_gpu_" #kind "_reduce_" #data_type " returned\n"); \
+}
+
+GPU_REDUCE(DEF_ONE_REDUCE, sum)
+GPU_REDUCE(DEF_ONE_REDUCE, min)
+GPU_REDUCE(DEF_ONE_REDUCE, max)
+GPU_REDUCE(DEF_ONE_REDUCE, minloc)
+GPU_REDUCE(DEF_ONE_REDUCE, maxloc)
+
+#undef DEF_ONE_REDUCE
+
 #endif
diff --git a/runtime/src/gpu/amd/Makefile.include b/runtime/src/gpu/amd/Makefile.include
index 7ba37d695b24..7bce5c886202 100644
--- a/runtime/src/gpu/amd/Makefile.include
+++ b/runtime/src/gpu/amd/Makefile.include
@@ -21,5 +21,6 @@ GPU_SUBDIR = src/gpu/amd
 GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR)
 
 ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c
+ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc
 
 include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share
diff --git a/runtime/src/gpu/amd/Makefile.share b/runtime/src/gpu/amd/Makefile.share
index e8ad87a160ab..410b4f724e82 100644
--- a/runtime/src/gpu/amd/Makefile.share
+++ b/runtime/src/gpu/amd/Makefile.share
@@ -15,8 +15,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-SRCS = gpu-amd.c
+GPU_SRCS = gpu-amd-reduce.cc gpu-amd.c
 
-GPU_SRCS = gpu-amd.c
+SRCS = $(GPU_SRCS)
 
-GPU_OBJS = $(GPU_SRCS:%.c=$(GPU_OBJDIR)/%.o)
+GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS))))
+
+RUNTIME_CXXFLAGS += -x hip
+
+$(RUNTIME_OBJ_DIR)/gpu-amd-reduce.o: gpu-amd-reduce.cc \
+                                         $(RUNTIME_OBJ_DIR_STAMP)
+	$(CXX) -c -std=c++14 $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $<
diff --git a/runtime/src/gpu/amd/gpu-amd-reduce.cc b/runtime/src/gpu/amd/gpu-amd-reduce.cc
new file mode 100644
index 000000000000..638a15451427
--- /dev/null
+++ b/runtime/src/gpu/amd/gpu-amd-reduce.cc
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2020-2023 Hewlett Packard Enterprise Development LP
+ * Copyright 2004-2019 Cray Inc.
+ * Other additional copyright holders may be indicated within.  *
+ * The entirety of this work is licensed under the Apache License,
+ * Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAS_GPU_LOCALE
+
+/* TODO uncomment these when the implementations are in
+#include <hip/hip_common.h>
+#include <hipcub/hipcub.hpp>
+#include <rocm_version.h>
+*/
+
+#include "chpl-gpu.h"
+#include "chpl-gpu-impl.h"
+#include "gpu/chpl-gpu-reduce-util.h"
+
+// Engin: I can't get neither hipCUB nor rocprim to work. (hipCUB is a light
+// wrapper around rocprim anyways). I filed
+// https://github.com/ROCmSoftwarePlatform/hipCUB/issues/304, but I don't know
+// if/when I'll hear back something. For now, I am merging the code that's
+// supposed to work but doesn't instead of removing them from my branch.
+#if 1
+#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
+void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
+                                                    data_type* val, int* idx,\
+                                                    void* stream) {\
+  chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\
+}
+#elif ROCM_VERSION_MAJOR >= 5
+#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
+void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
+                                                    data_type* val, int* idx,\
+                                                    void* stream) {\
+  data_type* result; \
+  ROCM_CALL(hipMalloc(&result, sizeof(data_type)));\
+  void* temp = NULL; \
+  size_t temp_bytes = 0; \
+  ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\
+                                  0, true));\
+  ROCM_CALL(hipMalloc(((hipDeviceptr_t*)&temp), temp_bytes)); \
+  ROCM_CALL(hipcub::DeviceReduce::impl_kind(temp, temp_bytes, data, (data_type*)result, n,\
+                                  0, true));\
+  ROCM_CALL(hipMemcpyDtoHAsync(val, result, sizeof(data_type),\
+                              (hipStream_t)stream)); \
+}
+#else
+#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
+void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
+                                                    data_type* val,\
+                                                    void* stream) {\
+  chpl_internal_error("Reduction is not supported with AMD GPUs using ROCm version <5\n");\
+}
+#endif // 1
+
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum)
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min)
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max)
+
+#undef DEF_ONE_REDUCE_RET_VAL
+
+#if 1
+#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \
+void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
+                                                    data_type* val, int* idx,\
+                                                    void* stream) {\
+  chpl_internal_error("This function shouldn't have been called. Reduction is not supported with AMD GPUs\n");\
+}
+#else
+#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \
+void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
+                                                    data_type* val, int* idx,\
+                                                    void* stream) {\
+  // TODO I don't know any other specific issues with these versions. Should be
+  // able to whip up the implementation quickly once we figure out what's going
+  // wrong here.
+  chpl_internal_error("Unimplemented");
+}
+#endif // 1
+
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc)
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc)
+
+#undef DEF_ONE_REDUCE_RET_VAL_IDX
+
+#undef DEF_REDUCE
+
+#endif // HAS_GPU_LOCALE
+
diff --git a/runtime/src/gpu/cpu/gpu-cpu.c b/runtime/src/gpu/cpu/gpu-cpu.c
index e76f0dcb359d..a3ae1754e57c 100644
--- a/runtime/src/gpu/cpu/gpu-cpu.c
+++ b/runtime/src/gpu/cpu/gpu-cpu.c
@@ -157,4 +157,33 @@ bool chpl_gpu_impl_stream_ready(void* stream) {
 void chpl_gpu_impl_stream_synchronize(void* stream) {
 }
 
+#define DEF_ONE_REDUCE_RET_VAL(impl_kind, chpl_kind, data_type) \
+void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
+                                                    data_type* val,\
+                                                    void* stream) {\
+  chpl_internal_error("This function shouldn't have been called. "\
+                      "cpu-as-device mode should handle reductions in "\
+                      "the module code\n");\
+}
+
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum)
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min)
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max)
+
+#undef DEF_ONE_REDUCE_RET_VAL
+
+#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \
+void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
+                                                    data_type* val, int* idx,\
+                                                    void* stream) {\
+  chpl_internal_error("This function shouldn't have been called. "\
+                      "cpu-as-device mode should handle reductions in "\
+                      "the module code\n");\
+}
+
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc)
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc)
+
+#undef DEF_ONE_REDUCE_RET_VAL_IDX
+
 #endif // HAS_GPU_LOCALE
diff --git a/runtime/src/gpu/nvidia/Makefile.include b/runtime/src/gpu/nvidia/Makefile.include
index cb4da19051a8..9e0f8711b311 100644
--- a/runtime/src/gpu/nvidia/Makefile.include
+++ b/runtime/src/gpu/nvidia/Makefile.include
@@ -21,5 +21,7 @@ GPU_SUBDIR = src/gpu/nvidia
 GPU_OBJDIR = $(RUNTIME_BUILD)/$(GPU_SUBDIR)
 
 ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.c
+ALL_SRCS += $(CURDIR)/$(GPU_SUBDIR)/*.cc
+
 
 include $(RUNTIME_ROOT)/$(GPU_SUBDIR)/Makefile.share
diff --git a/runtime/src/gpu/nvidia/Makefile.share b/runtime/src/gpu/nvidia/Makefile.share
index 36d2debdee0d..a0c109e22f0d 100644
--- a/runtime/src/gpu/nvidia/Makefile.share
+++ b/runtime/src/gpu/nvidia/Makefile.share
@@ -15,8 +15,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-SRCS = gpu-nvidia.c
+GPU_SRCS = gpu-nvidia.c gpu-nvidia-reduce.cc
 
-GPU_SRCS = gpu-nvidia.c
+SRCS = $(GPU_SRCS)
 
-GPU_OBJS = $(GPU_SRCS:%.c=$(GPU_OBJDIR)/%.o)
+GPU_OBJS = $(addprefix $(GPU_OBJDIR)/,$(addsuffix .o,$(basename $(GPU_SRCS))))
+
+RUNTIME_CXXFLAGS += -x cuda -Wno-unknown-cuda-version
+
+$(RUNTIME_OBJ_DIR)/gpu-nvidia-reduce.o: gpu-nvidia-reduce.cc \
+                                         $(RUNTIME_OBJ_DIR_STAMP)
+	$(CXX) -c $(CXX11_STD) $(RUNTIME_CXXFLAGS) $(RUNTIME_INCLS) -o $@ $<
diff --git a/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc
new file mode 100644
index 000000000000..daae4dce16b6
--- /dev/null
+++ b/runtime/src/gpu/nvidia/gpu-nvidia-reduce.cc
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2020-2023 Hewlett Packard Enterprise Development LP
+ * Copyright 2004-2019 Cray Inc.
+ * Other additional copyright holders may be indicated within.  *
+ * The entirety of this work is licensed under the Apache License,
+ * Version 2.0 (the "License"); you may not use this file except
+ * in compliance with the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef HAS_GPU_LOCALE
+
+#include <cuda.h>
+#include <cub/cub.cuh>
+
+#include "chpl-gpu.h"
+#include "chpl-gpu-impl.h"
+#include "../common/cuda-utils.h"
+#include "gpu/chpl-gpu-reduce-util.h"
+
+// this version doesn't do anything with `idx`. Having a unified interface makes
+// the implementation in the rest of the runtime and the modules more
+// straightforward.
+#define DEF_ONE_REDUCE_RET_VAL(cub_kind, chpl_kind, data_type) \
+void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
+                                                    data_type* val, int* idx,\
+                                                    void* stream) {\
+  CUdeviceptr result; \
+  CUDA_CALL(cuMemAlloc(&result, sizeof(data_type))); \
+  void* temp = NULL; \
+  size_t temp_bytes = 0; \
+  cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\
+                              (CUstream)stream); \
+  CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \
+  cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (data_type*)result, n,\
+                              (CUstream)stream); \
+  CUDA_CALL(cuMemcpyDtoHAsync(val, result, sizeof(data_type),\
+                              (CUstream)stream)); \
+  CUDA_CALL(cuMemFree(result)); \
+}
+
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Sum, sum)
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Min, min)
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL, Max, max)
+
+#undef DEF_ONE_REDUCE_RET_VAL
+
+#define DEF_ONE_REDUCE_RET_VAL_IDX(cub_kind, chpl_kind, data_type) \
+void chpl_gpu_impl_##chpl_kind##_reduce_##data_type(data_type* data, int n,\
+                                                    data_type* val, int* idx,\
+                                                    void* stream) {\
+  using kvp = cub::KeyValuePair<int,data_type>; \
+  CUdeviceptr result; \
+  CUDA_CALL(cuMemAlloc(&result, sizeof(kvp))); \
+  void* temp = NULL; \
+  size_t temp_bytes = 0; \
+  cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\
+                              (CUstream)stream);\
+  CUDA_CALL(cuMemAlloc(((CUdeviceptr*)&temp), temp_bytes)); \
+  cub::DeviceReduce::cub_kind(temp, temp_bytes, data, (kvp*)result, n,\
+                              (CUstream)stream);\
+  kvp result_host; \
+  CUDA_CALL(cuMemcpyDtoHAsync(&result_host, result, sizeof(kvp),\
+                              (CUstream)stream)); \
+  *val = result_host.value; \
+  *idx = result_host.key; \
+  CUDA_CALL(cuMemFree(result)); \
+}
+
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMin, minloc)
+GPU_IMPL_REDUCE(DEF_ONE_REDUCE_RET_VAL_IDX, ArgMax, maxloc)
+
+#undef DEF_ONE_REDUCE_RET_VAL_IDX
+
+#undef DEF_REDUCE
+
+#endif // HAS_GPU_LOCALE
+
diff --git a/test/gpu/native/noAmd.skipif b/test/gpu/native/noAmd.skipif
new file mode 100644
index 000000000000..75268eb04f42
--- /dev/null
+++ b/test/gpu/native/noAmd.skipif
@@ -0,0 +1,2 @@
+# Reductions are not supported with AMD gpus yet.
+CHPL_GPU==amd
diff --git a/test/gpu/native/noAmd/reduction/basic.chpl b/test/gpu/native/noAmd/reduction/basic.chpl
new file mode 100644
index 000000000000..67a2d4d3251d
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/basic.chpl
@@ -0,0 +1,44 @@
+use GPU;
+use ChplConfig;
+
+config const n = 100;
+
+proc testType(type t) {
+  proc test(param op: string, type t) {
+    on here.gpus[0] {
+      var Arr: [0..#n] t;
+
+      foreach i in Arr.domain do Arr[i] = i:t;
+
+      var res;
+      select op {
+        when "sum" do res=gpuSumReduce(Arr);
+        when "min" do res=gpuMinReduce(Arr);
+        when "max" do res=gpuMaxReduce(Arr);
+        when "minloc" do res=gpuMinLocReduce(Arr);
+        when "maxloc" do res=gpuMaxLocReduce(Arr);
+      }
+
+      writeln(op, ": ", res);
+    }
+  }
+
+  writeln("Testing type ", t:string);
+  test("sum", t);
+  test("min", t);
+  test("max", t);
+  test("minloc", t);
+  test("maxloc", t);
+  writeln();
+}
+
+testType(int(8));
+testType(int(16));
+testType(int(32));
+testType(int(64));
+testType(uint(8));
+testType(uint(16));
+testType(uint(32));
+testType(uint(64));
+testType(real(32));
+testType(real(64));
diff --git a/test/gpu/native/noAmd/reduction/basic.good b/test/gpu/native/noAmd/reduction/basic.good
new file mode 100644
index 000000000000..4f9386297b60
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/basic.good
@@ -0,0 +1,71 @@
+warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly
+Testing type int(8)
+sum: 86
+min: 0
+max: 99
+minloc: (0, 0)
+maxloc: (99, 99)
+
+Testing type int(16)
+sum: 4950
+min: 0
+max: 99
+minloc: (0, 0)
+maxloc: (99, 99)
+
+Testing type int(32)
+sum: 4950
+min: 0
+max: 99
+minloc: (0, 0)
+maxloc: (99, 99)
+
+Testing type int(64)
+sum: 4950
+min: 0
+max: 99
+minloc: (0, 0)
+maxloc: (99, 99)
+
+Testing type uint(8)
+sum: 86
+min: 0
+max: 99
+minloc: (0, 0)
+maxloc: (99, 99)
+
+Testing type uint(16)
+sum: 4950
+min: 0
+max: 99
+minloc: (0, 0)
+maxloc: (99, 99)
+
+Testing type uint(32)
+sum: 4950
+min: 0
+max: 99
+minloc: (0, 0)
+maxloc: (99, 99)
+
+Testing type uint(64)
+sum: 4950
+min: 0
+max: 99
+minloc: (0, 0)
+maxloc: (99, 99)
+
+Testing type real(32)
+sum: 4950.0
+min: 0.0
+max: 99.0
+minloc: (0, 0.0)
+maxloc: (99, 99.0)
+
+Testing type real(64)
+sum: 4950.0
+min: 0.0
+max: 99.0
+minloc: (0, 0.0)
+maxloc: (99, 99.0)
+
diff --git a/test/gpu/native/noAmd/reduction/largeArrays.chpl b/test/gpu/native/noAmd/reduction/largeArrays.chpl
new file mode 100644
index 000000000000..5a7924d2961f
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/largeArrays.chpl
@@ -0,0 +1,19 @@
+use GPU;
+
+config const printResult = false;
+config const n = 100;
+
+var result: uint(8);
+on here.gpus[0] {
+  var Arr: [0..#n] uint(8) = 1;
+
+  result = gpuSumReduce(Arr);
+}
+
+if printResult then writeln("Result: ", result);
+
+// it is all 1's initially. It'll certainly overflow and the remainder will be
+// the expected return value
+const expected = n%(max(uint(8))+1);
+if result != expected then
+  writef("Invalid result. Expected %u, actual %u\n", expected, result);
diff --git a/test/gpu/native/noAmd/reduction/largeArrays.execopts b/test/gpu/native/noAmd/reduction/largeArrays.execopts
new file mode 100644
index 000000000000..0af28add7fca
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/largeArrays.execopts
@@ -0,0 +1,6 @@
+--n=2_000_000_000
+--n=2_000_000_001
+--n=2_147_483_647
+--n=4_294_967_293
+--n=4_294_967_294
+--n=4_294_967_295
diff --git a/test/gpu/native/noAmd/reduction/largeArrays.good b/test/gpu/native/noAmd/reduction/largeArrays.good
new file mode 100644
index 000000000000..abf41f07fd48
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/largeArrays.good
@@ -0,0 +1 @@
+warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly
diff --git a/test/gpu/native/noAmd/reduction/largeArrays.skipif b/test/gpu/native/noAmd/reduction/largeArrays.skipif
new file mode 100644
index 000000000000..dc81184aa7f6
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/largeArrays.skipif
@@ -0,0 +1,4 @@
+# this test is to exercise a code path that will never be run with
+# cpu-as-device. Moreover, the large size makes this mode take too long to
+# finish. So, I am skipping this test.
+CHPL_GPU==cpu
diff --git a/test/gpu/native/noAmd/reduction/largeArraysMinMax.chpl b/test/gpu/native/noAmd/reduction/largeArraysMinMax.chpl
new file mode 100644
index 000000000000..dc660bc1d0f7
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/largeArraysMinMax.chpl
@@ -0,0 +1,36 @@
+use GPU;
+
+config param withLoc = false;
+
+config const kind = "min";
+const isMin = kind=="min";
+assert(isMin || kind=="max");
+
+config const n = 2*max(int(32));
+config const setIdx = n-1;
+assert(n>setIdx);
+
+config const printResult = false;
+
+const expectedVal = if isMin then 7:uint(8) else 13:uint(8);
+const expected = if withLoc then (setIdx, expectedVal) else expectedVal;
+
+inline proc doReduce(Arr) {
+  if withLoc then
+    return if isMin then gpuMinLocReduce(Arr) else gpuMaxLocReduce(Arr);
+  else
+    return if isMin then gpuMinReduce(Arr) else gpuMaxReduce(Arr);
+}
+
+var result: if withLoc then (int, uint(8)) else uint(8);
+on here.gpus[0] {
+  var Arr: [0..#n] uint(8) = 10;
+  Arr[setIdx] = expectedVal;
+
+  result = doReduce(Arr);
+}
+
+if printResult then writeln("Result: ", result);
+
+if result != expected then
+  writef("Invalid result. Expected %?, actual %?\n", expected, result);
diff --git a/test/gpu/native/noAmd/reduction/largeArraysMinMax.compopts b/test/gpu/native/noAmd/reduction/largeArraysMinMax.compopts
new file mode 100644
index 000000000000..087ba6a454e9
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/largeArraysMinMax.compopts
@@ -0,0 +1,2 @@
+-swithLoc=false
+-swithLoc=true
diff --git a/test/gpu/native/noAmd/reduction/largeArraysMinMax.execopts b/test/gpu/native/noAmd/reduction/largeArraysMinMax.execopts
new file mode 100644
index 000000000000..2742bc714a67
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/largeArraysMinMax.execopts
@@ -0,0 +1,7 @@
+ #largeArraysMinMax
+--setIdx=0
+--setIdx=1_999_999_999
+--setIdx=2_000_000_000
+--setIdx=2_000_000_001
+--setIdx=2_147_483_647
+
diff --git a/test/gpu/native/noAmd/reduction/largeArraysMinMax.good b/test/gpu/native/noAmd/reduction/largeArraysMinMax.good
new file mode 100644
index 000000000000..abf41f07fd48
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/largeArraysMinMax.good
@@ -0,0 +1 @@
+warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly
diff --git a/test/gpu/native/noAmd/reduction/largeArraysMinMax.skipif b/test/gpu/native/noAmd/reduction/largeArraysMinMax.skipif
new file mode 100644
index 000000000000..dc81184aa7f6
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/largeArraysMinMax.skipif
@@ -0,0 +1,4 @@
+# this test is to exercise a code path that will never be run with
+# cpu-as-device. Moreover, the large size makes this mode take too long to
+# finish. So, I am skipping this test.
+CHPL_GPU==cpu
diff --git a/test/gpu/native/noAmd/reduction/nonZeroBased.chpl b/test/gpu/native/noAmd/reduction/nonZeroBased.chpl
new file mode 100644
index 000000000000..ae337b07f240
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/nonZeroBased.chpl
@@ -0,0 +1,21 @@
+use GPU;
+use Random;
+
+proc test(ref Arr) {
+  fillRandom(Arr, min=0, max=100, seed=17);
+
+  writeln(Arr);
+  writeln(gpuMinLocReduce(Arr));
+  writeln(gpuMaxLocReduce(Arr));
+}
+
+on here.gpus[0] {
+  var A: [-20..#10] int;
+  test(A);
+
+  var B: [0..#10] int;
+  test(B);
+
+  var C: [10..#10] int;
+  test(C);
+}
diff --git a/test/gpu/native/noAmd/reduction/nonZeroBased.good b/test/gpu/native/noAmd/reduction/nonZeroBased.good
new file mode 100644
index 000000000000..7f00b4427b24
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/nonZeroBased.good
@@ -0,0 +1,10 @@
+warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly
+64 6 5 12 14 91 11 91 11 58
+(-18, 5)
+(-15, 91)
+64 6 5 12 14 91 11 91 11 58
+(2, 5)
+(5, 91)
+64 6 5 12 14 91 11 91 11 58
+(12, 5)
+(15, 91)
diff --git a/test/gpu/native/noAmd/reduction/reduceThroughput.chpl b/test/gpu/native/noAmd/reduction/reduceThroughput.chpl
new file mode 100644
index 000000000000..2fe0a196dd65
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/reduceThroughput.chpl
@@ -0,0 +1,39 @@
+use GPU;
+use Time;
+use ChplConfig;
+
+config type dataType = int;
+config param useGpu = CHPL_LOCALE_MODEL == "gpu";
+
+config const n = 100;
+config const reportPerf = true;
+
+var t: stopwatch;
+
+inline proc doSumReduce(const ref Arr) {
+  if useGpu then
+    return gpuSumReduce(Arr);
+  else {
+    return + reduce Arr;
+  }
+}
+
+writeln("Using ", if useGpu then "gpu" else "cpu");
+
+on if useGpu then here.gpus[0] else here {
+  var Arr: [0..<n] dataType;
+
+  forall i in Arr.domain do Arr[i] = (i%100):dataType;
+
+  var result;
+  t.start();
+  result = doSumReduce(Arr);
+  t.stop();
+  writeln(result);
+}
+
+if reportPerf {
+  writeln("Time (s): ", t.elapsed());
+  writeln("Throughput (GiOP/s): ", n/t.elapsed()/(1<<30));
+  writeln("Throughput (GiB/s): ", numBytes(dataType)*n/t.elapsed()/(1<<30));
+}
diff --git a/test/gpu/native/noAmd/reduction/reduceThroughput.execopts b/test/gpu/native/noAmd/reduction/reduceThroughput.execopts
new file mode 100644
index 000000000000..367868626392
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/reduceThroughput.execopts
@@ -0,0 +1 @@
+--reportPerf=false
diff --git a/test/gpu/native/noAmd/reduction/reduceThroughput.good b/test/gpu/native/noAmd/reduction/reduceThroughput.good
new file mode 100644
index 000000000000..4b12519e9850
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/reduceThroughput.good
@@ -0,0 +1,3 @@
+warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly
+Using gpu
+4950
diff --git a/test/gpu/native/noAmd/reduction/stringError.chpl b/test/gpu/native/noAmd/reduction/stringError.chpl
new file mode 100644
index 000000000000..9045408aed4b
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/stringError.chpl
@@ -0,0 +1,7 @@
+use GPU;
+
+on here.gpus[0] {
+  var Arr: [1..10] string;
+
+  writeln(gpuSumReduce(Arr));
+}
diff --git a/test/gpu/native/noAmd/reduction/stringError.good b/test/gpu/native/noAmd/reduction/stringError.good
new file mode 100644
index 000000000000..b08203390ef8
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/stringError.good
@@ -0,0 +1,5 @@
+warning: The prototype GPU support implies --no-checks. This may impact debuggability. To suppress this warning, compile with --no-checks explicitly
+$CHPL_HOME/modules/standard/GPU.chpl:prediffed: In function 'doGpuReduce':
+$CHPL_HOME/modules/standard/GPU.chpl:prediffed: error: Arrays with string elements cannot be reduced
+  $CHPL_HOME/modules/standard/GPU.chpl:prediffed: called as doGpuReduce(param op = "sum", A: [domain(1,int(64),one)] string) from function 'gpuSumReduce'
+  stringError.chpl:prediffed: called as gpuSumReduce(A: [domain(1,int(64),one)] string)
diff --git a/test/gpu/native/noAmd/reduction/stringError.prediff b/test/gpu/native/noAmd/reduction/stringError.prediff
new file mode 100755
index 000000000000..082c5199fbe7
--- /dev/null
+++ b/test/gpu/native/noAmd/reduction/stringError.prediff
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+sed -e "s/\.chpl:[0-9]\+:/.chpl:prediffed:/" $2 > $2.tmp
+mv $2.tmp $2
diff --git a/util/chplenv/compile_link_args_utils.py b/util/chplenv/compile_link_args_utils.py
index fc9b325da3df..b1b46bd05d26 100644
--- a/util/chplenv/compile_link_args_utils.py
+++ b/util/chplenv/compile_link_args_utils.py
@@ -69,6 +69,14 @@ def get_runtime_includes_and_defines():
             system.append("-isystem" + os.path.join(sdk_path, "hip", "include"))
             system.append("-isystem" + os.path.join(sdk_path, "hsa", "include"))
 
+
+            # We need runtime to use the lld that ships with the rocm
+            # installation so that we can properly link hipCUB. I believe this
+            # requirement is coming from using "-x hip" with clang
+            lld_path = os.path.join(sdk_path, "llvm/bin")
+            system.append("-B " + lld_path)
+            bundled.append("-B " + lld_path)
+
     if mem == "jemalloc":
         # set -DCHPL_JEMALLOC_PREFIX=chpl_je_
         # this is needed since it affects code inside of headers