From ab69139ade39a176d1ea938a05163d1883eaee9f Mon Sep 17 00:00:00 2001
From: ImmanuelSegol <3ditds@gmail.com>
Date: Sun, 16 Jul 2023 07:31:41 -0400
Subject: [PATCH] Goicicle (#77)

---
 .gitignore | 3 +
 curve_parameters/bn254.json | 2 +-
 curve_parameters/new_curve_script.py | 2 +-
 goicicle/Makefile | 29 +
 goicicle/README.md | 49 +
 goicicle/curves/bls12377/g1.go | 505 +
 goicicle/curves/bls12377/g2.go | 109 +
 goicicle/curves/bls12377/msm.go | 90 +
 goicicle/curves/bls12377/ntt.go | 73 +
 goicicle/curves/bls12377/utils.go | 25 +
 goicicle/curves/bls12377/utils_test.go | 81 +
 goicicle/curves/bls12381/g1.go | 505 +
 goicicle/curves/bls12381/g2.go | 108 +
 goicicle/curves/bls12381/msm.go | 90 +
 goicicle/curves/bls12381/ntt.go | 73 +
 goicicle/curves/bls12381/utils.go | 25 +
 goicicle/curves/bls12381/utils_test.go | 81 +
 goicicle/curves/bn254/g1.go | 503 +
 goicicle/curves/bn254/g1_test.go | 229 +
 goicicle/curves/bn254/g2.go | 235 +
 goicicle/curves/bn254/g2_test.go | 18 +
 goicicle/curves/bn254/msm.go | 187 +
 goicicle/curves/bn254/msm_test.go | 391 +
 goicicle/curves/bn254/ntt.go | 202 +
 goicicle/curves/bn254/ntt_test.go | 219 +
 goicicle/curves/bn254/utils.go | 34 +
 goicicle/curves/bn254/utils_test.go | 81 +
 goicicle/curves/bn254/vec_mod.go | 41 +
 goicicle/go.mod | 20 +
 goicicle/go.sum | 25 +
 goicicle/goicicle.go | 58 +
 goicicle/templates/curves/curves.go | 37 +
 goicicle/templates/curves/g1.go.tmpl | 469 +
 goicicle/templates/curves/g2.go.tmpl | 83 +
 goicicle/templates/curves/imports.go.tmpl | 34 +
 goicicle/templates/hfiles/c_api.h.tmpl | 15 +
 goicicle/templates/hfiles/msm.h.tmpl | 35 +
 goicicle/templates/hfiles/ntt.h.tmpl | 27 +
 goicicle/templates/hfiles/ve_mod_mult.h.tmpl | 24 +
 goicicle/templates/main.go | 161 +
 goicicle/templates/msm/msm.go.tmpl | 71 +
 goicicle/templates/ntt/ntt.go.tmpl | 54 +
 icicle/CMakeLists.txt | 8 +-
 icicle/appUtils/msm/msm.cu | 1385 +-
 icicle/appUtils/msm/msm.cuh | 4 +-
 icicle/appUtils/msm/tests/msm_test.cu | 139 +-
 icicle/appUtils/ntt/lde.cu | 64 +-
 icicle/appUtils/ntt/ntt.cuh | 38 +-
 .../vector_manipulation/ve_mod_mult.cuh | 25 +
 icicle/curves/bls12_377/c_api.h | 33 +
 icicle/curves/bls12_377/curve_config.cuh | 3 +
 icicle/curves/bls12_377/lde.cu | 19 +-
 icicle/curves/bls12_377/msm.cu | 4 +-
 icicle/curves/bls12_377/msm.h | 53 +
 icicle/curves/bls12_377/ntt.h | 44 +
 icicle/curves/bls12_377/params.cuh | 2 +-
 icicle/curves/bls12_377/ve_mod_mult.h | 41 +
 icicle/curves/bls12_381/c_api.h | 32 +
 icicle/curves/bls12_381/curve_config.cuh | 3 +
 icicle/curves/bls12_381/lde.cu | 26 +-
 icicle/curves/bls12_381/msm.cu | 4 +-
 icicle/curves/bls12_381/msm.h | 53 +
 icicle/curves/bls12_381/ntt.h | 44 +
 icicle/curves/bls12_381/params.cuh | 3 +
 .../curves/bls12_381/supported_operations.cu | 2 +-
 icicle/curves/bls12_381/ve_mod_mult.h | 41 +
 icicle/curves/bn254/c_api.h | 34 +
 icicle/curves/bn254/cuda.h | 14752 ++++++++++++++++
 icicle/curves/bn254/cuda_runtime.h | 2039 +++
 icicle/curves/bn254/curve_config.cuh | 3 +
 icicle/curves/bn254/lde.cu | 238 +-
 icicle/curves/bn254/msm.cu | 190 +-
 icicle/curves/bn254/msm.h | 62 +
 icicle/curves/bn254/ntt.h | 68 +
 icicle/curves/bn254/params.cuh | 113 +-
 icicle/curves/bn254/projective.cu | 2 +-
 icicle/curves/bn254/ve_mod_mult.cu | 15 +
 icicle/curves/bn254/ve_mod_mult.h | 41 +
 icicle/curves/curve_template/lde.cu | 49 +-
 icicle/curves/curve_template/msm.cu | 4 +-
 icicle/curves/curve_template/projective.cu | 2 +-
 icicle/primitives/extension_field.cuh | 69 +-
 icicle/primitives/field.cuh | 511 +-
 icicle/primitives/projective.cuh | 131 +-
 icicle/primitives/test.cu | 331 +-
icicle/primitives/test_kernels.cuh | 112 +- icicle/utils/mont.cuh | 25 + src/test_bn254.rs | 64 +- 88 files changed, 25529 insertions(+), 499 deletions(-) create mode 100644 goicicle/Makefile create mode 100644 goicicle/README.md create mode 100644 goicicle/curves/bls12377/g1.go create mode 100644 goicicle/curves/bls12377/g2.go create mode 100644 goicicle/curves/bls12377/msm.go create mode 100644 goicicle/curves/bls12377/ntt.go create mode 100644 goicicle/curves/bls12377/utils.go create mode 100644 goicicle/curves/bls12377/utils_test.go create mode 100644 goicicle/curves/bls12381/g1.go create mode 100644 goicicle/curves/bls12381/g2.go create mode 100644 goicicle/curves/bls12381/msm.go create mode 100644 goicicle/curves/bls12381/ntt.go create mode 100644 goicicle/curves/bls12381/utils.go create mode 100644 goicicle/curves/bls12381/utils_test.go create mode 100644 goicicle/curves/bn254/g1.go create mode 100644 goicicle/curves/bn254/g1_test.go create mode 100644 goicicle/curves/bn254/g2.go create mode 100644 goicicle/curves/bn254/g2_test.go create mode 100644 goicicle/curves/bn254/msm.go create mode 100644 goicicle/curves/bn254/msm_test.go create mode 100644 goicicle/curves/bn254/ntt.go create mode 100644 goicicle/curves/bn254/ntt_test.go create mode 100644 goicicle/curves/bn254/utils.go create mode 100644 goicicle/curves/bn254/utils_test.go create mode 100644 goicicle/curves/bn254/vec_mod.go create mode 100644 goicicle/go.mod create mode 100644 goicicle/go.sum create mode 100644 goicicle/goicicle.go create mode 100644 goicicle/templates/curves/curves.go create mode 100644 goicicle/templates/curves/g1.go.tmpl create mode 100644 goicicle/templates/curves/g2.go.tmpl create mode 100644 goicicle/templates/curves/imports.go.tmpl create mode 100644 goicicle/templates/hfiles/c_api.h.tmpl create mode 100644 goicicle/templates/hfiles/msm.h.tmpl create mode 100644 goicicle/templates/hfiles/ntt.h.tmpl create mode 100644 goicicle/templates/hfiles/ve_mod_mult.h.tmpl create mode 100644 goicicle/templates/main.go create mode 100644 goicicle/templates/msm/msm.go.tmpl create mode 100644 goicicle/templates/ntt/ntt.go.tmpl create mode 100644 icicle/curves/bls12_377/c_api.h create mode 100644 icicle/curves/bls12_377/msm.h create mode 100644 icicle/curves/bls12_377/ntt.h create mode 100644 icicle/curves/bls12_377/ve_mod_mult.h create mode 100644 icicle/curves/bls12_381/c_api.h create mode 100644 icicle/curves/bls12_381/msm.h create mode 100644 icicle/curves/bls12_381/ntt.h create mode 100644 icicle/curves/bls12_381/ve_mod_mult.h create mode 100644 icicle/curves/bn254/c_api.h create mode 100644 icicle/curves/bn254/cuda.h create mode 100644 icicle/curves/bn254/cuda_runtime.h create mode 100644 icicle/curves/bn254/msm.h create mode 100644 icicle/curves/bn254/ntt.h create mode 100644 icicle/curves/bn254/ve_mod_mult.h create mode 100644 icicle/utils/mont.cuh diff --git a/.gitignore b/.gitignore index c8634e3e2..01989aace 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ *.cubin *.bin *.fatbin +*.so +*.nsys-rep +*.ncu-rep **/target **/.vscode **/.*lock*csv# diff --git a/curve_parameters/bn254.json b/curve_parameters/bn254.json index ecacfd71a..4fcaa16d0 100644 --- a/curve_parameters/bn254.json +++ b/curve_parameters/bn254.json @@ -7,7 +7,7 @@ "modulus_q" : 21888242871839275222246405745257275088696311157297823662689037894645226208583, "bit_count_q" : 254, "limb_q" : 8, - "root_of_unity" : 19103219067921713944291392827692070036145651957329286315305642004821462161904, + "root_of_unity": 
19103219067921713944291392827692070036145651957329286315305642004821462161904, "weierstrass_b" : 3, "weierstrass_b_g2_re" : 19485874751759354771024239261021720505790618469301721065564631296452457478373, "weierstrass_b_g2_im" : 266929791119991161246907387137283842545076965332900288569378510910307636690, diff --git a/curve_parameters/new_curve_script.py b/curve_parameters/new_curve_script.py index cbf206fe1..923caf2a2 100644 --- a/curve_parameters/new_curve_script.py +++ b/curve_parameters/new_curve_script.py @@ -313,4 +313,4 @@ def get_params(config): with open('./src/lib.rs', 'r+') as f: lib_text = f.read() if lib_text.find(curve_name_lower) == -1: - f.write('\npub mod ' + curve_name_lower + ';') \ No newline at end of file + f.write('\npub mod ' + curve_name_lower + ';') diff --git a/goicicle/Makefile b/goicicle/Makefile new file mode 100644 index 000000000..c8193dd3c --- /dev/null +++ b/goicicle/Makefile @@ -0,0 +1,29 @@ +CUDA_ROOT_DIR = /usr/local/cuda +NVCC = $(CUDA_ROOT_DIR)/bin/nvcc +CFLAGS = -Xcompiler -fPIC -std=c++17 +LDFLAGS = -shared +FEATURES = -DG2_DEFINED + +TARGET_BN254 = libbn254.so +TARGET_BLS12_381 = libbls12_381.so +TARGET_BLS12_377 = libbls12_377.so + +VPATH = ../icicle/curves/bn254:../icicle/curves/bls12_377:../icicle/curves/bls12_381 + +SRCS_BN254 = lde.cu msm.cu projective.cu ve_mod_mult.cu +SRCS_BLS12_381 = lde.cu msm.cu projective.cu ve_mod_mult.cu poseidon.cu +SRCS_BLS12_377 = lde.cu msm.cu projective.cu ve_mod_mult.cu + +all: $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) + +$(TARGET_BN254): + $(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bn254/, $(SRCS_BN254)) -o $@ + +$(TARGET_BLS12_381): + $(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_381/, $(SRCS_BLS12_381)) -o $@ + +$(TARGET_BLS12_377): + $(NVCC) $(FEATURES) $(CFLAGS) $(LDFLAGS) $(addprefix ../icicle/curves/bls12_377/, $(SRCS_BLS12_377)) -o $@ + +clean: + rm -f $(TARGET_BN254) $(TARGET_BLS12_381) $(TARGET_BLS12_377) \ No newline at end of file diff --git a/goicicle/README.md b/goicicle/README.md new file mode 100644 index 000000000..a67e1baa6 --- /dev/null +++ b/goicicle/README.md @@ -0,0 +1,49 @@ +# ICICLE CUDA to Golang Binding Guide + +This guide provides instructions on how to compile CUDA code using the provided Makefile, and then how to use the resulting shared libraries to bind Golang to ICICLE's CUDA code. + +## Prerequisites + +To compile the CUDA files, you will need: + +- CUDA toolkit installed. The Makefile assumes CUDA is installed in `/usr/local/cuda`. If CUDA is installed in a different location, please adjust the `CUDA_ROOT_DIR` variable accordingly. +- A compatible GPU and corresponding driver installed on your machine. + +## Structure of the Makefile + +The Makefile is designed to compile CUDA files for three curves: BN254, BLS12_381, and BLS12_377. The source files are located in the `icicle/curves/` directory. + +## Compiling CUDA Code + +1. Navigate to the directory containing the Makefile in your terminal. +2. To compile all curve libraries, use the `make all` command. This will create three shared libraries: `libbn254.so`, `libbls12_381.so`, and `libbls12_377.so`. +3. If you want to compile a specific curve, you can do so by specifying the target. For example, to compile only the BN254 curve, use `make libbn254.so`. Replace `libbn254.so` with `libbls12_381.so` or `libbls12_377.so` to compile those curves instead. + +The resulting `.so` files are the compiled shared libraries for each curve. 
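+
+For reference, the generated Go packages under `goicicle/curves/` already carry cgo directives that point at these libraries using paths relative to the package directory. The bn254 package, for example, declares roughly the following preamble in `g1.go` (trimmed here to the relevant lines):
+
+```go
+// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bn254/
+// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254
+// #include "c_api.h"
+// #include "ve_mod_mult.h"
+import "C"
+```
+
+Note that the dynamic linker must still be able to locate `libbn254.so` (and the other curve libraries) at runtime, for example by adding the directory that contains them to `LD_LIBRARY_PATH`.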
+ +## Golang Binding + +The shared libraries produced from the CUDA code compilation are used to bind Golang to ICICLE's CUDA code. + +1. These shared libraries (`libbn254.so`, `libbls12_381.so`, `libbls12_377.so`) can be imported in your Go project to leverage the GPU accelerated functionalities provided by ICICLE. + +2. In your Go project, you can use `cgo` to link these shared libraries. Here's a basic example on how you can use `cgo` to link these libraries: + +```go +/* +#cgo LDFLAGS: -L/path/to/shared/libs -lbn254 -lbls12_381 -lbls12_377 +#include "icicle.h" // make sure you use the correct header file(s) +*/ +import "C" + +func main() { + // Now you can call the C functions from the ICICLE libraries. + // Note that C function calls are prefixed with 'C.' in Go code. +} +``` + +Replace `/path/to/shared/libs` with the actual path where the shared libraries are located on your system. + +## Cleaning up + +If you want to remove the compiled files, you can use the `make clean` command. This will remove the `libbn254.so`, `libbls12_381.so`, and `libbls12_377.so` files. diff --git a/goicicle/curves/bls12377/g1.go b/goicicle/curves/bls12377/g1.go new file mode 100644 index 000000000..45fe00ea7 --- /dev/null +++ b/goicicle/curves/bls12377/g1.go @@ -0,0 +1,505 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. 
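+
+// This file defines the Go-side mirrors of the BLS12-377 G1 types used by the
+// CUDA kernels: ScalarField and BaseField store field elements as fixed-size
+// uint32 limb arrays, and PointBLS12377 / PointAffineNoInfinityBLS12377 build
+// projective and affine points from them. The remaining helpers convert these
+// types to and from their gnark-crypto bls12-377 counterparts and forward
+// vector operations to the compiled shared library through cgo.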
+ +// Code generated by Ingonyama DO NOT EDIT + +package bls12377 + +import ( + "unsafe" + + "encoding/binary" + "fmt" + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-377" + + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" + + + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + + + +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bls12377/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_377 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +const SCALAR_SIZE = 8 +const BASE_SIZE = 12 + +type ScalarField struct { + s [SCALAR_SIZE]uint32 +} + +type BaseField struct { + s [BASE_SIZE]uint32 +} + +type Field interface { + toGnarkFr() *fr.Element +} + +/* + * Common Constrctors + */ + +func NewFieldZero[T BaseField | ScalarField]() *T { + var field T + + return &field +} + +func NewFieldFromFrGnark[T BaseField | ScalarField](element fr.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +func NewFieldFromFpGnark[T BaseField | ScalarField](element fp.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +/* + * BaseField Constrctors + */ + +func NewBaseFieldOne() *BaseField { + var s [BASE_SIZE]uint32 + + s[0] = 1 + + return &BaseField{s} +} + +func BaseFieldFromLimbs(limbs [BASE_SIZE]uint32) *BaseField { + bf := NewFieldZero[BaseField]() + copy(bf.s[:], limbs[:]) + + return bf +} + +/* + * BaseField methods + */ + +func (f *BaseField) limbs() [BASE_SIZE]uint32 { + return f.s +} + +func (f *BaseField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f *BaseField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *BaseField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * ScalarField methods + */ + +func NewScalarFieldOne() *ScalarField { + var s [SCALAR_SIZE]uint32 + + s[0] = 1 + + return &ScalarField{s} +} + +/* + * ScalarField methods + */ + +func (f *ScalarField) limbs() [SCALAR_SIZE]uint32 { + return f.s +} + +func (f *ScalarField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f ScalarField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *ScalarField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * PointBLS12377 + */ + +type PointBLS12377 struct { + x, y, z BaseField +} + +func NewPointBLS12377Zero() *PointBLS12377 { + return &PointBLS12377{ + x: *NewFieldZero[BaseField](), + y: *NewBaseFieldOne(), + z: *NewFieldZero[BaseField](), + } +} + +func (p *PointBLS12377) eq(pCompare 
*PointBLS12377) bool { + // Cast *PointBLS12377 to *C.BLS12377_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BLS12377_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BLS12377_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. + return bool(C.eq_bls12377(pC, pCompareC)) +} + +func (p *PointBLS12377) strip_z() *PointAffineNoInfinityBLS12377 { + return &PointAffineNoInfinityBLS12377{ + x: p.x, + y: p.y, + } +} + +func (p *PointBLS12377) toGnarkAffine() *bls12377.G1Affine { + px := p.x.toGnarkFp() + py := p.y.toGnarkFp() + pz := p.z.toGnarkFp() + + zInv := new(fp.Element) + x := new(fp.Element) + y := new(fp.Element) + + zInv.Inverse(pz) + + x.Mul(px, zInv) + y.Mul(py, zInv) + + return &bls12377.G1Affine{X: *x, Y: *y} +} + +func (p *PointBLS12377) ToGnarkJac() *bls12377.G1Jac { + var p1 bls12377.G1Jac + p1.FromAffine(p.toGnarkAffine()) + + return &p1 +} + +func PointBLS12377FromG1AffineGnark(gnark *bls12377.G1Affine) *PointBLS12377 { + point := PointBLS12377{ + x: *NewFieldFromFpGnark[BaseField](gnark.X), + y: *NewFieldFromFpGnark[BaseField](gnark.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +// converts jac fromat to projective +func PointBLS12377FromJacGnark(gnark *bls12377.G1Jac) *PointBLS12377 { + var pointAffine bls12377.G1Affine + pointAffine.FromJacobian(gnark) + + point := PointBLS12377{ + x: *NewFieldFromFpGnark[BaseField](pointAffine.X), + y: *NewFieldFromFpGnark[BaseField](pointAffine.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +func PointBLS12377fromLimbs(x, y, z *[]uint32) *PointBLS12377 { + return &PointBLS12377{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + z: *BaseFieldFromLimbs(getFixedLimbs(z)), + } +} + +/* + * PointAffineNoInfinityBLS12377 + */ + +type PointAffineNoInfinityBLS12377 struct { + x, y BaseField +} + +func NewPointAffineNoInfinityBLS12377Zero() *PointAffineNoInfinityBLS12377 { + return &PointAffineNoInfinityBLS12377{ + x: *NewFieldZero[BaseField](), + y: *NewFieldZero[BaseField](), + } +} + +func (p *PointAffineNoInfinityBLS12377) toProjective() *PointBLS12377 { + return &PointBLS12377{ + x: p.x, + y: p.y, + z: *NewBaseFieldOne(), + } +} + +func (p *PointAffineNoInfinityBLS12377) toGnarkAffine() *bls12377.G1Affine { + return p.toProjective().toGnarkAffine() +} + +func PointAffineNoInfinityBLS12377FromLimbs(x, y *[]uint32) *PointAffineNoInfinityBLS12377 { + return &PointAffineNoInfinityBLS12377{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + } +} + +/* + * Multiplication + */ + +func MultiplyVec(a []PointBLS12377, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + pointsC := (*C.BLS12377_projective_t)(unsafe.Pointer(&a[0])) + scalarsC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_point_bls12377(pointsC, scalarsC, nElementsC, deviceIdC) +} + +func MultiplyScalar(a []ScalarField, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + aC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&b[0])) + 
deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_scalar_bls12377(aC, bC, nElementsC, deviceIdC) +} + +// Multiply a matrix by a scalar: +// +// `a` - flattenned matrix; +// `b` - vector to multiply `a` by; +func MultiplyMatrix(a []ScalarField, b []ScalarField, deviceID int) { + c := make([]ScalarField, len(b)) + for i := range c { + c[i] = *NewFieldZero[ScalarField]() + } + + aC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&b[0])) + cC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&c[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.matrix_vec_mod_mult_bls12377(aC, bC, cC, nElementsC, deviceIdC) +} + +/* + * Utils + */ + +func getFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 { + if len(*slice) <= BASE_SIZE { + limbs := [BASE_SIZE]uint32{} + copy(limbs[:len(*slice)], *slice) + return limbs + } + + panic("slice has too many elements") +} + +func BatchConvertFromFrGnark[T BaseField | ScalarField](elements []fr.Element) []T { + var newElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertFromFrGnarkThreaded[T BaseField | ScalarField](elements []fr.Element, routines int) []T { + var newElements []T + + if routines > 1 { + channels := make([]chan []T, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []T, 1) + } + + convert := func(elements []fr.Element, chanIndex int) { + var convertedElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) + } + } else { + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertToFrGnark[T Field](elements []T) []fr.Element { + var newElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertToFrGnarkThreaded[T Field](elements []T, routines int) []fr.Element { + var newElements []fr.Element + + if routines > 1 { + channels := make([]chan []fr.Element, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []fr.Element, 1) + } + + convert := func(elements []T, chanIndex int) { + var convertedElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) 
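+ // Channels are drained in goroutine index order, so the converted
+ // elements come back in the same order as the input slice.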
+ } + } else { + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertFromG1Affine(elements []bls12377.G1Affine) []PointAffineNoInfinityBLS12377 { + var newElements []PointAffineNoInfinityBLS12377 + for _, e := range elements { + newElement := PointBLS12377FromG1AffineGnark(&e).strip_z() + newElements = append(newElements, *newElement) + } + return newElements +} diff --git a/goicicle/curves/bls12377/g2.go b/goicicle/curves/bls12377/g2.go new file mode 100644 index 000000000..7541f9899 --- /dev/null +++ b/goicicle/curves/bls12377/g2.go @@ -0,0 +1,109 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12377 + +import ( + "unsafe" + + + + "github.com/consensys/gnark-crypto/ecc/bls12-377" + + + +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bls12377/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_377 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +func BatchConvertFromG2Affine(elements []bls12377.G2Affine) []G2PointAffine { + var newElements []G2PointAffine + for _, gg2Affine := range elements { + var newElement G2PointAffine + newElement.FromGnarkAffine(&gg2Affine) + + newElements = append(newElements, newElement) + } + return newElements +} + +// G2 extension field + +type G2Element [4]uint64 + +type ExtentionField struct { + A0, A1 G2Element +} + +type G2PointAffine struct { + x, y ExtentionField +} + +type G2Point struct { + x, y, z ExtentionField +} + +func (p *G2Point) eqg2(pCompare *G2Point) bool { + // Cast *PointBLS12377 to *C.BLS12377_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BLS12377_g2_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BLS12377_g2_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. 
+ return bool(C.eq_g2_bls12377(pC, pCompareC)) +} + +func (p *G2PointAffine) ToProjective() G2Point { + return G2Point{ + x: p.x, + y: p.y, + z: ExtentionField{ + A0: G2Element{1, 0, 0, 0}, + A1: G2Element{0, 0, 0, 0}, + }, + } +} + +func (g *G2PointAffine) FromGnarkAffine(gnark *bls12377.G2Affine) *G2PointAffine { + // Bits() returns non montgomery format + g.x.A0 = gnark.X.A0.Bits() + g.x.A1 = gnark.X.A1.Bits() + g.y.A0 = gnark.Y.A0.Bits() + g.y.A1 = gnark.Y.A1.Bits() + + return g +} + +func (g *G2PointAffine) FromGnarkJac(gnark *bls12377.G2Jac) *G2PointAffine { + var pointAffine bls12377.G2Affine + pointAffine.FromJacobian(gnark) + + // Bits() returns non montgomery format + g.x.A0 = pointAffine.X.A0.Bits() + g.x.A1 = pointAffine.X.A1.Bits() + g.y.A0 = pointAffine.Y.A0.Bits() + g.y.A1 = pointAffine.Y.A1.Bits() + + return g +} diff --git a/goicicle/curves/bls12377/msm.go b/goicicle/curves/bls12377/msm.go new file mode 100644 index 000000000..4f476e13b --- /dev/null +++ b/goicicle/curves/bls12377/msm.go @@ -0,0 +1,90 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12377 + +import ( + "errors" + "fmt" + "unsafe" +) + +// #cgo CFLAGS: -I../../../icicle/curves/bls12_377 +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_377 +// #include "msm.h" +import "C" + +func MsmBLS12377(points []PointAffineNoInfinityBLS12377, scalars []ScalarField, device_id int) (*PointBLS12377, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + out := new(PointBLS12377) + + pointsC := (*C.BLS12377_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.BLS12377_projective_t)(unsafe.Pointer(out)) + + ret := C.msm_cuda_bls12_377(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_cuda_bls12_377 returned error code: %d", ret) + } + + return out, nil +} + +func MsmBatchBLS12377(points *[]PointAffineNoInfinityBLS12377, scalars *[]ScalarField, batchSize, deviceId int) ([]*PointBLS12377, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]*PointBLS12377, batchSize) + + for i := 0; i < len(out); i++ { + out[i] = NewPointBLS12377Zero() + } + + outC := (*C.BLS12377_projective_t)(unsafe.Pointer(&out[0])) + pointsC := (*C.BLS12377_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := 
C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := C.size_t(batchSize) + + ret := C.msm_batch_cuda_bls12_377(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_cuda_bls12_377 returned error code: %d", ret) + } + + return out, nil +} diff --git a/goicicle/curves/bls12377/ntt.go b/goicicle/curves/bls12377/ntt.go new file mode 100644 index 000000000..af10045b1 --- /dev/null +++ b/goicicle/curves/bls12377/ntt.go @@ -0,0 +1,73 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12377 + + +// #cgo CFLAGS: -I../../../icicle/curves/bls12_377/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_377 +// #include "ntt.h" +import "C" +import "unsafe" + +const ( + NONE = 0 + DIF = 1 + DIT = 2 +) + +func NttBLS12377(scalars *[]ScalarField, isInverse bool, decimation int, deviceId int) uint64 { + scalarsC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + + ret := C.ntt_cuda_bls12_377(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(decimation), C.size_t(deviceId)) + + return uint64(ret) +} + +func NttBatchBLS12377(scalars *[]ScalarField, isInverse bool, batchSize, deviceId int) uint64 { + scalarsC := (*C.BLS12377_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + isInverseC := C.bool(isInverse) + batchSizeC := C.uint32_t(batchSize) + deviceIdC := C.size_t(deviceId) + + ret := C.ntt_batch_cuda_bls12_377(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBLS12377(values *[]PointBLS12377, isInverse bool, deviceId int) uint64 { + valuesC := (*C.BLS12377_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + + ret := C.ecntt_cuda_bls12_377(valuesC, n, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBatchBLS12377(values *[]PointBLS12377, isInverse bool, batchSize, deviceId int) uint64 { + valuesC := (*C.BLS12377_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + batchSizeC := C.uint32_t(batchSize) + + ret := C.ecntt_batch_cuda_bls12_377(valuesC, n, batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} diff --git a/goicicle/curves/bls12377/utils.go b/goicicle/curves/bls12377/utils.go new file mode 100644 index 000000000..49cf4effe --- /dev/null +++ b/goicicle/curves/bls12377/utils.go @@ -0,0 +1,25 @@ +package bls12377 + +import "encoding/binary" + +// Function to convert [8]uint32 to [4]uint64 +func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 { + var arr64 [4]uint64 + for i := 0; i < len(arr32); i += 2 { + arr64[i/2] = (uint64(arr32[i]) << 32) | uint64(arr32[i+1]) + } + return arr64 +} + +func ConvertUint64ArrToUint32Arr(arr64 [4]uint64) [8]uint32 { + var arr32 [8]uint32 + for i, v := range arr64 { + b := make([]byte, 8) + 
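+ // Serialize the 64-bit limb as little-endian bytes, then split it into
+ // its low and high 32-bit words (low word stored at the even index).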
binary.LittleEndian.PutUint64(b, v) + + arr32[i*2] = binary.LittleEndian.Uint32(b[0:4]) + arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8]) + } + + return arr32 +} diff --git a/goicicle/curves/bls12377/utils_test.go b/goicicle/curves/bls12377/utils_test.go new file mode 100644 index 000000000..c18ddbf26 --- /dev/null +++ b/goicicle/curves/bls12377/utils_test.go @@ -0,0 +1,81 @@ +package bls12377 + +import ( + "testing" +) + +func TestConvertUint32ArrToUint64Arr(t *testing.T) { + testCases := []struct { + name string + input [8]uint32 + want [4]uint64 + }{ + { + name: "Test with incremental array", + input: [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}, + want: [4]uint64{4294967298, 12884901892, 21474836486, 30064771080}, + }, + { + name: "Test with all zeros", + input: [8]uint32{0, 0, 0, 0, 0, 0, 0, 0}, + want: [4]uint64{0, 0, 0, 0}, + }, + { + name: "Test with maximum uint32 values", + input: [8]uint32{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295}, + want: [4]uint64{18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615}, + }, + { + name: "Test with alternating min and max uint32 values", + input: [8]uint32{0, 4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295}, + want: [4]uint64{4294967295, 4294967295, 4294967295, 4294967295}, + }, + { + name: "Test with alternating max and min uint32 values", + input: [8]uint32{4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295, 0}, + want: [4]uint64{18446744069414584320, 18446744069414584320, 18446744069414584320, 18446744069414584320}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint32ArrToUint64Arr(tc.input) + if got != tc.want { + t.Errorf("got %v, want %v", got, tc.want) + } + }) + } +} + +func TestConvertUint64ArrToUint32Arr(t *testing.T) { + testCases := []struct { + name string + input [4]uint64 + expected [8]uint32 + }{ + { + name: "test one", + input: [4]uint64{1, 2, 3, 4}, + expected: [8]uint32{1, 0, 2, 0, 3, 0, 4, 0}, + }, + { + name: "test two", + input: [4]uint64{100, 200, 300, 400}, + expected: [8]uint32{100, 0, 200, 0, 300, 0, 400, 0}, + }, + { + name: "test three", + input: [4]uint64{1000, 2000, 3000, 4000}, + expected: [8]uint32{1000, 0, 2000, 0, 3000, 0, 4000, 0}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint64ArrToUint32Arr(tc.input) + if got != tc.expected { + t.Errorf("got %v, want %v", got, tc.expected) + } + }) + } +} diff --git a/goicicle/curves/bls12381/g1.go b/goicicle/curves/bls12381/g1.go new file mode 100644 index 000000000..f2a159655 --- /dev/null +++ b/goicicle/curves/bls12381/g1.go @@ -0,0 +1,505 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. 
+ +// Code generated by Ingonyama DO NOT EDIT + +package bls12381 + +import ( + "unsafe" + + "encoding/binary" + "fmt" + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-381" + + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" + + + + + + + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" + + + +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bls12381/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_381 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +const SCALAR_SIZE = 8 +const BASE_SIZE = 12 + +type ScalarField struct { + s [SCALAR_SIZE]uint32 +} + +type BaseField struct { + s [BASE_SIZE]uint32 +} + +type Field interface { + toGnarkFr() *fr.Element +} + +/* + * Common Constrctors + */ + +func NewFieldZero[T BaseField | ScalarField]() *T { + var field T + + return &field +} + +func NewFieldFromFrGnark[T BaseField | ScalarField](element fr.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +func NewFieldFromFpGnark[T BaseField | ScalarField](element fp.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +/* + * BaseField Constrctors + */ + +func NewBaseFieldOne() *BaseField { + var s [BASE_SIZE]uint32 + + s[0] = 1 + + return &BaseField{s} +} + +func BaseFieldFromLimbs(limbs [BASE_SIZE]uint32) *BaseField { + bf := NewFieldZero[BaseField]() + copy(bf.s[:], limbs[:]) + + return bf +} + +/* + * BaseField methods + */ + +func (f *BaseField) limbs() [BASE_SIZE]uint32 { + return f.s +} + +func (f *BaseField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f *BaseField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *BaseField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * ScalarField methods + */ + +func NewScalarFieldOne() *ScalarField { + var s [SCALAR_SIZE]uint32 + + s[0] = 1 + + return &ScalarField{s} +} + +/* + * ScalarField methods + */ + +func (f *ScalarField) limbs() [SCALAR_SIZE]uint32 { + return f.s +} + +func (f *ScalarField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f ScalarField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *ScalarField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * PointBLS12381 + */ + +type PointBLS12381 struct { + x, y, z BaseField +} + +func NewPointBLS12381Zero() *PointBLS12381 { + return &PointBLS12381{ + x: *NewFieldZero[BaseField](), + y: *NewBaseFieldOne(), + z: *NewFieldZero[BaseField](), + } +} + +func (p *PointBLS12381) eq(pCompare 
*PointBLS12381) bool { + // Cast *PointBLS12381 to *C.BLS12381_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BLS12381_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BLS12381_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. + return bool(C.eq_bls12381(pC, pCompareC)) +} + +func (p *PointBLS12381) strip_z() *PointAffineNoInfinityBLS12381 { + return &PointAffineNoInfinityBLS12381{ + x: p.x, + y: p.y, + } +} + +func (p *PointBLS12381) toGnarkAffine() *bls12381.G1Affine { + px := p.x.toGnarkFp() + py := p.y.toGnarkFp() + pz := p.z.toGnarkFp() + + zInv := new(fp.Element) + x := new(fp.Element) + y := new(fp.Element) + + zInv.Inverse(pz) + + x.Mul(px, zInv) + y.Mul(py, zInv) + + return &bls12381.G1Affine{X: *x, Y: *y} +} + +func (p *PointBLS12381) ToGnarkJac() *bls12381.G1Jac { + var p1 bls12381.G1Jac + p1.FromAffine(p.toGnarkAffine()) + + return &p1 +} + +func PointBLS12381FromG1AffineGnark(gnark *bls12381.G1Affine) *PointBLS12381 { + point := PointBLS12381{ + x: *NewFieldFromFpGnark[BaseField](gnark.X), + y: *NewFieldFromFpGnark[BaseField](gnark.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +// converts jac fromat to projective +func PointBLS12381FromJacGnark(gnark *bls12381.G1Jac) *PointBLS12381 { + var pointAffine bls12381.G1Affine + pointAffine.FromJacobian(gnark) + + point := PointBLS12381{ + x: *NewFieldFromFpGnark[BaseField](pointAffine.X), + y: *NewFieldFromFpGnark[BaseField](pointAffine.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +func PointBLS12381fromLimbs(x, y, z *[]uint32) *PointBLS12381 { + return &PointBLS12381{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + z: *BaseFieldFromLimbs(getFixedLimbs(z)), + } +} + +/* + * PointAffineNoInfinityBLS12381 + */ + +type PointAffineNoInfinityBLS12381 struct { + x, y BaseField +} + +func NewPointAffineNoInfinityBLS12381Zero() *PointAffineNoInfinityBLS12381 { + return &PointAffineNoInfinityBLS12381{ + x: *NewFieldZero[BaseField](), + y: *NewFieldZero[BaseField](), + } +} + +func (p *PointAffineNoInfinityBLS12381) toProjective() *PointBLS12381 { + return &PointBLS12381{ + x: p.x, + y: p.y, + z: *NewBaseFieldOne(), + } +} + +func (p *PointAffineNoInfinityBLS12381) toGnarkAffine() *bls12381.G1Affine { + return p.toProjective().toGnarkAffine() +} + +func PointAffineNoInfinityBLS12381FromLimbs(x, y *[]uint32) *PointAffineNoInfinityBLS12381 { + return &PointAffineNoInfinityBLS12381{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + } +} + +/* + * Multiplication + */ + +func MultiplyVec(a []PointBLS12381, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + pointsC := (*C.BLS12381_projective_t)(unsafe.Pointer(&a[0])) + scalarsC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_point_bls12381(pointsC, scalarsC, nElementsC, deviceIdC) +} + +func MultiplyScalar(a []ScalarField, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + aC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&b[0])) + 
deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_scalar_bls12381(aC, bC, nElementsC, deviceIdC) +} + +// Multiply a matrix by a scalar: +// +// `a` - flattenned matrix; +// `b` - vector to multiply `a` by; +func MultiplyMatrix(a []ScalarField, b []ScalarField, deviceID int) { + c := make([]ScalarField, len(b)) + for i := range c { + c[i] = *NewFieldZero[ScalarField]() + } + + aC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&b[0])) + cC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&c[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.matrix_vec_mod_mult_bls12381(aC, bC, cC, nElementsC, deviceIdC) +} + +/* + * Utils + */ + +func getFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 { + if len(*slice) <= BASE_SIZE { + limbs := [BASE_SIZE]uint32{} + copy(limbs[:len(*slice)], *slice) + return limbs + } + + panic("slice has too many elements") +} + +func BatchConvertFromFrGnark[T BaseField | ScalarField](elements []fr.Element) []T { + var newElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertFromFrGnarkThreaded[T BaseField | ScalarField](elements []fr.Element, routines int) []T { + var newElements []T + + if routines > 1 { + channels := make([]chan []T, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []T, 1) + } + + convert := func(elements []fr.Element, chanIndex int) { + var convertedElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) + } + } else { + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertToFrGnark[T Field](elements []T) []fr.Element { + var newElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertToFrGnarkThreaded[T Field](elements []T, routines int) []fr.Element { + var newElements []fr.Element + + if routines > 1 { + channels := make([]chan []fr.Element, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []fr.Element, 1) + } + + convert := func(elements []T, chanIndex int) { + var convertedElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) 
+ } + } else { + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertFromG1Affine(elements []bls12381.G1Affine) []PointAffineNoInfinityBLS12381 { + var newElements []PointAffineNoInfinityBLS12381 + for _, e := range elements { + newElement := PointBLS12381FromG1AffineGnark(&e).strip_z() + newElements = append(newElements, *newElement) + } + return newElements +} diff --git a/goicicle/curves/bls12381/g2.go b/goicicle/curves/bls12381/g2.go new file mode 100644 index 000000000..1b15713f2 --- /dev/null +++ b/goicicle/curves/bls12381/g2.go @@ -0,0 +1,108 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12381 + +import ( + "unsafe" + + + + "github.com/consensys/gnark-crypto/ecc/bls12-381" + + + +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bls12381/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_381 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +func BatchConvertFromG2Affine(elements []bls12381.G2Affine) []G2PointAffine { + var newElements []G2PointAffine + for _, gg2Affine := range elements { + var newElement G2PointAffine + newElement.FromGnarkAffine(&gg2Affine) + + newElements = append(newElements, newElement) + } + return newElements +} + +// G2 extension field + +type G2Element [4]uint64 + +type ExtentionField struct { + A0, A1 G2Element +} + +type G2PointAffine struct { + x, y ExtentionField +} + +type G2Point struct { + x, y, z ExtentionField +} + +func (p *G2Point) eqg2(pCompare *G2Point) bool { + // Cast *PointBLS12381 to *C.BLS12381_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BLS12381_g2_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BLS12381_g2_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. 
+ return bool(C.eq_g2_bls12381(pC, pCompareC)) +} + +func (p *G2PointAffine) ToProjective() G2Point { + return G2Point{ + x: p.x, + y: p.y, + z: ExtentionField{ + A0: G2Element{1, 0, 0, 0}, + A1: G2Element{0, 0, 0, 0}, + }, + } +} + +func (g *G2PointAffine) FromGnarkAffine(gnark *bls12381.G2Affine) *G2PointAffine { + g.x.A0 = gnark.X.A0.Bits() + g.x.A1 = gnark.X.A1.Bits() + g.y.A0 = gnark.Y.A0.Bits() + g.y.A1 = gnark.Y.A1.Bits() + + return g +} + +func (g *G2PointAffine) FromGnarkJac(gnark *bls12381.G2Jac) *G2PointAffine { + var pointAffine bls12381.G2Affine + pointAffine.FromJacobian(gnark) + + g.x.A0 = pointAffine.X.A0.Bits() + g.x.A1 = pointAffine.X.A1.Bits() + g.y.A0 = pointAffine.Y.A0.Bits() + g.y.A1 = pointAffine.Y.A1.Bits() + + return g +} diff --git a/goicicle/curves/bls12381/msm.go b/goicicle/curves/bls12381/msm.go new file mode 100644 index 000000000..da98ecb79 --- /dev/null +++ b/goicicle/curves/bls12381/msm.go @@ -0,0 +1,90 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12381 + +import ( + "errors" + "fmt" + "unsafe" +) + +// #cgo CFLAGS: -I../../../icicle/curves/bls12_381/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_381 +// #include "msm.h" +import "C" + +func MsmBLS12381(points []PointAffineNoInfinityBLS12381, scalars []ScalarField, device_id int) (*PointBLS12381, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + out := new(PointBLS12381) + + pointsC := (*C.BLS12381_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.BLS12381_projective_t)(unsafe.Pointer(out)) + + ret := C.msm_cuda_bls12_381(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_cuda_bls12_381 returned error code: %d", ret) + } + + return out, nil +} + +func MsmBatchBLS12381(points *[]PointAffineNoInfinityBLS12381, scalars *[]ScalarField, batchSize, deviceId int) ([]*PointBLS12381, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]*PointBLS12381, batchSize) + + for i := 0; i < len(out); i++ { + out[i] = NewPointBLS12381Zero() + } + + outC := (*C.BLS12381_projective_t)(unsafe.Pointer(&out[0])) + pointsC := (*C.BLS12381_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := 
C.size_t(batchSize) + + ret := C.msm_batch_cuda_bls12_381(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_cuda_bls12_381 returned error code: %d", ret) + } + + return out, nil +} diff --git a/goicicle/curves/bls12381/ntt.go b/goicicle/curves/bls12381/ntt.go new file mode 100644 index 000000000..ab3107cf8 --- /dev/null +++ b/goicicle/curves/bls12381/ntt.go @@ -0,0 +1,73 @@ + + // Copyright 2023 Ingonyama + // + // Licensed under the Apache License, Version 2.0 (the "License"); + // you may not use this file except in compliance with the License. + // You may obtain a copy of the License at + // + // http://www.apache.org/licenses/LICENSE-2.0 + // + // Unless required by applicable law or agreed to in writing, software + // distributed under the License is distributed on an "AS IS" BASIS, + // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + // See the License for the specific language governing permissions and + // limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bls12381 + + +// #cgo CFLAGS: -I../../../icicle/curves/bls12_381 +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn12_381 +// #include "ntt.h" +import "C" +import "unsafe" + +const ( + NONE = 0 + DIF = 1 + DIT = 2 +) + +func NttBLS12381(scalars *[]ScalarField, isInverse bool, decimation int, deviceId int) uint64 { + scalarsC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + + ret := C.ntt_cuda_bls12_381(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(decimation), C.size_t(deviceId)) + + return uint64(ret) +} + +func NttBatchBLS12381(scalars *[]ScalarField, isInverse bool, batchSize, deviceId int) uint64 { + scalarsC := (*C.BLS12381_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + isInverseC := C.bool(isInverse) + batchSizeC := C.uint32_t(batchSize) + deviceIdC := C.size_t(deviceId) + + ret := C.ntt_batch_cuda_bls12_381(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBLS12381(values *[]PointBLS12381, isInverse bool, deviceId int) uint64 { + valuesC := (*C.BLS12381_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + + ret := C.ecntt_cuda_bls12_381(valuesC, n, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBatchBLS12381(values *[]PointBLS12381, isInverse bool, batchSize, deviceId int) uint64 { + valuesC := (*C.BLS12381_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + batchSizeC := C.uint32_t(batchSize) + + ret := C.ecntt_batch_cuda_bls12_381(valuesC, n, batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} diff --git a/goicicle/curves/bls12381/utils.go b/goicicle/curves/bls12381/utils.go new file mode 100644 index 000000000..d24af1af8 --- /dev/null +++ b/goicicle/curves/bls12381/utils.go @@ -0,0 +1,25 @@ +package bls12381 + +import "encoding/binary" + +// Function to convert [8]uint32 to [4]uint64 +func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 { + var arr64 [4]uint64 + for i := 0; i < len(arr32); i += 2 { + arr64[i/2] = (uint64(arr32[i]) << 32) | uint64(arr32[i+1]) + } + return arr64 +} + +func ConvertUint64ArrToUint32Arr(arr64 [4]uint64) [8]uint32 { + var arr32 [8]uint32 + for i, v := range arr64 { + b := make([]byte, 8) + binary.LittleEndian.PutUint64(b, v) + + arr32[i*2] = binary.LittleEndian.Uint32(b[0:4]) 
+ arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8]) + } + + return arr32 +} diff --git a/goicicle/curves/bls12381/utils_test.go b/goicicle/curves/bls12381/utils_test.go new file mode 100644 index 000000000..762405800 --- /dev/null +++ b/goicicle/curves/bls12381/utils_test.go @@ -0,0 +1,81 @@ +package bls12381 + +import ( + "testing" +) + +func TestConvertUint32ArrToUint64Arr(t *testing.T) { + testCases := []struct { + name string + input [8]uint32 + want [4]uint64 + }{ + { + name: "Test with incremental array", + input: [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}, + want: [4]uint64{4294967298, 12884901892, 21474836486, 30064771080}, + }, + { + name: "Test with all zeros", + input: [8]uint32{0, 0, 0, 0, 0, 0, 0, 0}, + want: [4]uint64{0, 0, 0, 0}, + }, + { + name: "Test with maximum uint32 values", + input: [8]uint32{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295}, + want: [4]uint64{18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615}, + }, + { + name: "Test with alternating min and max uint32 values", + input: [8]uint32{0, 4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295}, + want: [4]uint64{4294967295, 4294967295, 4294967295, 4294967295}, + }, + { + name: "Test with alternating max and min uint32 values", + input: [8]uint32{4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295, 0}, + want: [4]uint64{18446744069414584320, 18446744069414584320, 18446744069414584320, 18446744069414584320}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint32ArrToUint64Arr(tc.input) + if got != tc.want { + t.Errorf("got %v, want %v", got, tc.want) + } + }) + } +} + +func TestConvertUint64ArrToUint32Arr(t *testing.T) { + testCases := []struct { + name string + input [4]uint64 + expected [8]uint32 + }{ + { + name: "test one", + input: [4]uint64{1, 2, 3, 4}, + expected: [8]uint32{1, 0, 2, 0, 3, 0, 4, 0}, + }, + { + name: "test two", + input: [4]uint64{100, 200, 300, 400}, + expected: [8]uint32{100, 0, 200, 0, 300, 0, 400, 0}, + }, + { + name: "test three", + input: [4]uint64{1000, 2000, 3000, 4000}, + expected: [8]uint32{1000, 0, 2000, 0, 3000, 0, 4000, 0}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint64ArrToUint32Arr(tc.input) + if got != tc.expected { + t.Errorf("got %v, want %v", got, tc.expected) + } + }) + } +} diff --git a/goicicle/curves/bn254/g1.go b/goicicle/curves/bn254/g1.go new file mode 100644 index 000000000..b5c560db1 --- /dev/null +++ b/goicicle/curves/bn254/g1.go @@ -0,0 +1,503 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
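+
+// Illustrative usage sketch (added documentation, not part of the generated
+// bindings): converting gnark-crypto values into the limb representation the
+// CUDA kernels consume, using helpers defined in this file. bn254 below refers
+// to gnark-crypto's bn254 package, as imported by this file.
+//
+//	var s fr.Element
+//	s.SetUint64(42)
+//
+//	// gnark fr.Element -> ICICLE ScalarField (non-Montgomery limbs)
+//	scalar := NewFieldFromFrGnark[ScalarField](s)
+//
+//	// gnark G1 affine generator -> ICICLE projective point (Z = 1)
+//	_, _, g1Aff, _ := bn254.Generators()
+//	point := PointBN254FromG1AffineGnark(&g1Aff)
+//
+//	// ...and back to a gnark Jacobian point.
+//	jac := point.ToGnarkJac()
+//	_, _ = scalar, jac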
+ +// Code generated by Ingonyama DO NOT EDIT + +package bn254 + +import ( + "unsafe" + + "encoding/binary" + "fmt" + + "github.com/consensys/gnark-crypto/ecc/bn254" + + "github.com/consensys/gnark-crypto/ecc/bn254/fp" + + "github.com/consensys/gnark-crypto/ecc/bn254/fr" +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bn254/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +const SCALAR_SIZE = 8 +const BASE_SIZE = 8 + +type ScalarField struct { + s [SCALAR_SIZE]uint32 +} + +type BaseField struct { + s [BASE_SIZE]uint32 +} + +type Field interface { + toGnarkFr() *fr.Element +} + +/* + * Common Constrctors + */ + +func NewFieldZero[T BaseField | ScalarField]() *T { + var field T + + return &field +} + +func NewFieldFromFrGnark[T BaseField | ScalarField](element fr.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +func NewFieldFromFpGnark[T BaseField | ScalarField](element fp.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +/* + * BaseField Constrctors + */ + +func NewBaseFieldOne() *BaseField { + var s [BASE_SIZE]uint32 + + s[0] = 1 + + return &BaseField{s} +} + +func BaseFieldFromLimbs(limbs [BASE_SIZE]uint32) *BaseField { + bf := NewFieldZero[BaseField]() + copy(bf.s[:], limbs[:]) + + return bf +} + +/* + * BaseField methods + */ + +func (f *BaseField) limbs() [BASE_SIZE]uint32 { + return f.s +} + +func (f *BaseField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f *BaseField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *BaseField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * ScalarField methods + */ + +func NewScalarFieldOne() *ScalarField { + var s [SCALAR_SIZE]uint32 + + s[0] = 1 + + return &ScalarField{s} +} + +func (a *ScalarField) Equals(b *ScalarField) bool { + for i, v := range a.s { + if b.s[i] != v { + return false + } + } + return true +} + +/* + * ScalarField methods + */ + +func (f *ScalarField) limbs() [SCALAR_SIZE]uint32 { + return f.s +} + +func (f *ScalarField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f ScalarField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *ScalarField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * PointBN254 + */ + +type PointBN254 struct { + x, y, z BaseField +} + +func NewPointBN254Zero() *PointBN254 { + return &PointBN254{ + x: *NewFieldZero[BaseField](), + y: *NewBaseFieldOne(), + z: 
*NewFieldZero[BaseField](), + } +} + +func (p *PointBN254) eq(pCompare *PointBN254) bool { + // Cast *PointBN254 to *C.BN254_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BN254_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BN254_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. + return bool(C.eq_bn254(pC, pCompareC)) +} + +func (p *PointBN254) strip_z() *PointAffineNoInfinityBN254 { + return &PointAffineNoInfinityBN254{ + x: p.x, + y: p.y, + } +} + +func (p *PointBN254) toGnarkAffine() *bn254.G1Affine { + px := p.x.toGnarkFp() + py := p.y.toGnarkFp() + pz := p.z.toGnarkFp() + + zInv := new(fp.Element) + x := new(fp.Element) + y := new(fp.Element) + + zInv.Inverse(pz) + + x.Mul(px, zInv) + y.Mul(py, zInv) + + return &bn254.G1Affine{X: *x, Y: *y} +} + +func (p *PointBN254) ToGnarkJac() *bn254.G1Jac { + var p1 bn254.G1Jac + p1.FromAffine(p.toGnarkAffine()) + + return &p1 +} + +func PointBN254FromG1AffineGnark(gnark *bn254.G1Affine) *PointBN254 { + point := PointBN254{ + x: *NewFieldFromFpGnark[BaseField](gnark.X), + y: *NewFieldFromFpGnark[BaseField](gnark.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +// converts jac fromat to projective +func PointBN254FromJacGnark(gnark *bn254.G1Jac) *PointBN254 { + var pointAffine bn254.G1Affine + pointAffine.FromJacobian(gnark) + + point := PointBN254{ + x: *NewFieldFromFpGnark[BaseField](pointAffine.X), + y: *NewFieldFromFpGnark[BaseField](pointAffine.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +func PointBN254fromLimbs(x, y, z *[]uint32) *PointBN254 { + return &PointBN254{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + z: *BaseFieldFromLimbs(getFixedLimbs(z)), + } +} + +/* + * PointAffineNoInfinityBN254 + */ + +type PointAffineNoInfinityBN254 struct { + x, y BaseField +} + +func NewPointAffineNoInfinityBN254Zero() *PointAffineNoInfinityBN254 { + return &PointAffineNoInfinityBN254{ + x: *NewFieldZero[BaseField](), + y: *NewFieldZero[BaseField](), + } +} + +func (p *PointAffineNoInfinityBN254) toProjective() *PointBN254 { + return &PointBN254{ + x: p.x, + y: p.y, + z: *NewBaseFieldOne(), + } +} + +func (p *PointAffineNoInfinityBN254) toGnarkAffine() *bn254.G1Affine { + return p.toProjective().toGnarkAffine() +} + +func PointAffineNoInfinityBN254FromLimbs(x, y *[]uint32) *PointAffineNoInfinityBN254 { + return &PointAffineNoInfinityBN254{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + } +} + +/* + * Multiplication + */ + +func MultiplyVec(a []PointBN254, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + pointsC := (*C.BN254_projective_t)(unsafe.Pointer(&a[0])) + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_point_bn254(pointsC, scalarsC, nElementsC, deviceIdC) +} + +func MultiplyScalar(a []ScalarField, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + aC := (*C.BN254_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + 
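+	// vec_mod_mult_scalar_bn254 (declared in ve_mod_mult.h) is expected to multiply
+	// a[i] by b[i] modulo the BN254 scalar field, presumably writing the result back
+	// into a in place on the device identified by deviceID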
C.vec_mod_mult_scalar_bn254(aC, bC, nElementsC, deviceIdC) +} + +// Multiply a matrix by a scalar: +// +// `a` - flattenned matrix; +// `b` - vector to multiply `a` by; +func MultiplyMatrix(a []ScalarField, b []ScalarField, deviceID int) { + c := make([]ScalarField, len(b)) + for i := range c { + c[i] = *NewFieldZero[ScalarField]() + } + + aC := (*C.BN254_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.BN254_scalar_t)(unsafe.Pointer(&b[0])) + cC := (*C.BN254_scalar_t)(unsafe.Pointer(&c[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.matrix_vec_mod_mult_bn254(aC, bC, cC, nElementsC, deviceIdC) +} + +/* + * Utils + */ + +func getFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 { + if len(*slice) <= BASE_SIZE { + limbs := [BASE_SIZE]uint32{} + copy(limbs[:len(*slice)], *slice) + return limbs + } + + panic("slice has too many elements") +} + +func BatchConvertFromFrGnark[T BaseField | ScalarField](elements []fr.Element) []T { + var newElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertFromFrGnarkThreaded[T BaseField | ScalarField](elements []fr.Element, routines int) []T { + var newElements []T + + if routines > 1 && routines <= len(elements) { + channels := make([]chan []T, routines) + for i := 0; i < routines; i++ { + channels[i] = make(chan []T, 1) + } + + convert := func(elements []fr.Element, chanIndex int) { + var convertedElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements) / routines + for i := 0; i < routines; i++ { + start := batchLen * i + end := batchLen * (i + 1) + elemsToConv := elements[start:end] + if i == routines-1 { + elemsToConv = elements[start:] + } + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i++ { + newElements = append(newElements, <-channels[i]...) + } + } else { + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertToFrGnark[T Field](elements []T) []fr.Element { + var newElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertToFrGnarkThreaded[T Field](elements []T, routines int) []fr.Element { + var newElements []fr.Element + + if routines > 1 { + channels := make([]chan []fr.Element, routines) + for i := 0; i < routines; i++ { + channels[i] = make(chan []fr.Element, 1) + } + + convert := func(elements []T, chanIndex int) { + var convertedElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements) / routines + for i := 0; i < routines; i++ { + elemsToConv := elements[batchLen*i : batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i++ { + newElements = append(newElements, <-channels[i]...) 
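+			// channels are drained in index order, so the converted output keeps the
+			// same ordering as the input slice; note that, unlike
+			// BatchConvertFromFrGnarkThreaded, the last batch here does not absorb a
+			// remainder, so len(elements) is assumed to be divisible by routines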
+ } + } else { + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertFromG1Affine(elements []bn254.G1Affine) []PointAffineNoInfinityBN254 { + var newElements []PointAffineNoInfinityBN254 + for _, e := range elements { + newElement := PointBN254FromG1AffineGnark(&e).strip_z() + newElements = append(newElements, *newElement) + } + return newElements +} diff --git a/goicicle/curves/bn254/g1_test.go b/goicicle/curves/bn254/g1_test.go new file mode 100644 index 000000000..ba416abbd --- /dev/null +++ b/goicicle/curves/bn254/g1_test.go @@ -0,0 +1,229 @@ +package bn254 + +import ( + "encoding/binary" + "fmt" + "testing" + + "github.com/consensys/gnark-crypto/ecc/bn254" + "github.com/consensys/gnark-crypto/ecc/bn254/fp" + "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/stretchr/testify/assert" +) + +func TestNewFieldBN254One(t *testing.T) { + oneField := NewBaseFieldOne() + rawOneField := [8]uint32([8]uint32{0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}) + + assert.Equal(t, oneField.s, rawOneField) +} + +func TestNewFieldBN254Zero(t *testing.T) { + zeroField := NewFieldZero[BaseField]() + rawZeroField := [8]uint32([8]uint32{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}) + + assert.Equal(t, zeroField.s, rawZeroField) +} + +func TestFieldBN254FromGnark(t *testing.T) { + var rand fr.Element + rand.SetRandom() + + f := NewFieldFromFrGnark[ScalarField](rand) + + assert.Equal(t, f.s, ConvertUint64ArrToUint32Arr(rand.Bits())) +} + +func BenchmarkBatchConvertFromFrGnarkThreaded(b *testing.B) { + // ROUTINES := []int{4,5,6,7,8} + + // for _, routineAmount := range ROUTINES { + routineAmount := 7 + _, scalars_fr := GenerateScalars(1 << 24) + b.Run(fmt.Sprintf("Convert %d", routineAmount), func(b *testing.B) { + for n := 0; n < b.N; n++ { + _ = BatchConvertFromFrGnarkThreaded[ScalarField](scalars_fr, routineAmount) + } + }) + // } +} + +func BenchmarkBatchConvertFromFrGnark(b *testing.B) { + _, scalars_fr := GenerateScalars(1 << 24) + b.Run("BatchConvert 2^24", func(b *testing.B) { + for n := 0; n < b.N; n++ { + _ = BatchConvertFromFrGnark[ScalarField](scalars_fr) + } + }) +} + +func TestFieldBN254ToBytesLe(t *testing.T) { + var rand fr.Element + rand.SetRandom() + + f := NewFieldFromFrGnark[ScalarField](rand) + + expected := make([]byte, len(f.s)*4) // each uint32 takes 4 bytes + for i, v := range f.s { + binary.LittleEndian.PutUint32(expected[i*4:], v) + } + + assert.Equal(t, f.toBytesLe(), expected) + assert.Equal(t, len(f.toBytesLe()), 32) +} + +func TestNewPointBN254Zero(t *testing.T) { + point := NewPointBN254Zero() + a := new(PointBN254) + a.ToGnarkJac() + + assert.Equal(t, point.x, *NewFieldZero[BaseField]()) + assert.Equal(t, point.y, *NewBaseFieldOne()) + assert.Equal(t, point.z, *NewFieldZero[BaseField]()) +} + +func TestBN254Eq(t *testing.T) { + p1 := NewPointBN254Zero() + p2 := NewPointBN254Zero() + p3 := &PointBN254{ + x: *NewBaseFieldOne(), + y: *NewBaseFieldOne(), + z: *NewBaseFieldOne(), + } + + assert.Equal(t, p1.eq(p2), true) + assert.Equal(t, p1.eq(p3), false) +} + +func TestBN254StripZ(t *testing.T) { + p1 := NewPointBN254Zero() + p2ZLess := p1.strip_z() + + assert.IsType(t, PointAffineNoInfinityBN254{}, *p2ZLess) + assert.Equal(t, p1.x, p2ZLess.x) + assert.Equal(t, p1.y, p2ZLess.y) +} + +func TestPointBN254FromGnark(t *testing.T) { + gnarkP, _ := randG1Jac() + + p := PointBN254FromJacGnark(&gnarkP) + + z_inv := new(fp.Element) + z_invsq := new(fp.Element) + z_invq3 := 
new(fp.Element) + x := new(fp.Element) + y := new(fp.Element) + + z_inv.Inverse(&gnarkP.Z) + z_invsq.Mul(z_inv, z_inv) + z_invq3.Mul(z_invsq, z_inv) + + x.Mul(&gnarkP.X, z_invsq) + y.Mul(&gnarkP.Y, z_invq3) + + assert.Equal(t, p.x, *NewFieldFromFpGnark[BaseField](*x)) + assert.Equal(t, p.y, *NewFieldFromFpGnark[BaseField](*y)) + assert.Equal(t, p.z, *NewBaseFieldOne()) +} + +func TestPointBN254fromLimbs(t *testing.T) { + gnarkP, _ := randG1Jac() + p := PointBN254FromJacGnark(&gnarkP) + + x := p.x.limbs() + y := p.y.limbs() + z := p.z.limbs() + + xSlice := x[:] + ySlice := y[:] + zSlice := z[:] + + pFromLimbs := PointBN254fromLimbs(&xSlice, &ySlice, &zSlice) + + assert.Equal(t, pFromLimbs, p) +} + +func TestNewPointAffineNoInfinityBN254Zero(t *testing.T) { + zeroP := NewPointAffineNoInfinityBN254Zero() + + assert.Equal(t, zeroP.x, *NewFieldZero[BaseField]()) + assert.Equal(t, zeroP.y, *NewFieldZero[BaseField]()) +} + +func TestPointAffineNoInfinityBN254ToProjective(t *testing.T) { + gnarkP, _ := randG1Jac() + affine := PointBN254FromJacGnark(&gnarkP).strip_z() + proj := affine.toProjective() + + assert.Equal(t, proj.x, affine.x) + assert.Equal(t, proj.x, affine.x) + assert.Equal(t, proj.z, *NewBaseFieldOne()) +} + +func TestPointAffineNoInfinityBN254FromLimbs(t *testing.T) { + // Initialize your test values + x := []uint32{1, 2, 3, 4, 5, 6, 7, 8} + y := []uint32{9, 10, 11, 12, 13, 14, 15, 16} + + // Execute your function + result := PointAffineNoInfinityBN254FromLimbs(&x, &y) + + // Define your expected result + expected := &PointAffineNoInfinityBN254{ + x: *BaseFieldFromLimbs(getFixedLimbs(&x)), + y: *BaseFieldFromLimbs(getFixedLimbs(&y)), + } + + // Test if result is as expected + assert.Equal(t, result, expected) +} + +func TestToGnarkAffine(t *testing.T) { + gJac, _ := randG1Jac() + proj := PointBN254FromJacGnark(&gJac) + + var gAffine bn254.G1Affine + gAffine.FromJacobian(&gJac) + + affine := *proj.toGnarkAffine() + assert.Equal(t, affine, gAffine) +} + +func TestGetFixedLimbs(t *testing.T) { + t.Run("case of valid input of length less than 8", func(t *testing.T) { + slice := []uint32{1, 2, 3, 4, 5, 6, 7} + expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 0} + + result := getFixedLimbs(&slice) + assert.Equal(t, result, expected) + }) + + t.Run("case of valid input of length 8", func(t *testing.T) { + slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8} + expected := [8]uint32{1, 2, 3, 4, 5, 6, 7, 8} + + result := getFixedLimbs(&slice) + assert.Equal(t, result, expected) + }) + + t.Run("case of empty input", func(t *testing.T) { + slice := []uint32{} + expected := [8]uint32{0, 0, 0, 0, 0, 0, 0, 0} + + result := getFixedLimbs(&slice) + assert.Equal(t, result, expected) + }) + + t.Run("case of input length greater than 8", func(t *testing.T) { + slice := []uint32{1, 2, 3, 4, 5, 6, 7, 8, 9} + + defer func() { + if r := recover(); r == nil { + t.Errorf("the code did not panic") + } + }() + + getFixedLimbs(&slice) + }) +} diff --git a/goicicle/curves/bn254/g2.go b/goicicle/curves/bn254/g2.go new file mode 100644 index 000000000..63a32d5a0 --- /dev/null +++ b/goicicle/curves/bn254/g2.go @@ -0,0 +1,235 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bn254 + +import ( + "encoding/binary" + "errors" + "fmt" + "unsafe" + + "github.com/consensys/gnark-crypto/ecc/bn254" + "github.com/consensys/gnark-crypto/ecc/bn254/fp" +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/bn254/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254 +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +func BatchConvertFromG2Affine(elements []bn254.G2Affine) []G2PointAffine { + var newElements []G2PointAffine + for _, gg2Affine := range elements { + var newElement G2PointAffine + newElement.FromGnarkAffine(&gg2Affine) + + newElements = append(newElements, newElement) + } + return newElements +} + +func BatchConvertFromG2AffineThreads(elements []bn254.G2Affine, routines int) []G2PointAffine { + var newElements []G2PointAffine + + if routines > 1 && routines <= len(elements) { + channels := make([]chan []G2PointAffine, routines) + for i := 0; i < routines; i++ { + channels[i] = make(chan []G2PointAffine, 1) + } + + convert := func(elements []bn254.G2Affine, chanIndex int) { + var convertedElements []G2PointAffine + for _, e := range elements { + var converted G2PointAffine + converted.FromGnarkAffine(&e) + convertedElements = append(convertedElements, converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements) / routines + for i := 0; i < routines; i++ { + start := batchLen * i + end := batchLen * (i + 1) + elemsToConv := elements[start:end] + if i == routines-1 { + elemsToConv = elements[start:] + } + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i++ { + newElements = append(newElements, <-channels[i]...) + } + } else { + for _, e := range elements { + var converted G2PointAffine + converted.FromGnarkAffine(&e) + newElements = append(newElements, converted) + } + } + + return newElements +} + +// G2 extension field + +type G2Element [4]uint64 + +type ExtentionField struct { + A0, A1 G2Element +} + +type G2PointAffine struct { + x, y ExtentionField +} + +type G2Point struct { + x, y, z ExtentionField +} + +func (p *G2Point) eqg2(pCompare *G2Point) bool { + // Cast *PointBN254 to *C.BN254_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.BN254_g2_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.BN254_g2_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. + return bool(C.eq_g2_bn254(pC, pCompareC)) +} + +func (f *G2Element) toBytesLe() []byte { + var bytes []byte + for _, val := range f { + buf := make([]byte, 8) // 8 bytes because uint64 is 64-bit + binary.LittleEndian.PutUint64(buf, val) + bytes = append(bytes, buf...) 
+ } + return bytes +} + +/* +TODO: the following functions are due to a bug in the cuda code, +these fucntions should be deleted once cuda MsmG2 returns non montgomery format +*/ +const ( + q0 uint64 = 4332616871279656263 + q1 uint64 = 10917124144477883021 + q2 uint64 = 13281191951274694749 + q3 uint64 = 3486998266802970665 +) + +func smallerThanModulus(z fp.Element) bool { + return (z[3] < q3 || (z[3] == q3 && (z[2] < q2 || (z[2] == q2 && (z[1] < q1 || (z[1] == q1 && (z[0] < q0))))))) +} + +func ElementWithOutConvertingToMontgomery(b *[32]byte) (fp.Element, error) { + var z fp.Element + z[0] = binary.LittleEndian.Uint64((*b)[0:8]) + z[1] = binary.LittleEndian.Uint64((*b)[8:16]) + z[2] = binary.LittleEndian.Uint64((*b)[16:24]) + z[3] = binary.LittleEndian.Uint64((*b)[24:32]) + + if !smallerThanModulus(z) { + return fp.Element{}, errors.New("invalid fp.Element encoding") + } + + return z, nil +} + +func (f *G2Element) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := ElementWithOutConvertingToMontgomery(&b32) // cuda returns montgomery format + //v2, e := fp.LittleEndian.Element(&b32) // TODO: revert back to this once cuda code is fixed. + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *ExtentionField) toGnarkE2() bn254.E2 { + return bn254.E2{ + A0: *f.A0.toGnarkFp(), + A1: *f.A1.toGnarkFp(), + } +} + +func (p *G2Point) ToGnarkJac() *bn254.G2Jac { + x := p.x.toGnarkE2() + y := p.y.toGnarkE2() + z := p.z.toGnarkE2() + + var zSquared bn254.E2 + zSquared.Mul(&z, &z) + + var X bn254.E2 + X.Mul(&x, &z) + + var Y bn254.E2 + Y.Mul(&y, &zSquared) + + after := bn254.G2Jac{ + X: X, + Y: Y, + Z: z, + } + + return &after +} + +func (p *G2PointAffine) ToProjective() G2Point { + return G2Point{ + x: p.x, + y: p.y, + z: ExtentionField{ + A0: G2Element{1, 0, 0, 0}, + A1: G2Element{0, 0, 0, 0}, + }, + } +} + +func (g *G2PointAffine) FromGnarkAffine(gnark *bn254.G2Affine) *G2PointAffine { + g.x.A0 = gnark.X.A0.Bits() + g.x.A1 = gnark.X.A1.Bits() + g.y.A0 = gnark.Y.A0.Bits() + g.y.A1 = gnark.Y.A1.Bits() + + return g +} + +func (g *G2PointAffine) FromGnarkJac(gnark *bn254.G2Jac) *G2PointAffine { + var pointAffine bn254.G2Affine + pointAffine.FromJacobian(gnark) + + g.x.A0 = pointAffine.X.A0.Bits() + g.x.A1 = pointAffine.X.A1.Bits() + g.y.A0 = pointAffine.Y.A0.Bits() + g.y.A1 = pointAffine.Y.A1.Bits() + + return g +} diff --git a/goicicle/curves/bn254/g2_test.go b/goicicle/curves/bn254/g2_test.go new file mode 100644 index 000000000..1d8233ec1 --- /dev/null +++ b/goicicle/curves/bn254/g2_test.go @@ -0,0 +1,18 @@ +package bn254 + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestToGnarkJacG2(t *testing.T) { + gnark, _ := randG2Jac() + + var pointAffine G2PointAffine + pointAffine.FromGnarkJac(&gnark) + pointProjective := pointAffine.ToProjective() + backToGnark := pointProjective.ToGnarkJac() + + assert.True(t, gnark.Equal(backToGnark)) +} diff --git a/goicicle/curves/bn254/msm.go b/goicicle/curves/bn254/msm.go new file mode 100644 index 000000000..d7a88d81e --- /dev/null +++ b/goicicle/curves/bn254/msm.go @@ -0,0 +1,187 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bn254 + +import ( + "errors" + "fmt" + "unsafe" +) + +// #cgo CFLAGS: -I../../../icicle/curves/bn254/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254 +// #include "msm.h" +import "C" + +func MsmBN254(out *PointBN254, points []PointAffineNoInfinityBN254, scalars []ScalarField, device_id int) (*PointBN254, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + pointsC := (*C.BN254_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.BN254_projective_t)(unsafe.Pointer(out)) + ret := C.msm_cuda_bn254(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_cuda_bn254 returned error code: %d", ret) + } + + return out, nil +} + +func MsmG2BatchBN254(points *[]G2PointAffine, scalars *[]ScalarField, batchSize, deviceId int) ([]*G2Point, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]*G2Point, batchSize) + + outC := (*C.BN254_g2_projective_t)(unsafe.Pointer(&out[0])) + pointsC := (*C.BN254_g2_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := C.size_t(batchSize) + + ret := C.msm_batch_g2_cuda_bn254(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_cuda_bn254 returned error code: %d", ret) + } + + return out, nil +} + +func MsmG2BN254(out *G2Point, points []G2PointAffine, scalars []ScalarField, device_id int) (*G2Point, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + pointsC := (*C.BN254_g2_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.BN254_g2_projective_t)(unsafe.Pointer(out)) + + ret := C.msm_g2_cuda_bn254(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_g2_cuda_bn254 returned error code: %d", ret) + } + + return out, nil +} + +func CommitG2(d_out, d_scalars, d_points unsafe.Pointer, count int) int { + d_outC := (*C.BN254_g2_projective_t)(d_out) + scalarsC := (*C.BN254_scalar_t)(d_scalars) + pointsC := (*C.BN254_g2_affine_t)(d_points) + countC := (C.size_t)(count) + + ret := C.commit_g2_cuda_bn254(d_outC, scalarsC, pointsC, countC, 0) + + if ret != 0 { + return -1 + } + + return 0 +} + +func MsmBatchBN254(points *[]PointAffineNoInfinityBN254, scalars 
*[]ScalarField, batchSize, deviceId int) ([]*PointBN254, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]*PointBN254, batchSize) + + for i := 0; i < len(out); i++ { + out[i] = NewPointBN254Zero() + } + + outC := (*C.BN254_projective_t)(unsafe.Pointer(&out[0])) + pointsC := (*C.BN254_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := C.size_t(batchSize) + + ret := C.msm_batch_cuda_bn254(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_cuda_bn254 returned error code: %d", ret) + } + + return out, nil +} + +func Commit(d_out, d_scalars, d_points unsafe.Pointer, count int) int { + d_outC := (*C.BN254_projective_t)(d_out) + scalarsC := (*C.BN254_scalar_t)(d_scalars) + pointsC := (*C.BN254_affine_t)(d_points) + countC := (C.size_t)(count) + + ret := C.commit_cuda_bn254(d_outC, scalarsC, pointsC, countC, 0) + + if ret != 0 { + return -1 + } + + return 0 +} + +func CommitBatch(d_out, d_scalars, d_points unsafe.Pointer, count, batch_size int) int { + d_outC := (*C.BN254_projective_t)(d_out) + scalarsC := (*C.BN254_scalar_t)(d_scalars) + pointsC := (*C.BN254_affine_t)(d_points) + countC := (C.size_t)(count) + batch_sizeC := (C.size_t)(batch_size) + + ret := C.commit_batch_cuda_bn254(d_outC, scalarsC, pointsC, countC, batch_sizeC, 0) + + if ret != 0 { + return -1 + } + + return 0 +} diff --git a/goicicle/curves/bn254/msm_test.go b/goicicle/curves/bn254/msm_test.go new file mode 100644 index 000000000..73d636ae0 --- /dev/null +++ b/goicicle/curves/bn254/msm_test.go @@ -0,0 +1,391 @@ +package bn254 + +import ( + "fmt" + "math" + "math/big" + "testing" + "time" + "unsafe" + + "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/ecc/bn254" + "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/ingonyama-zk/icicle/goicicle" + "github.com/stretchr/testify/assert" +) + +func randG1Jac() (bn254.G1Jac, error) { + var point bn254.G1Jac + var scalar fr.Element + + _, err := scalar.SetRandom() + if err != nil { + return point, err + } + + genG1Jac, _, _, _ := bn254.Generators() + + //randomBigInt, err := rand.Int(rand.Reader, new(big.Int).Lsh(big.NewInt(1), 63)) + //randomBigInt, err := rand.Int(rand.Reader, big.NewInt(100)) + randomBigInt := big.NewInt(100) + + point.ScalarMultiplication(&genG1Jac, scalar.BigInt(randomBigInt)) + return point, nil +} + +func GeneratePoints(count int) ([]PointAffineNoInfinityBN254, []bn254.G1Affine) { + // Declare a slice of integers + var points []PointAffineNoInfinityBN254 + var pointsAffine []bn254.G1Affine + + // populate the slice + for i := 0; i < 10; i++ { + gnarkP, _ := randG1Jac() + var pointAffine bn254.G1Affine + pointAffine.FromJacobian(&gnarkP) + + p := PointBN254FromJacGnark(&gnarkP).strip_z() + + pointsAffine = append(pointsAffine, pointAffine) + points = append(points, *p) + } + + log2_10 := math.Log2(10) + log2Count := math.Log2(float64(count)) + 
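+	// only 10 random base points are sampled above; the slices are doubled
+	// ceil(log2(count/10)) times below so that at least count points exist
+	// before truncating to points[:count]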
log2Size := int(math.Ceil(log2Count - log2_10)) + + for i := 0; i < log2Size; i++ { + pointsAffine = append(pointsAffine, pointsAffine...) + points = append(points, points...) + } + + return points[:count], pointsAffine[:count] +} + +func GeneratePointsProj(count int) ([]PointBN254, []bn254.G1Jac) { + // Declare a slice of integers + var points []PointBN254 + var pointsAffine []bn254.G1Jac + + // Use a loop to populate the slice + for i := 0; i < count; i++ { + gnarkP, _ := randG1Jac() + p := PointBN254FromJacGnark(&gnarkP) + + pointsAffine = append(pointsAffine, gnarkP) + points = append(points, *p) + } + + return points, pointsAffine +} + +func GenerateScalars(count int) ([]ScalarField, []fr.Element) { + // Declare a slice of integers + var scalars []ScalarField + var scalars_fr []fr.Element + + var rand fr.Element + for i := 0; i < count; i++ { + rand.SetRandom() + s := NewFieldFromFrGnark[ScalarField](rand) + + scalars_fr = append(scalars_fr, rand) + scalars = append(scalars, *s) + } + + return scalars[:count], scalars_fr[:count] +} + +func TestMSM(t *testing.T) { + for _, v := range []int{24} { + count := 1 << v + + points, gnarkPoints := GeneratePoints(count) + fmt.Print("Finished generating points\n") + scalars, gnarkScalars := GenerateScalars(count) + fmt.Print("Finished generating scalars\n") + + out := new(PointBN254) + startTime := time.Now() + _, e := MsmBN254(out, points, scalars, 0) // non mont + fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds()) + + assert.Equal(t, e, nil, "error should be nil") + fmt.Print("Finished icicle MSM\n") + + var bn254AffineLib bn254.G1Affine + + gResult, _ := bn254AffineLib.MultiExp(gnarkPoints, gnarkScalars, ecc.MultiExpConfig{}) + fmt.Print("Finished Gnark MSM\n") + + assert.Equal(t, out.toGnarkAffine(), gResult) + } +} + +func TestCommitMSM(t *testing.T) { + for _, _ = range []int{24} { + count := 12_180_757 + // count := 1 << v - 1 + + points, gnarkPoints := GeneratePoints(count) + fmt.Print("Finished generating points\n") + scalars, gnarkScalars := GenerateScalars(count) + fmt.Print("Finished generating scalars\n") + + out_d, _ := goicicle.CudaMalloc(96) + + pointsBytes := count * 64 + points_d, _ := goicicle.CudaMalloc(pointsBytes) + goicicle.CudaMemCpyHtoD[PointAffineNoInfinityBN254](points_d, points, pointsBytes) + + scalarBytes := count * 32 + scalars_d, _ := goicicle.CudaMalloc(scalarBytes) + goicicle.CudaMemCpyHtoD[ScalarField](scalars_d, scalars, scalarBytes) + + startTime := time.Now() + e := Commit(out_d, scalars_d, points_d, count) + fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds()) + + outHost := make([]PointBN254, 1) + goicicle.CudaMemCpyDtoH[PointBN254](outHost, out_d, 96) + + assert.Equal(t, e, 0, "error should be 0") + fmt.Print("Finished icicle MSM\n") + + var bn254AffineLib bn254.G1Affine + + gResult, _ := bn254AffineLib.MultiExp(gnarkPoints, gnarkScalars, ecc.MultiExpConfig{}) + fmt.Print("Finished Gnark MSM\n") + + assert.Equal(t, outHost[0].toGnarkAffine(), gResult) + } +} + +func BenchmarkCommit(b *testing.B) { + LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26} + + for _, logMsmSize := range LOG_MSM_SIZES { + msmSize := 1 << logMsmSize + points, _ := GeneratePoints(msmSize) + scalars, _ := GenerateScalars(msmSize) + + out_d, _ := goicicle.CudaMalloc(96) + + pointsBytes := msmSize * 64 + points_d, _ := goicicle.CudaMalloc(pointsBytes) + goicicle.CudaMemCpyHtoD[PointAffineNoInfinityBN254](points_d, points, pointsBytes) + + scalarBytes := msmSize * 32 + scalars_d, _ := 
goicicle.CudaMalloc(scalarBytes) + goicicle.CudaMemCpyHtoD[ScalarField](scalars_d, scalars, scalarBytes) + + b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) { + for n := 0; n < b.N; n++ { + e := Commit(out_d, scalars_d, points_d, msmSize) + + if e != 0 { + panic("Error occured") + } + } + }) + } +} + +func TestBenchMSM(t *testing.T) { + for _, batchPow2 := range []int{2, 4} { + for _, pow2 := range []int{4, 6} { + msmSize := 1 << pow2 + batchSize := 1 << batchPow2 + count := msmSize * batchSize + + points, _ := GeneratePoints(count) + scalars, _ := GenerateScalars(count) + + a, e := MsmBatchBN254(&points, &scalars, batchSize, 0) + + if e != nil { + t.Errorf("MsmBatchBN254 returned an error: %v", e) + } + + if len(a) != batchSize { + t.Errorf("Expected length %d, but got %d", batchSize, len(a)) + } + } + } +} + +func BenchmarkMSM(b *testing.B) { + LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26} + + for _, logMsmSize := range LOG_MSM_SIZES { + msmSize := 1 << logMsmSize + points, _ := GeneratePoints(msmSize) + scalars, _ := GenerateScalars(msmSize) + b.Run(fmt.Sprintf("MSM %d", logMsmSize), func(b *testing.B) { + for n := 0; n < b.N; n++ { + out := new(PointBN254) + _, e := MsmBN254(out, points, scalars, 0) + + if e != nil { + panic("Error occured") + } + } + }) + } +} + +// G2 + +func randG2Jac() (bn254.G2Jac, error) { + var point bn254.G2Jac + var scalar fr.Element + + _, err := scalar.SetRandom() + if err != nil { + return point, err + } + + _, genG2Jac, _, _ := bn254.Generators() + + randomBigInt := big.NewInt(1000) + + point.ScalarMultiplication(&genG2Jac, scalar.BigInt(randomBigInt)) + return point, nil +} + +func GenerateG2Points(count int) ([]G2PointAffine, []bn254.G2Affine) { + // Declare a slice of integers + var points []G2PointAffine + var pointsAffine []bn254.G2Affine + + // populate the slice + for i := 0; i < count; i++ { + gnarkP, _ := randG2Jac() + + var p G2PointAffine + p.FromGnarkJac(&gnarkP) + + var gp bn254.G2Affine + gp.FromJacobian(&gnarkP) + pointsAffine = append(pointsAffine, gp) + points = append(points, p) + } + + return points, pointsAffine +} + +func TestMsmG2BN254(t *testing.T) { + for _, v := range []int{24} { + count := 1 << v + points, gnarkPoints := GenerateG2Points(count) + fmt.Print("Finished generating points\n") + scalars, gnarkScalars := GenerateScalars(count) + fmt.Print("Finished generating scalars\n") + + out := new(G2Point) + _, e := MsmG2BN254(out, points, scalars, 0) + assert.Equal(t, e, nil, "error should be nil") + + var result G2PointAffine + var bn254AffineLib bn254.G2Affine + + gResult, _ := bn254AffineLib.MultiExp(gnarkPoints, gnarkScalars, ecc.MultiExpConfig{}) + + result.FromGnarkAffine(gResult) + + pp := result.ToProjective() + assert.True(t, out.eqg2(&pp)) + //assert.Equal(t, out, result.ToProjective()) + } +} + +func BenchmarkMsmG2BN254(b *testing.B) { + LOG_MSM_SIZES := []int{20, 21, 22, 23, 24, 25, 26} + + for _, logMsmSize := range LOG_MSM_SIZES { + msmSize := 1 << logMsmSize + points, _ := GenerateG2Points(msmSize) + scalars, _ := GenerateScalars(msmSize) + b.Run(fmt.Sprintf("MSM G2 %d", logMsmSize), func(b *testing.B) { + for n := 0; n < b.N; n++ { + out := new(G2Point) + _, e := MsmG2BN254(out, points, scalars, 0) + + if e != nil { + panic("Error occured") + } + } + }) + } +} + +func TestCommitG2MSM(t *testing.T) { + for _, v := range []int{24} { + count := 1 << v + + points, gnarkPoints := GenerateG2Points(count) + fmt.Print("Finished generating points\n") + scalars, gnarkScalars := GenerateScalars(count) + 
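+		// G2 points are wider than G1 points, so the device buffer sizes below are
+		// derived from unsafe.Sizeof rather than the hard-coded 64/96 bytes used in
+		// the G1 commit test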
fmt.Print("Finished generating scalars\n") + + var sizeCheckG2PointAffine G2PointAffine + inputPointsBytes := count * int(unsafe.Sizeof(sizeCheckG2PointAffine)) + + var sizeCheckG2Point G2Point + out_d, _ := goicicle.CudaMalloc(int(unsafe.Sizeof(sizeCheckG2Point))) + + points_d, _ := goicicle.CudaMalloc(inputPointsBytes) + goicicle.CudaMemCpyHtoD[G2PointAffine](points_d, points, inputPointsBytes) + + scalarBytes := count * 32 + scalars_d, _ := goicicle.CudaMalloc(scalarBytes) + goicicle.CudaMemCpyHtoD[ScalarField](scalars_d, scalars, scalarBytes) + + startTime := time.Now() + e := CommitG2(out_d, scalars_d, points_d, count) + fmt.Printf("icicle MSM took: %d ms\n", time.Since(startTime).Milliseconds()) + + outHost := make([]G2Point, 1) + goicicle.CudaMemCpyDtoH[G2Point](outHost, out_d, int(unsafe.Sizeof(sizeCheckG2Point))) + + assert.Equal(t, e, 0, "error should be 0") + fmt.Print("Finished icicle MSM\n") + + var bn254AffineLib bn254.G2Affine + + gResult, _ := bn254AffineLib.MultiExp(gnarkPoints, gnarkScalars, ecc.MultiExpConfig{}) + fmt.Print("Finished Gnark MSM\n") + var resultGnark G2PointAffine + resultGnark.FromGnarkAffine(gResult) + + resultGnarkProjective := resultGnark.ToProjective() + assert.Equal(t, len(outHost), 1) + result := outHost[0] + + assert.True(t, result.eqg2(&resultGnarkProjective)) + } +} + +func TestBatchG2MSM(t *testing.T) { + for _, batchPow2 := range []int{2, 4} { + for _, pow2 := range []int{4, 6} { + msmSize := 1 << pow2 + batchSize := 1 << batchPow2 + count := msmSize * batchSize + + points, _ := GenerateG2Points(count) + scalars, _ := GenerateScalars(count) + + a, e := MsmG2BatchBN254(&points, &scalars, batchSize, 0) + + if e != nil { + t.Errorf("MsmBatchBN254 returned an error: %v", e) + } + + if len(a) != batchSize { + t.Errorf("Expected length %d, but got %d", batchSize, len(a)) + } + } + } +} diff --git a/goicicle/curves/bn254/ntt.go b/goicicle/curves/bn254/ntt.go new file mode 100644 index 000000000..313481692 --- /dev/null +++ b/goicicle/curves/bn254/ntt.go @@ -0,0 +1,202 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Code generated by Ingonyama DO NOT EDIT + +package bn254 + +// #cgo CFLAGS: -I../../../icicle/curves/bn254/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254 +// #include "ntt.h" +import "C" +import ( + "errors" + "fmt" + "unsafe" + + "github.com/ingonyama-zk/icicle/goicicle" +) + +const ( + NONE = 0 + DIF = 1 + DIT = 2 +) + +func NttBN254(scalars *[]ScalarField, isInverse bool, decimation int, deviceId int) uint64 { + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + + ret := C.ntt_cuda_bn254(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(decimation), C.size_t(deviceId)) + + return uint64(ret) +} + +func NttBatchBN254(scalars *[]ScalarField, isInverse bool, batchSize, deviceId int) uint64 { + scalarsC := (*C.BN254_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + isInverseC := C.bool(isInverse) + batchSizeC := C.uint32_t(batchSize) + deviceIdC := C.size_t(deviceId) + + ret := C.ntt_batch_cuda_bn254(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBN254(values *[]PointBN254, isInverse bool, deviceId int) uint64 { + valuesC := (*C.BN254_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + + ret := C.ecntt_cuda_bn254(valuesC, n, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBatchBN254(values *[]PointBN254, isInverse bool, batchSize, deviceId int) uint64 { + valuesC := (*C.BN254_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + batchSizeC := C.uint32_t(batchSize) + + ret := C.ecntt_batch_cuda_bn254(valuesC, n, batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} + +func GenerateTwiddles(d_size int, log_d_size int, inverse bool) (up unsafe.Pointer, err error) { + domain_size := C.uint32_t(d_size) + logn := C.uint32_t(log_d_size) + is_inverse := C.bool(inverse) + + dp := C.build_domain_cuda_bn254(domain_size, logn, is_inverse, 0, 0) + + if dp == nil { + err = errors.New("nullptr returned from generating twiddles") + return unsafe.Pointer(nil), err + } + + return unsafe.Pointer(dp), nil +} + +// Reverses d_scalars in-place +func ReverseScalars(d_scalars unsafe.Pointer, len int) (int, error) { + scalarsC := (*C.BN254_scalar_t)(d_scalars) + lenC := C.int(len) + if success := C.reverse_order_scalars_cuda_bn254(scalarsC, lenC, 0, 0); success != 0 { + return -1, errors.New("reversing failed") + } + return 0, nil +} + +func Interpolate(scalars, twiddles, cosetPowers unsafe.Pointer, size int, isCoset bool) unsafe.Pointer { + size_d := size * 32 + dp, err := goicicle.CudaMalloc(size_d) + + if err != nil { + return nil + } + + d_out := (*C.BN254_scalar_t)(dp) + scalarsC := (*C.BN254_scalar_t)(scalars) + twiddlesC := (*C.BN254_scalar_t)(twiddles) + cosetPowersC := (*C.BN254_scalar_t)(cosetPowers) + sizeC := C.uint(size) + + var ret C.int + if isCoset { + ret = C.interpolate_scalars_on_coset_cuda_bn254(d_out, scalarsC, twiddlesC, sizeC, cosetPowersC, 0, 0) + } else { + ret = C.interpolate_scalars_cuda_bn254(d_out, scalarsC, twiddlesC, sizeC, 0, 0) + } + if ret != 0 { + fmt.Print("error interpolating") + } + + return unsafe.Pointer(d_out) +} + +func Evaluate(scalars_out, scalars, twiddles, coset_powers unsafe.Pointer, scalars_size, twiddles_size int, isCoset bool) int { + scalars_outC := (*C.BN254_scalar_t)(scalars_out) + scalarsC := (*C.BN254_scalar_t)(scalars) + twiddlesC := (*C.BN254_scalar_t)(twiddles) 
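+	// the unsafe.Pointer arguments are presumed to be device pointers
+	// (e.g. allocated with goicicle.CudaMalloc), matching how Interpolate
+	// allocates its output above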
+ coset_powersC := (*C.BN254_scalar_t)(coset_powers) + sizeC := C.uint(scalars_size) + twiddlesC_size := C.uint(twiddles_size) + + var ret C.int + if isCoset { + ret = C.evaluate_scalars_on_coset_cuda_bn254(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, coset_powersC, 0, 0) + } else { + ret = C.evaluate_scalars_cuda_bn254(scalars_outC, scalarsC, twiddlesC, twiddlesC_size, sizeC, 0, 0) + } + + if ret != 0 { + fmt.Print("error interpolating") + return -1 + } + + return 0 +} + +func VecScalarAdd(in1_d, in2_d unsafe.Pointer, size int) int { + in1_dC := (*C.BN254_scalar_t)(in1_d) + in2_dC := (*C.BN254_scalar_t)(in2_d) + sizeC := C.uint(size) + + ret := C.add_scalars_cuda_bn254(in1_dC, in1_dC, in2_dC, sizeC, 0) + + if ret != 0 { + fmt.Print("error adding scalar vectors") + return -1 + } + + return 0 +} + +func VecScalarSub(in1_d, in2_d unsafe.Pointer, size int) int { + in1_dC := (*C.BN254_scalar_t)(in1_d) + in2_dC := (*C.BN254_scalar_t)(in2_d) + sizeC := C.uint(size) + + ret := C.sub_scalars_cuda_bn254(in1_dC, in1_dC, in2_dC, sizeC, 0) + + if ret != 0 { + fmt.Print("error subtracting scalar vectors") + return -1 + } + + return 0 +} + +func ToMontgomery(d_scalars unsafe.Pointer, len int) (int, error) { + scalarsC := (*C.BN254_scalar_t)(d_scalars) + lenC := C.uint(len) + if success := C.to_montgomery_scalars_cuda_bn254(scalarsC, lenC, 0); success != 0 { + return -1, errors.New("reversing failed") + } + return 0, nil +} + +func FromMontgomery(d_scalars unsafe.Pointer, len int) (int, error) { + scalarsC := (*C.BN254_scalar_t)(d_scalars) + lenC := C.uint(len) + if success := C.from_montgomery_scalars_cuda_bn254(scalarsC, lenC, 0); success != 0 { + return -1, errors.New("reversing failed") + } + return 0, nil +} + + diff --git a/goicicle/curves/bn254/ntt_test.go b/goicicle/curves/bn254/ntt_test.go new file mode 100644 index 000000000..9c53afb3d --- /dev/null +++ b/goicicle/curves/bn254/ntt_test.go @@ -0,0 +1,219 @@ +package bn254 + +import ( + "fmt" + "reflect" + "testing" + + "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft" + "github.com/stretchr/testify/assert" +) + +func TestNttBN254BBB(t *testing.T) { + count := 1 << 20 + scalars, frScalars := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBatchBN254(&nttResult, false, count, 0) + assert.NotEqual(t, nttResult, scalars) + + domain := fft.NewDomain(uint64(len(scalars))) + // DIT WITH NO INVERSE + // DIF WITH INVERSE + domain.FFT(frScalars, fft.DIT) //DIF + + nttResultTransformedToGnark := make([]fr.Element, len(scalars)) // Make a new slice with the same length + + for k, v := range nttResult { + nttResultTransformedToGnark[k] = *v.toGnarkFr() + } + + assert.Equal(t, nttResultTransformedToGnark, frScalars) +} + +func TestNttBN254CompareToGnarkDIF(t *testing.T) { + count := 1 << 2 + scalars, frScalars := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBN254(&nttResult, false, DIF, 0) + assert.NotEqual(t, nttResult, scalars) + + domain := fft.NewDomain(uint64(len(scalars))) + // DIT WITH NO INVERSE + // DIF WITH INVERSE + domain.FFT(frScalars, fft.DIF) //DIF + + nttResultTransformedToGnark := make([]fr.Element, len(scalars)) // Make a new slice with the same length + + for k, v := range nttResult { + 
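+		// map each icicle scalar back to a gnark fr.Element so the icicle NTT
+		// output can be compared directly against gnark's FFT result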
nttResultTransformedToGnark[k] = *v.toGnarkFr() + } + + assert.Equal(t, nttResultTransformedToGnark, frScalars) +} + +func TestNttBN254CompareToGnarkDIT(t *testing.T) { + count := 1 << 2 + scalars, frScalars := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBN254(&nttResult, false, DIT, 0) + assert.NotEqual(t, nttResult, scalars) + + domain := fft.NewDomain(uint64(len(scalars))) + // DIT WITH NO INVERSE + // DIF WITH INVERSE + domain.FFT(frScalars, fft.DIT) //DIF + + nttResultTransformedToGnark := make([]fr.Element, len(scalars)) // Make a new slice with the same length + + for k, v := range nttResult { + nttResultTransformedToGnark[k] = *v.toGnarkFr() + } + + assert.Equal(t, nttResultTransformedToGnark, frScalars) +} + +func TestINttBN254CompareToGnarkDIT(t *testing.T) { + count := 1 << 3 + scalars, frScalars := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBN254(&nttResult, true, DIT, 0) + assert.NotEqual(t, nttResult, scalars) + + frResScalars := make([]fr.Element, len(frScalars)) // Make a new slice with the same length + copy(frResScalars, frScalars) + + domain := fft.NewDomain(uint64(len(scalars))) + domain.FFTInverse(frResScalars, fft.DIT) + + assert.NotEqual(t, frResScalars, frScalars) + + nttResultTransformedToGnark := make([]fr.Element, len(scalars)) // Make a new slice with the same length + + for k, v := range nttResult { + nttResultTransformedToGnark[k] = *v.toGnarkFr() + } + + assert.Equal(t, nttResultTransformedToGnark, frResScalars) +} + +func TestINttBN254CompareToGnarkDIF(t *testing.T) { + count := 1 << 3 + scalars, frScalars := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBN254(&nttResult, true, DIF, 0) + assert.NotEqual(t, nttResult, scalars) + + domain := fft.NewDomain(uint64(len(scalars))) + domain.FFTInverse(frScalars, fft.DIF) + + nttResultTransformedToGnark := make([]fr.Element, len(scalars)) // Make a new slice with the same length + + for k, v := range nttResult { + nttResultTransformedToGnark[k] = *v.toGnarkFr() + } + + assert.Equal(t, nttResultTransformedToGnark, frScalars) +} + +func TestNttBN254(t *testing.T) { + count := 1 << 3 + + scalars, _ := GenerateScalars(count) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + + assert.Equal(t, nttResult, scalars) + NttBN254(&nttResult, false, NONE, 0) + assert.NotEqual(t, nttResult, scalars) + + inttResult := make([]ScalarField, len(nttResult)) + copy(inttResult, nttResult) + + assert.Equal(t, inttResult, nttResult) + NttBN254(&inttResult, true, NONE, 0) + assert.Equal(t, inttResult, scalars) +} + +func TestNttBatchBN254(t *testing.T) { + count := 1 << 5 + batches := 4 + + scalars, _ := GenerateScalars(count * batches) + + var scalarVecOfVec [][]ScalarField = make([][]ScalarField, 0) + + for i := 0; i < batches; i++ { + start := i * count + end := (i + 1) * count + batch := make([]ScalarField, len(scalars[start:end])) + copy(batch, scalars[start:end]) + scalarVecOfVec = append(scalarVecOfVec, batch) + } + + nttBatchResult := make([]ScalarField, len(scalars)) + copy(nttBatchResult, scalars) + + NttBatchBN254(&nttBatchResult, false, 
count, 0) + + var nttResultVecOfVec [][]ScalarField + + for i := 0; i < batches; i++ { + // Clone the slice + clone := make([]ScalarField, len(scalarVecOfVec[i])) + copy(clone, scalarVecOfVec[i]) + + // Add it to the result vector of vectors + nttResultVecOfVec = append(nttResultVecOfVec, clone) + + // Call the ntt_bn254 function + NttBN254(&nttResultVecOfVec[i], false, NONE, 0) + } + + assert.NotEqual(t, nttBatchResult, scalars) + + // Check that the ntt of each vec of scalars is equal to the intt of the specific batch + for i := 0; i < batches; i++ { + if !reflect.DeepEqual(nttResultVecOfVec[i], nttBatchResult[i*count:((i+1)*count)]) { + t.Errorf("ntt of vec of scalars not equal to intt of specific batch") + } + } +} + +func BenchmarkNTT(b *testing.B) { + LOG_NTT_SIZES := []int{12, 15, 20, 21, 22, 23, 24, 25, 26} + + for _, logNTTSize := range LOG_NTT_SIZES { + nttSize := 1 << logNTTSize + b.Run(fmt.Sprintf("NTT %d", logNTTSize), func(b *testing.B) { + scalars, _ := GenerateScalars(nttSize) + + nttResult := make([]ScalarField, len(scalars)) // Make a new slice with the same length + copy(nttResult, scalars) + for n := 0; n < b.N; n++ { + NttBN254(&nttResult, false, NONE, 0) + } + }) + } +} diff --git a/goicicle/curves/bn254/utils.go b/goicicle/curves/bn254/utils.go new file mode 100644 index 000000000..5186e79bf --- /dev/null +++ b/goicicle/curves/bn254/utils.go @@ -0,0 +1,34 @@ +package bn254 + +import ( + "encoding/binary" + "log" + "time" +) + +// Function to convert [8]uint32 to [4]uint64 +func ConvertUint32ArrToUint64Arr(arr32 [8]uint32) [4]uint64 { + var arr64 [4]uint64 + for i := 0; i < len(arr32); i += 2 { + arr64[i/2] = (uint64(arr32[i]) << 32) | uint64(arr32[i+1]) + } + return arr64 +} + +func ConvertUint64ArrToUint32Arr(arr64 [4]uint64) [8]uint32 { + var arr32 [8]uint32 + for i, v := range arr64 { + b := make([]byte, 8) + binary.LittleEndian.PutUint64(b, v) + + arr32[i*2] = binary.LittleEndian.Uint32(b[0:4]) + arr32[i*2+1] = binary.LittleEndian.Uint32(b[4:8]) + } + + return arr32 +} + +func TimeTrack(start time.Time, name string) { + elapsed := time.Since(start) + log.Printf("%s took %s", name, elapsed) +} diff --git a/goicicle/curves/bn254/utils_test.go b/goicicle/curves/bn254/utils_test.go new file mode 100644 index 000000000..d9f555260 --- /dev/null +++ b/goicicle/curves/bn254/utils_test.go @@ -0,0 +1,81 @@ +package bn254 + +import ( + "testing" +) + +func TestConvertUint32ArrToUint64Arr(t *testing.T) { + testCases := []struct { + name string + input [8]uint32 + want [4]uint64 + }{ + { + name: "Test with incremental array", + input: [8]uint32{1, 2, 3, 4, 5, 6, 7, 8}, + want: [4]uint64{4294967298, 12884901892, 21474836486, 30064771080}, + }, + { + name: "Test with all zeros", + input: [8]uint32{0, 0, 0, 0, 0, 0, 0, 0}, + want: [4]uint64{0, 0, 0, 0}, + }, + { + name: "Test with maximum uint32 values", + input: [8]uint32{4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295, 4294967295}, + want: [4]uint64{18446744073709551615, 18446744073709551615, 18446744073709551615, 18446744073709551615}, + }, + { + name: "Test with alternating min and max uint32 values", + input: [8]uint32{0, 4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295}, + want: [4]uint64{4294967295, 4294967295, 4294967295, 4294967295}, + }, + { + name: "Test with alternating max and min uint32 values", + input: [8]uint32{4294967295, 0, 4294967295, 0, 4294967295, 0, 4294967295, 0}, + want: [4]uint64{18446744069414584320, 18446744069414584320, 18446744069414584320, 
18446744069414584320}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint32ArrToUint64Arr(tc.input) + if got != tc.want { + t.Errorf("got %v, want %v", got, tc.want) + } + }) + } +} + +func TestConvertUint64ArrToUint32Arr(t *testing.T) { + testCases := []struct { + name string + input [4]uint64 + expected [8]uint32 + }{ + { + name: "test one", + input: [4]uint64{1, 2, 3, 4}, + expected: [8]uint32{1, 0, 2, 0, 3, 0, 4, 0}, + }, + { + name: "test two", + input: [4]uint64{100, 200, 300, 400}, + expected: [8]uint32{100, 0, 200, 0, 300, 0, 400, 0}, + }, + { + name: "test three", + input: [4]uint64{1000, 2000, 3000, 4000}, + expected: [8]uint32{1000, 0, 2000, 0, 3000, 0, 4000, 0}, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + got := ConvertUint64ArrToUint32Arr(tc.input) + if got != tc.expected { + t.Errorf("got %v, want %v", got, tc.expected) + } + }) + } +} diff --git a/goicicle/curves/bn254/vec_mod.go b/goicicle/curves/bn254/vec_mod.go new file mode 100644 index 000000000..348e445e6 --- /dev/null +++ b/goicicle/curves/bn254/vec_mod.go @@ -0,0 +1,41 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +package bn254 + +// #cgo CFLAGS: -I../../../icicle/curves/bn254/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ -lbn254 +// #include "ve_mod_mult.h" +import "C" +import ( + "fmt" + "unsafe" +) + +func VecScalarMulMod(scalarVec1, scalarVec2 unsafe.Pointer, size int) int { + scalarVec1C := (*C.BN254_scalar_t)(scalarVec1) + scalarVec2C := (*C.BN254_scalar_t)(scalarVec2) + sizeC := C.size_t(size) + + ret := C.vec_mod_mult_device_scalar_bn254(scalarVec1C, scalarVec2C, sizeC, 0) + + if ret != 0 { + fmt.Print("error multiplying scalar vectors") + return -1 + } + + return 0 +} diff --git a/goicicle/go.mod b/goicicle/go.mod new file mode 100644 index 000000000..13f279ad4 --- /dev/null +++ b/goicicle/go.mod @@ -0,0 +1,20 @@ +module github.com/ingonyama-zk/icicle/goicicle + +go 1.20 + +require github.com/consensys/gnark-crypto v0.11.0 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) + +require ( + github.com/bits-and-blooms/bitset v1.5.0 // indirect + github.com/consensys/bavard v0.1.13 + github.com/mmcloughlin/addchain v0.4.0 // indirect + github.com/stretchr/testify v1.8.3 + golang.org/x/sys v0.2.0 // indirect + rsc.io/tmplfunc v0.0.3 // indirect +) diff --git a/goicicle/go.sum b/goicicle/go.sum new file mode 100644 index 000000000..91618224f --- /dev/null +++ b/goicicle/go.sum @@ -0,0 +1,25 @@ +github.com/bits-and-blooms/bitset v1.5.0 h1:NpE8frKRLGHIcEzkR+gZhiioW1+WbYV6fKwD6ZIpQT8= +github.com/bits-and-blooms/bitset v1.5.0/go.mod h1:gIdJ4wp64HaoK2YrL1Q5/N7Y16edYb8uY+O0FJTyyDA= +github.com/consensys/bavard v0.1.13 h1:oLhMLOFGTLdlda/kma4VOJazblc7IM5y5QPd2A/YjhQ= +github.com/consensys/bavard v0.1.13/go.mod 
h1:9ItSMtA/dXMAiL7BG6bqW2m3NdSEObYWoH223nGHukI= +github.com/consensys/gnark-crypto v0.11.0 h1:QqzHQlwEqlQr5jfWblGDkwlKHpT+4QodYqqExkAtyks= +github.com/consensys/gnark-crypto v0.11.0/go.mod h1:Iq/P3HHl0ElSjsg2E1gsMwhAyxnxoKK5nVyZKd+/KhU= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/google/subcommands v1.2.0/go.mod h1:ZjhPrFU+Olkh9WazFPsl27BQ4UPiG37m3yTrtFlrHVk= +github.com/leanovate/gopter v0.2.9 h1:fQjYxZaynp97ozCzfOyOuAGOU4aU/z37zf/tOujFk7c= +github.com/mmcloughlin/addchain v0.4.0 h1:SobOdjm2xLj1KkXN5/n0xTIWyZA2+s99UCY1iPfkHRY= +github.com/mmcloughlin/addchain v0.4.0/go.mod h1:A86O+tHqZLMNO4w6ZZ4FlVQEadcoqkyU72HC5wJ4RlU= +github.com/mmcloughlin/profile v0.1.1/go.mod h1:IhHD7q1ooxgwTgjxQYkACGA77oFTDdFVejUS1/tS/qU= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY= +github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +golang.org/x/sys v0.2.0 h1:ljd4t30dBnAvMZaQCevtY0xLLD0A+bRZXbgLMLU1F/A= +golang.org/x/sys v0.2.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +rsc.io/tmplfunc v0.0.3 h1:53XFQh69AfOa8Tw0Jm7t+GV7KZhOi6jzsCzTtKbMvzU= +rsc.io/tmplfunc v0.0.3/go.mod h1:AG3sTPzElb1Io3Yg4voV9AGZJuleGAwaVRxL9M49PhA= diff --git a/goicicle/goicicle.go b/goicicle/goicicle.go new file mode 100644 index 000000000..834948304 --- /dev/null +++ b/goicicle/goicicle.go @@ -0,0 +1,58 @@ +package goicicle + +// This file implements CUDA driver context management + +// #cgo CFLAGS: -I /usr/loca/cuda/include +// #cgo LDFLAGS: -L/usr/local/cuda/lib64 -lcudart +/* +#include +#include +*/ +import "C" + +import ( + "errors" + "unsafe" +) + +// Version returns the version of the CUDA driver +// func Version() int { +// var v C.int +// if err := C.cuDriverGetVersion(&v); err != 0 { +// return -1 +// } +// return int(v) +// } + +func CudaMalloc(size int) (dp unsafe.Pointer, err error) { + var p C.void + dp = unsafe.Pointer(&p) + if err := C.cudaMalloc(&dp, C.size_t(size)); err != 0 { + return nil, errors.New("could not create memory space") + } + return dp, nil +} + +func CudaFree(dp unsafe.Pointer) int { + if err := C.cudaFree(dp); err != 0 { + return -1 + } + return 0 +} + +func CudaMemCpyHtoD[T any](dst_d unsafe.Pointer, src []T, size int) int { + src_c := unsafe.Pointer(&src[0]) + if err := C.cudaMemcpy(dst_d, src_c, C.size_t(size), 1); err != 0 { + return -1 + } + return 0 +} + +func CudaMemCpyDtoH[T any](dst []T, src_d unsafe.Pointer, size int) int { + dst_c := unsafe.Pointer(&dst[0]) + + if err := C.cudaMemcpy(dst_c, src_d, C.size_t(size), 2); err != 0 { + return -1 + } + return 0 +} diff --git a/goicicle/templates/curves/curves.go b/goicicle/templates/curves/curves.go new file mode 100644 index 000000000..e56dbbace --- /dev/null +++ b/goicicle/templates/curves/curves.go @@ -0,0 +1,37 @@ +package config + +type Curve struct { + CurveName string + PackageName string 
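+	// SharedLib is the linker flag for the curve's prebuilt shared library (e.g. "-lbn254"),
+	// Prefix is the suffix used in the generated C symbol names (msm_cuda_<Prefix>, ntt_cuda_<Prefix>, ...),
+	// and ScalarSize/BaseSize give the number of uint32 limbs in a scalar / base-field element.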
+ SharedLib string + Prefix string + ScalarSize int + BaseSize int +} + +var BN_254 = Curve{ + CurveName: "BN254", + PackageName: "bn254", + SharedLib: "-lbn254", + Prefix: "bn254", + ScalarSize: 8, + BaseSize: 8, +} + +var BLS_12_377 = Curve{ + CurveName: "BLS12377", + PackageName: "bls12377", + SharedLib: "-lbn12_377", + Prefix: "bls12_377", + ScalarSize: 8, + BaseSize: 12, +} + +var BLS_12_381 = Curve{ + CurveName: "BLS12381", + PackageName: "bls12381", + SharedLib: "-lbn12_381", + Prefix: "bls12_381", + ScalarSize: 8, + BaseSize: 12, +} diff --git a/goicicle/templates/curves/g1.go.tmpl b/goicicle/templates/curves/g1.go.tmpl new file mode 100644 index 000000000..db7e7e8e4 --- /dev/null +++ b/goicicle/templates/curves/g1.go.tmpl @@ -0,0 +1,469 @@ +import ( + "unsafe" + + "encoding/binary" + "fmt" + + {{ template "import_ecc" . }} + {{ template "import_fp" . }} + {{ template "import_fr" . }} +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/{{.PackageName}}/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ {{.SharedLib}} +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +const SCALAR_SIZE = {{.ScalarSize}} +const BASE_SIZE = {{.BaseSize}} + +type ScalarField struct { + s [SCALAR_SIZE]uint32 +} + +type BaseField struct { + s [BASE_SIZE]uint32 +} + +type Field interface { + toGnarkFr() *fr.Element +} + +/* + * Common Constrctors + */ + +func NewFieldZero[T BaseField | ScalarField]() *T { + var field T + + return &field +} + +func NewFieldFromFrGnark[T BaseField | ScalarField](element fr.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +func NewFieldFromFpGnark[T BaseField | ScalarField](element fp.Element) *T { + s := ConvertUint64ArrToUint32Arr(element.Bits()) // get non-montgomry + + return &T{s} +} + +/* + * BaseField Constrctors + */ + +func NewBaseFieldOne() *BaseField { + var s [BASE_SIZE]uint32 + + s[0] = 1 + + return &BaseField{s} +} + +func BaseFieldFromLimbs(limbs [BASE_SIZE]uint32) *BaseField { + bf := NewFieldZero[BaseField]() + copy(bf.s[:], limbs[:]) + + return bf +} + +/* + * BaseField methods + */ + +func (f *BaseField) limbs() [BASE_SIZE]uint32 { + return f.s +} + +func (f *BaseField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f *BaseField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +func (f *BaseField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * ScalarField methods + */ + +func NewScalarFieldOne() *ScalarField { + var s [SCALAR_SIZE]uint32 + + s[0] = 1 + + return &ScalarField{s} +} + +/* + * ScalarField methods + */ + +func (f *ScalarField) limbs() [SCALAR_SIZE]uint32 { + return f.s +} + +func (f *ScalarField) toBytesLe() []byte { + bytes := make([]byte, len(f.s)*4) + for i, v := range f.s { + binary.LittleEndian.PutUint32(bytes[i*4:], v) + } + + return bytes +} + +func (f ScalarField) toGnarkFr() *fr.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fr.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert 
point %v got error %v", f, e)) + } + + return &v +} + +func (f *ScalarField) toGnarkFp() *fp.Element { + fb := f.toBytesLe() + var b32 [32]byte + copy(b32[:], fb[:32]) + + v, e := fp.LittleEndian.Element(&b32) + + if e != nil { + panic(fmt.Sprintf("unable to create convert point %v got error %v", f, e)) + } + + return &v +} + +/* + * Point{{.CurveName}} + */ + +type Point{{.CurveName}} struct { + x, y, z BaseField +} + +func NewPoint{{.CurveName}}Zero() *Point{{.CurveName}} { + return &Point{{.CurveName}}{ + x: *NewFieldZero[BaseField](), + y: *NewBaseFieldOne(), + z: *NewFieldZero[BaseField](), + } +} + +func (p *Point{{.CurveName}}) eq(pCompare *Point{{.CurveName}}) bool { + // Cast *Point{{.CurveName}} to *C.{{.CurveName}}_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. + return bool(C.eq_{{.PackageName}}(pC, pCompareC)) +} + +func (p *Point{{.CurveName}}) strip_z() *PointAffineNoInfinity{{.CurveName}} { + return &PointAffineNoInfinity{{.CurveName}}{ + x: p.x, + y: p.y, + } +} + +func (p *Point{{.CurveName}}) toGnarkAffine() *{{.PackageName}}.G1Affine { + px := p.x.toGnarkFp() + py := p.y.toGnarkFp() + pz := p.z.toGnarkFp() + + zInv := new(fp.Element) + x := new(fp.Element) + y := new(fp.Element) + + zInv.Inverse(pz) + + x.Mul(px, zInv) + y.Mul(py, zInv) + + return &{{.PackageName}}.G1Affine{X: *x, Y: *y} +} + +func (p *Point{{.CurveName}}) ToGnarkJac() *{{.PackageName}}.G1Jac { + var p1 {{.PackageName}}.G1Jac + p1.FromAffine(p.toGnarkAffine()) + + return &p1 +} + +func Point{{.CurveName}}FromG1AffineGnark(gnark *{{.PackageName}}.G1Affine) *Point{{.CurveName}} { + point := Point{{.CurveName}}{ + x: *NewFieldFromFpGnark[BaseField](gnark.X), + y: *NewFieldFromFpGnark[BaseField](gnark.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +// converts jac fromat to projective +func Point{{.CurveName}}FromJacGnark(gnark *{{.PackageName}}.G1Jac) *Point{{.CurveName}} { + var pointAffine {{.PackageName}}.G1Affine + pointAffine.FromJacobian(gnark) + + point := Point{{.CurveName}}{ + x: *NewFieldFromFpGnark[BaseField](pointAffine.X), + y: *NewFieldFromFpGnark[BaseField](pointAffine.Y), + z: *NewBaseFieldOne(), + } + + return &point +} + +func Point{{.CurveName}}fromLimbs(x, y, z *[]uint32) *Point{{.CurveName}} { + return &Point{{.CurveName}}{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + z: *BaseFieldFromLimbs(getFixedLimbs(z)), + } +} + +/* + * PointAffineNoInfinity{{.CurveName}} + */ + +type PointAffineNoInfinity{{.CurveName}} struct { + x, y BaseField +} + +func NewPointAffineNoInfinity{{.CurveName}}Zero() *PointAffineNoInfinity{{.CurveName}} { + return &PointAffineNoInfinity{{.CurveName}}{ + x: *NewFieldZero[BaseField](), + y: *NewFieldZero[BaseField](), + } +} + +func (p *PointAffineNoInfinity{{.CurveName}}) toProjective() *Point{{.CurveName}} { + return &Point{{.CurveName}}{ + x: p.x, + y: p.y, + z: *NewBaseFieldOne(), + } +} + +func (p *PointAffineNoInfinity{{.CurveName}}) toGnarkAffine() *{{.PackageName}}.G1Affine { + return p.toProjective().toGnarkAffine() +} + +func 
PointAffineNoInfinity{{.CurveName}}FromLimbs(x, y *[]uint32) *PointAffineNoInfinity{{.CurveName}} { + return &PointAffineNoInfinity{{.CurveName}}{ + x: *BaseFieldFromLimbs(getFixedLimbs(x)), + y: *BaseFieldFromLimbs(getFixedLimbs(y)), + } +} + +/* + * Multiplication + */ + +func MultiplyVec(a []Point{{.CurveName}}, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + pointsC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(&a[0])) + scalarsC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_point_{{.PackageName}}(pointsC, scalarsC, nElementsC, deviceIdC) +} + +func MultiplyScalar(a []ScalarField, b []ScalarField, deviceID int) { + if len(a) != len(b) { + panic("a and b have different lengths") + } + + aC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&b[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.vec_mod_mult_scalar_{{.PackageName}}(aC, bC, nElementsC, deviceIdC) +} + +// Multiply a matrix by a scalar: +// +// `a` - flattenned matrix; +// `b` - vector to multiply `a` by; +func MultiplyMatrix(a []ScalarField, b []ScalarField, deviceID int) { + c := make([]ScalarField, len(b)) + for i := range c { + c[i] = *NewFieldZero[ScalarField]() + } + + aC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&a[0])) + bC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&b[0])) + cC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&c[0])) + deviceIdC := C.size_t(deviceID) + nElementsC := C.size_t(len(a)) + + C.matrix_vec_mod_mult_{{.PackageName}}(aC, bC, cC, nElementsC, deviceIdC) +} + +/* + * Utils + */ + +func getFixedLimbs(slice *[]uint32) [BASE_SIZE]uint32 { + if len(*slice) <= BASE_SIZE { + limbs := [BASE_SIZE]uint32{} + copy(limbs[:len(*slice)], *slice) + return limbs + } + + panic("slice has too many elements") +} + +func BatchConvertFromFrGnark[T BaseField | ScalarField](elements []fr.Element) []T { + var newElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertFromFrGnarkThreaded[T BaseField | ScalarField](elements []fr.Element, routines int) []T { + var newElements []T + + if routines > 1 { + channels := make([]chan []T, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []T, 1) + } + + convert := func(elements []fr.Element, chanIndex int) { + var convertedElements []T + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) 
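+			// Channels are drained in index order, so the converted elements come back
+			// in their original order. Note that batchLen*routines can be smaller than
+			// len(elements); any trailing remainder is not converted by this split.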
+ } + } else { + for _, e := range elements { + converted := NewFieldFromFrGnark[T](e) + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertToFrGnark[T Field](elements []T) []fr.Element { + var newElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + + return newElements +} + +func BatchConvertToFrGnarkThreaded[T Field](elements []T, routines int) []fr.Element { + var newElements []fr.Element + + if routines > 1 { + channels := make([]chan []fr.Element, routines) + for i := 0; i < routines; i ++ { + channels[i] = make(chan []fr.Element, 1) + } + + convert := func(elements []T, chanIndex int) { + var convertedElements []fr.Element + for _, e := range elements { + converted := e.toGnarkFr() + convertedElements = append(convertedElements, *converted) + } + + channels[chanIndex] <- convertedElements + } + + batchLen := len(elements)/routines + for i := 0; i < routines; i ++ { + elemsToConv := elements[batchLen*i:batchLen*(i+1)] + go convert(elemsToConv, i) + } + + for i := 0; i < routines; i ++ { + newElements = append(newElements, <-channels[i]...) + } + } else { + for _, e := range elements { + converted := e.toGnarkFr() + newElements = append(newElements, *converted) + } + } + + return newElements +} + +func BatchConvertFromG1Affine(elements []{{.PackageName}}.G1Affine) []PointAffineNoInfinity{{.CurveName}} { + var newElements []PointAffineNoInfinity{{.CurveName}} + for _, e := range elements { + newElement := Point{{.CurveName}}FromG1AffineGnark(&e).strip_z() + newElements = append(newElements, *newElement) + } + return newElements +} diff --git a/goicicle/templates/curves/g2.go.tmpl b/goicicle/templates/curves/g2.go.tmpl new file mode 100644 index 000000000..89f736b6c --- /dev/null +++ b/goicicle/templates/curves/g2.go.tmpl @@ -0,0 +1,83 @@ +import ( + "unsafe" + {{ template "import_ecc" . }} +) + +// #cgo CFLAGS: -I${SRCDIR}/icicle/curves/{{toLower .CurveName}}/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ {{.SharedLib}} +// #include "c_api.h" +// #include "ve_mod_mult.h" +import "C" + +func BatchConvertFromG2Affine(elements []{{.PackageName}}.G2Affine) []G2PointAffine { + var newElements []G2PointAffine + for _, gg2Affine := range elements { + var newElement G2PointAffine + newElement.FromGnarkAffine(&gg2Affine) + + newElements = append(newElements, newElement) + } + return newElements +} + +// G2 extension field + +type G2Element [4]uint64 + +type ExtentionField struct { + A0, A1 G2Element +} + +type G2PointAffine struct { + x, y ExtentionField +} + +type G2Point struct { + x, y, z ExtentionField +} + +func (p *G2Point) eqg2(pCompare *G2Point) bool { + // Cast *Point{{.CurveName}} to *C.{{.CurveName}}_projective_t + // The unsafe.Pointer cast is necessary because Go doesn't allow direct casts + // between different pointer types. + // It's your responsibility to ensure that the types are compatible. + pC := (*C.{{.CurveName}}_g2_projective_t)(unsafe.Pointer(p)) + pCompareC := (*C.{{.CurveName}}_g2_projective_t)(unsafe.Pointer(pCompare)) + + // Call the C function + // The C function doesn't keep any references to the data, + // so it's fine if the Go garbage collector moves or deletes the data later. 
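+	// Layout assumption: G2Point (three ExtentionField coordinates, each holding two
+	// 4x64-bit limb arrays) is expected to match the memory layout of
+	// C.{{.CurveName}}_g2_projective_t; the casts above are only safe if the two layouts agree.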
+ return bool(C.eq_g2_{{.PackageName}}(pC, pCompareC)) +} + +func (p *G2PointAffine) ToProjective() G2Point { + return G2Point{ + x: p.x, + y: p.y, + z: ExtentionField{ + A0: G2Element{1, 0, 0, 0}, + A1: G2Element{0, 0, 0, 0}, + }, + } +} + +func (g *G2PointAffine) FromGnarkAffine(gnark *{{.PackageName}}.G2Affine) *G2PointAffine { + g.x.A0 = gnark.X.A0.Bits() + g.x.A1 = gnark.X.A1.Bits() + g.y.A0 = gnark.Y.A0.Bits() + g.y.A1 = gnark.Y.A1.Bits() + + return g +} + +func (g *G2PointAffine) FromGnarkJac(gnark *{{.PackageName}}.G2Jac) *G2PointAffine { + var pointAffine {{.PackageName}}.G2Affine + pointAffine.FromJacobian(gnark) + + g.x.A0 = pointAffine.X.A0.Bits() + g.x.A1 = pointAffine.X.A1.Bits() + g.y.A0 = pointAffine.Y.A0.Bits() + g.y.A1 = pointAffine.Y.A1.Bits() + + return g +} diff --git a/goicicle/templates/curves/imports.go.tmpl b/goicicle/templates/curves/imports.go.tmpl new file mode 100644 index 000000000..d04d0acea --- /dev/null +++ b/goicicle/templates/curves/imports.go.tmpl @@ -0,0 +1,34 @@ +{{ define "import_fr" }} + +{{ if eq .CurveName "BN254"}} + "github.com/consensys/gnark-crypto/ecc/bn254/fr" +{{ else if eq .CurveName "BLS12377"}} + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" +{{ else if eq .CurveName "BLS12381"}} + "github.com/consensys/gnark-crypto/ecc/bls12-381/fr" +{{end}} + +{{end}} + +{{ define "import_fp" }} +{{ if eq .CurveName "BN254"}} + "github.com/consensys/gnark-crypto/ecc/bn254/fp" +{{ else if eq .CurveName "BLS12377"}} + "github.com/consensys/gnark-crypto/ecc/bls12-377/fp" +{{ else if eq .CurveName "BLS12381"}} + "github.com/consensys/gnark-crypto/ecc/bls12-381/fp" +{{end}} + +{{end}} + +{{ define "import_ecc" }} + +{{ if eq .CurveName "BN254"}} + "github.com/consensys/gnark-crypto/ecc/bn254" +{{ else if eq .CurveName "BLS12377"}} + "github.com/consensys/gnark-crypto/ecc/bls12-377" +{{ else if eq .CurveName "BLS12381"}} + "github.com/consensys/gnark-crypto/ecc/bls12-381" +{{end}} + +{{end}} diff --git a/goicicle/templates/hfiles/c_api.h.tmpl b/goicicle/templates/hfiles/c_api.h.tmpl new file mode 100644 index 000000000..e0a5ea7d2 --- /dev/null +++ b/goicicle/templates/hfiles/c_api.h.tmpl @@ -0,0 +1,15 @@ +#include +#include +// c_api.h + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct {{.CurveName}}_projective_t {{.CurveName}}_projective_t; + +bool eq_{{.Prefix}}({{.CurveName}}_projective_t *point1, {{.CurveName}}_projective_t *point2, size_t device_id); + +#ifdef __cplusplus +} +#endif diff --git a/goicicle/templates/hfiles/msm.h.tmpl b/goicicle/templates/hfiles/msm.h.tmpl new file mode 100644 index 000000000..50d01abab --- /dev/null +++ b/goicicle/templates/hfiles/msm.h.tmpl @@ -0,0 +1,35 @@ +#include +#include +// msm.h + +#ifndef _{{.CurveName}}_MSM_H +#define _{{.CurveName}}_MSM_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of {{.CurveName}} projective and affine structs +typedef struct {{.CurveName}}_projective_t {{.CurveName}}_projective_t; +typedef struct {{.CurveName}}_affine_t {{.CurveName}}_affine_t; +typedef struct {{.CurveName}}_scalar_t {{.CurveName}}_scalar_t; + +int msm_cuda_{{.Prefix}}({{.CurveName}}_projective_t* out, {{.CurveName}}_affine_t* points, + {{.CurveName}}_scalar_t* scalars, size_t count, size_t device_id); + +int msm_batch_cuda_{{.Prefix}}({{.CurveName}}_projective_t* out, {{.CurveName}}_affine_t* points, + {{.CurveName}}_scalar_t* scalars, size_t batch_size, + size_t msm_size, size_t device_id); + +int commit_cuda_{{.Prefix}}({{.CurveName}}_projective_t* d_out, 
{{.CurveName}}_scalar_t* d_scalars, + {{.CurveName}}_affine_t* d_points, size_t count, size_t device_id); + +int commit_batch_cuda_{{.Prefix}}({{.CurveName}}_projective_t* d_out, {{.CurveName}}_scalar_t* d_scalars, + {{.CurveName}}_affine_t* d_points, size_t count, + size_t batch_size, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _{{.CurveName}}_MSM_H */ diff --git a/goicicle/templates/hfiles/ntt.h.tmpl b/goicicle/templates/hfiles/ntt.h.tmpl new file mode 100644 index 000000000..004d8cdf4 --- /dev/null +++ b/goicicle/templates/hfiles/ntt.h.tmpl @@ -0,0 +1,27 @@ +#include +#include +// ntt.h + +#ifndef _{{.CurveName}}_NTT_H +#define _{{.CurveName}}_NTT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of {{.CurveName}} projective and affine structs +typedef struct {{.CurveName}}_projective_t {{.CurveName}}_projective_t; +typedef struct {{.CurveName}}_affine_t {{.CurveName}}_affine_t; +typedef struct {{.CurveName}}_scalar_t {{.CurveName}}_scalar_t; + +int ntt_cuda_{{.Prefix}}({{.CurveName}}_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id); +int ntt_batch_cuda_{{.Prefix}}({{.CurveName}}_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +int ecntt_cuda_{{.Prefix}}({{.CurveName}}_projective_t *arr, uint32_t n, bool inverse, size_t device_id); +int ecntt_batch_cuda_{{.Prefix}}({{.CurveName}}_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _{{.CurveName}}_NTT_H */ diff --git a/goicicle/templates/hfiles/ve_mod_mult.h.tmpl b/goicicle/templates/hfiles/ve_mod_mult.h.tmpl new file mode 100644 index 000000000..01d1241f2 --- /dev/null +++ b/goicicle/templates/hfiles/ve_mod_mult.h.tmpl @@ -0,0 +1,24 @@ +#include +#include +// ve_mod_mult.h + +#ifndef _{{.CurveName}}_VEC_MULT_H +#define _{{.CurveName}}_VEC_MULT_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct {{.CurveName}}_projective_t {{.CurveName}}_projective_t; +typedef struct {{.CurveName}}_scalar_t {{.CurveName}}_scalar_t; + +int32_t vec_mod_mult_point_{{.Prefix}}({{.CurveName}}_projective_t *inout, {{.CurveName}}_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_scalar_{{.Prefix}}({{.CurveName}}_scalar_t *inout, {{.CurveName}}_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t matrix_vec_mod_mult_{{.Prefix}}({{.CurveName}}_scalar_t *matrix_flattened, {{.CurveName}}_scalar_t *input, {{.CurveName}}_scalar_t *output, size_t n_elments, size_t device_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* _{{.CurveName}}_VEC_MULT_H */ diff --git a/goicicle/templates/main.go b/goicicle/templates/main.go new file mode 100644 index 000000000..712f3e0a4 --- /dev/null +++ b/goicicle/templates/main.go @@ -0,0 +1,161 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/consensys/bavard" + config "github.com/ingonyama-zk/icicle/goicicle/templates/curves" +) + +const ( + copyrightHolder = "Ingonyama" + generatedBy = "Ingonyama" + copyrightYear = 2023 + baseDir = "../curves/" + hBaseDir = "../../icicle/curves/" +) + +var bgen = bavard.NewBatchGenerator(copyrightHolder, copyrightYear, generatedBy) + +func main() { + bn254_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bn254", "g1.go"), Templates: []string{"g1.go.tmpl", "imports.go.tmpl"}}, + } + + bls12377_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bls12377", "g1.go"), Templates: 
[]string{"g1.go.tmpl", "imports.go.tmpl"}}, + } + + bls12381_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bls12381", "g1.go"), Templates: []string{"g1.go.tmpl", "imports.go.tmpl"}}, + } + + assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_entries...)) + assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_entries...)) + assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_entries...)) + + bn254_g2_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bn254", "g2.go"), Templates: []string{"g2.go.tmpl", "imports.go.tmpl"}}, + } + + bls12377_g2_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bls12377", "g2.go"), Templates: []string{"g2.go.tmpl", "imports.go.tmpl"}}, + } + + bls12381_g2_entries := []bavard.Entry{ + {File: filepath.Join(baseDir, "bls12381", "g2.go"), Templates: []string{"g2.go.tmpl", "imports.go.tmpl"}}, + } + + assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./curves/", bls12377_g2_entries...)) + assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./curves/", bn254_g2_entries...)) + assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./curves/", bls12381_g2_entries...)) + + //bn254_msm_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bn254", "msm.go"), Templates: []string{"msm.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //bls12377_msm_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bls12377", "msm.go"), Templates: []string{"msm.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //bls12381_msm_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bls12381", "msm.go"), Templates: []string{"msm.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./msm/", bls12377_msm_entries...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./msm/", bn254_msm_entries...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./msm/", bls12381_msm_entries...)) + // + //bn254_ntt_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bn254", "ntt.go"), Templates: []string{"ntt.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //bls12377_ntt_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bls12377", "ntt.go"), Templates: []string{"ntt.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //bls12381_ntt_entries := []bavard.Entry{ + // {File: filepath.Join(baseDir, "bls12381", "ntt.go"), Templates: []string{"ntt.go.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./ntt/", bls12377_ntt_entries...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./ntt/", bn254_ntt_entries...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./ntt/", bls12381_ntt_entries...)) + + /* + h_files + */ + + //h_msm_bn254 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bn254", "msm.h"), Templates: []string{"msm.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //h_msm_bls12_377 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_377", "msm.h"), Templates: []string{"msm.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //h_msm_bls12_381 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_381", 
"msm.h"), Templates: []string{"msm.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", h_msm_bls12_377...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", h_msm_bn254...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", h_msm_bls12_381...)) + // + //h_ntt_bn254 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bn254", "ntt.h"), Templates: []string{"ntt.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //h_ntt_bls12_377 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_377", "ntt.h"), Templates: []string{"ntt.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //h_ntt_bls12_381 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_381", "ntt.h"), Templates: []string{"ntt.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", h_ntt_bls12_377...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", h_ntt_bn254...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", h_ntt_bls12_381...)) + // + //ve_mod_mult_h_bn254 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bn254", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //ve_mod_mult_h_bls12_377 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_377", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //ve_mod_mult_ht_bls12_381 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_381", "ve_mod_mult.h"), Templates: []string{"ve_mod_mult.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", ve_mod_mult_h_bls12_377...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", ve_mod_mult_h_bn254...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", ve_mod_mult_ht_bls12_381...)) + // + //c_api_bn254 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bn254", "c_api.h"), Templates: []string{"c_api.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //c_api_bls12_377 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_377", "c_api.h"), Templates: []string{"c_api.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //c_api_bls12_381 := []bavard.Entry{ + // {File: filepath.Join(hBaseDir, "bls12_381", "c_api.h"), Templates: []string{"c_api.h.tmpl", "../curves/imports.go.tmpl"}}, + //} + // + //assertNoError(bgen.Generate(config.BLS_12_377, config.BLS_12_377.PackageName, "./hfiles/", c_api_bls12_377...)) + //assertNoError(bgen.Generate(config.BN_254, config.BN_254.PackageName, "./hfiles/", c_api_bn254...)) + //assertNoError(bgen.Generate(config.BLS_12_381, config.BLS_12_381.PackageName, "./hfiles/", c_api_bls12_381...)) +} + +func assertNoError(err error) { + if err != nil { + fmt.Printf("\n%s\n", err.Error()) + os.Exit(-1) + } +} diff --git a/goicicle/templates/msm/msm.go.tmpl b/goicicle/templates/msm/msm.go.tmpl new file mode 100644 index 000000000..1840e789f --- /dev/null +++ b/goicicle/templates/msm/msm.go.tmpl @@ -0,0 +1,71 @@ +import ( + "errors" + "fmt" + "unsafe" +) + +// #cgo CFLAGS: -I../../../icicle/curves/{{toLower .CurveName}}/ +// #cgo LDFLAGS: 
-L${SRCDIR}/../../ {{.SharedLib}} +// #include "msm.h" +import "C" + +func Msm{{.CurveName}}(points []PointAffineNoInfinity{{.CurveName}}, scalars []ScalarField, device_id int) (*Point{{.CurveName}}, error) { + if len(points) != len(scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + out := new(Point{{.CurveName}}) + + pointsC := (*C.{{.CurveName}}_affine_t)(unsafe.Pointer(&points[0])) + scalarsC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&scalars[0])) + outC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(out)) + + ret := C.msm_cuda_{{.Prefix}}(outC, pointsC, scalarsC, C.size_t(len(points)), C.size_t(device_id)) + + if ret != 0 { + return nil, fmt.Errorf("msm_cuda_{{.Prefix}} returned error code: %d", ret) + } + + return out, nil +} + +func MsmBatch{{.CurveName}}(points *[]PointAffineNoInfinity{{.CurveName}}, scalars *[]ScalarField, batchSize, deviceId int) ([]*Point{{.CurveName}}, error) { + // Check for nil pointers + if points == nil || scalars == nil { + return nil, errors.New("points or scalars is nil") + } + + if len(*points) != len(*scalars) { + return nil, errors.New("error on: len(points) != len(scalars)") + } + + // Check for empty slices + if len(*points) == 0 || len(*scalars) == 0 { + return nil, errors.New("points or scalars is empty") + } + + // Check for zero batchSize + if batchSize <= 0 { + return nil, errors.New("error on: batchSize must be greater than zero") + } + + out := make([]*Point{{.CurveName}}, batchSize) + + for i := 0; i < len(out); i++ { + out[i] = NewPoint{{.CurveName}}Zero() + } + + outC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(&out[0])) + pointsC := (*C.{{.CurveName}}_affine_t)(unsafe.Pointer(&(*points)[0])) + scalarsC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + msmSizeC := C.size_t(len(*points) / batchSize) + deviceIdC := C.size_t(deviceId) + batchSizeC := C.size_t(batchSize) + + ret := C.msm_batch_cuda_{{.Prefix}}(outC, pointsC, scalarsC, batchSizeC, msmSizeC, deviceIdC) + if ret != 0 { + return nil, fmt.Errorf("msm_batch_cuda_{{.Prefix}} returned error code: %d", ret) + } + + return out, nil +} diff --git a/goicicle/templates/ntt/ntt.go.tmpl b/goicicle/templates/ntt/ntt.go.tmpl new file mode 100644 index 000000000..8896725f9 --- /dev/null +++ b/goicicle/templates/ntt/ntt.go.tmpl @@ -0,0 +1,54 @@ + +// #cgo CFLAGS: -I../../../icicle/curves//{{toLower .CurveName}}/ +// #cgo LDFLAGS: -L${SRCDIR}/../../ {{.SharedLib}} +// #include "ntt.h" +import "C" +import "unsafe" + +const ( + NONE = 0 + DIF = 1 + DIT = 2 +) + +func Ntt{{.CurveName}}(scalars *[]ScalarField, isInverse bool, decimation int, deviceId int) uint64 { + scalarsC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + + ret := C.ntt_cuda_{{.Prefix}}(scalarsC, C.uint32_t(len(*scalars)), C.bool(isInverse), C.size_t(decimation), C.size_t(deviceId)) + + return uint64(ret) +} + +func NttBatch{{.CurveName}}(scalars *[]ScalarField, isInverse bool, batchSize, deviceId int) uint64 { + scalarsC := (*C.{{.CurveName}}_scalar_t)(unsafe.Pointer(&(*scalars)[0])) + isInverseC := C.bool(isInverse) + batchSizeC := C.uint32_t(batchSize) + deviceIdC := C.size_t(deviceId) + + ret := C.ntt_batch_cuda_{{.Prefix}}(scalarsC, C.uint32_t(len(*scalars)), batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNtt{{.CurveName}}(values *[]Point{{.CurveName}}, isInverse bool, deviceId int) uint64 { + valuesC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := 
C.bool(isInverse) + n := C.uint32_t(len(*values)) + + ret := C.ecntt_cuda_{{.Prefix}}(valuesC, n, isInverseC, deviceIdC) + + return uint64(ret) +} + +func EcNttBatch{{.CurveName}}(values *[]Point{{.CurveName}}, isInverse bool, batchSize, deviceId int) uint64 { + valuesC := (*C.{{.CurveName}}_projective_t)(unsafe.Pointer(&(*values)[0])) + deviceIdC := C.size_t(deviceId) + isInverseC := C.bool(isInverse) + n := C.uint32_t(len(*values)) + batchSizeC := C.uint32_t(batchSize) + + ret := C.ecntt_batch_cuda_{{.Prefix}}(valuesC, n, batchSizeC, isInverseC, deviceIdC) + + return uint64(ret) +} diff --git a/icicle/CMakeLists.txt b/icicle/CMakeLists.txt index 4836df87a..b4c3f345b 100644 --- a/icicle/CMakeLists.txt +++ b/icicle/CMakeLists.txt @@ -1,8 +1,8 @@ -cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.18) # GoogleTest requires at least C++14 set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CUDA_STANDARD 14) +set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) set(CMAKE_CXX_STANDARD_REQUIRED TRUE) # add the target cuda architectures @@ -22,6 +22,10 @@ FetchContent_Declare( URL https://github.com/google/googletest/archive/refs/tags/v1.13.0.zip ) # For Windows: Prevent overriding the parent project's compiler/linker settings + +# boosting lib +include_directories("/home/miner/include/boost_1_80_0") + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) FetchContent_MakeAvailable(googletest) diff --git a/icicle/appUtils/msm/msm.cu b/icicle/appUtils/msm/msm.cu index 9bb6b4597..728f6a187 100644 --- a/icicle/appUtils/msm/msm.cu +++ b/icicle/appUtils/msm/msm.cu @@ -3,6 +3,7 @@ #pragma once #include #include +#include #include "../../primitives/affine.cuh" #include #include @@ -14,9 +15,385 @@ #include "../../primitives/field.cuh" #include "msm.cuh" +#define TEMP_NUM 10 +#define MAX_TH 256 -#define BIG_TRIANGLE +// #define SIGNED_DIG +// #define BIG_TRIANGLE +// #define ZPRIZE // #define SSM_SUM //WIP +// #define PHASE1_TEST + +#define SIZE 32 +#define SHMEM_SIZE 64 * 4 //why this size? + +// For last iteration (saves useless work) +// Use volatile to prevent caching in registers (compiler optimization) +// No __syncthreads() necessary! 
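The Go NTT wrapper generated from ntt.go.tmpl above is easiest to see in a short usage sketch. The snippet below is illustrative only: it assumes the BN254 instantiation of the templates (NttBN254, ScalarField, NONE and BatchConvertFromFrGnark, all of which appear elsewhere in this patch) and that libbn254.so has already been built and is on the loader path. It runs a forward and then an inverse NTT on device 0 and reports whether the input is restored.

package main

import (
	"fmt"
	"reflect"

	"github.com/consensys/gnark-crypto/ecc/bn254/fr"
	bn254icicle "github.com/ingonyama-zk/icicle/goicicle/curves/bn254"
)

func main() {
	size := 1 << 12

	// Random gnark scalars on the host, converted to the icicle limb representation.
	frScalars := make([]fr.Element, size)
	for i := range frScalars {
		frScalars[i].SetRandom()
	}
	scalars := bn254icicle.BatchConvertFromFrGnark[bn254icicle.ScalarField](frScalars)

	// Work on a copy so the original slice is kept for comparison.
	data := make([]bn254icicle.ScalarField, len(scalars))
	copy(data, scalars)

	// Forward NTT followed by inverse NTT, natural order (NONE decimation), device 0.
	bn254icicle.NttBN254(&data, false, bn254icicle.NONE, 0)
	bn254icicle.NttBN254(&data, true, bn254icicle.NONE, 0)

	fmt.Println("inverse NTT restored the input:", reflect.DeepEqual(data, scalars))
}

The batched variant generated from the same template (NttBatch...) follows the same pattern with an additional batchSize argument.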
+template +__device__ void warpReduce(P* shmem_ptr, int t, int first, int last) { + for (int i=first; i>last; i>>=1){ + shmem_ptr[t] = shmem_ptr[t] + shmem_ptr[t + i]; + } +} + +template +__global__ void general_sum_reduction_kernel(P *v, P *v_r, unsigned nof_partial_sums, unsigned write_stride, unsigned write_phase) { + // Allocate shared memory + __shared__ P partial_sum[SHMEM_SIZE]; //use memory allocation like coop groups + // int partial_sum[]; + + // Calculate thread ID + // int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // Load elements AND do first add of reduction + // Vector now 2x as long as number of threads, so scale i + int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; + + // Store first partial result instead of just the elements + partial_sum[threadIdx.x] = v[i] + v[i + blockDim.x]; + __syncthreads(); + + // Start at 1/2 block stride and divide by two each iteration + // Stop early (call device function instead) + for (int s = blockDim.x / 2; s > nof_partial_sums-1; s >>= 1) { + // Each thread does work unless it is further than the stride + if (threadIdx.x < s) { + partial_sum[threadIdx.x] = partial_sum[threadIdx.x] + partial_sum[threadIdx.x + s]; + } + __syncthreads(); + } + //todo - add device function + // if (threadIdx.x < 32) { + // warpReduce(partial_sum, threadIdx.x); + // } + + // Let the thread 0 for this block write it's result to main memory + // Result is inexed by this block + if (threadIdx.x < nof_partial_sums) { + unsigned write_ind = nof_partial_sums*blockIdx.x + threadIdx.x; + v_r[((write_ind/write_stride)*2 + write_phase)*write_stride + write_ind%write_stride] = partial_sum[threadIdx.x]; + } +} + +template +__global__ void single_stage_multi_reduction_kernel(P *v, P *v_r, unsigned block_size, unsigned write_stride, unsigned write_phase, unsigned padding) { + // Allocate shared memory + // __shared__ P partial_sum[SHMEM_SIZE]; //use memory allocation like coop groups + // int partial_sum[]; + + // Calculate thread ID + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int tid_p = padding? (tid/(2*padding))*padding + tid%padding: tid; + int jump =block_size/2; + int block_id = tid_p/jump; + int block_tid = tid_p%jump; + + // if (block_tid < jump){ + unsigned read_ind = block_size*block_id + block_tid; //fix + // unsigned padded_read_ind = block_size*block_id + block_tid; //fix + // unsigned write_ind = jump*block_id + block_tid; + unsigned write_ind = tid; + if (padding) printf(" %u %u %u %u\n",tid,tid_p,read_ind,((write_ind/write_stride)*2 + write_phase)*write_stride + write_ind%write_stride); + v_r[write_stride? ((write_ind/write_stride)*2 + write_phase)*write_stride + write_ind%write_stride : write_ind] = padding? (tid%(2*padding) +__global__ void variable_block_multi_reduction_kernel(P *v, P *v_r, unsigned *block_sizes, unsigned *block_offsets, unsigned write_stride, unsigned write_phase) { + // Allocate shared memory + // __shared__ P partial_sum[SHMEM_SIZE]; //use memory allocation like coop groups + // int partial_sum[]; + + // Calculate thread ID + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int jump =block_sizes[tid]/2; //???? 
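+  // Note: block_sizes[] and block_offsets[] are indexed per thread: each thread looks up
+  // the size and starting offset of the logical block it belongs to, adds the pair of
+  // elements that sit half a block apart (read_ind and read_ind + jump), and writes a
+  // single partial sum per thread at write_ind = block_offset/2 + block_tid (further
+  // interleaved by write_stride/write_phase when write_stride is non-zero).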
+ int block_offset = block_offsets[tid]; //block + int block_tid = tid - block_offset/2; //fix + + // if (block_tid < jump){ + unsigned read_ind = block_offset + block_tid; //fix + // unsigned padded_read_ind = block_size*block_id + block_tid; //fix + // unsigned write_ind = jump*block_id + block_tid; + unsigned write_ind = block_offset/2 + block_tid; + // if (padding) printf(" %u %u %u %u\n",tid,tid_p,read_ind,((write_ind/write_stride)*2 + write_phase)*write_stride + write_ind%write_stride); + v_r[write_stride? ((write_ind/write_stride)*2 + write_phase)*write_stride + write_ind%write_stride : write_ind] = v[read_ind] + v[read_ind + jump]; + // } +} + +template +__global__ void pad_buckets_kernel(P *v, P *v_r, unsigned block_size) { + // Allocate shared memory + // __shared__ P partial_sum[SHMEM_SIZE]; //use memory allocation like coop groups + // int partial_sum[]; + + // Calculate thread ID + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int dont_write = (tid/block_size)%2; + + v_r[tid] = dont_write? P::zero() : v[(tid/(block_size*2))*block_size + tid%block_size]; +} + + +template //todo-add SM and device function +__global__ void reduce_triangles_kernel(P *source_buckets,P* temp_buckets, P *target_buckets, const unsigned source_c, const unsigned source_nof_bms) { + // Allocate shared memory + // __shared__ int partial_sum[SHMEM_SIZE]; + + // Calculate thread ID + int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned source_nof_buckets = source_nof_bms<>1;//4 + // const unsigned target_nof_buckets = target_nof_bms<>1;//2^7 + // unsigned nof_threads_per_bm = target_nof_bm_buckets>>1; + // if (tid >= source_nof_buckets>>1) return; //total threads + unsigned bm_index = tid/nof_threads_per_bm; //blockidx + unsigned bm_bucket_index = tid%nof_threads_per_bm; //threadidx + unsigned bucket_index = bm_index*source_nof_bm_buckets + bm_bucket_index; + + // if (tid ==0) printf("source_nof_buckets %u\n",source_nof_buckets); + // if (tid ==0) printf("source_nof_bm_buckets %u\n",source_nof_bm_buckets); + // if (tid ==0) printf("temp_nof_bm_buckets %u\n",temp_nof_bm_buckets); + // if (tid ==0) printf("target_nof_bms %u\n",target_nof_bms); + // if (tid ==0) printf("target_c %u\n",target_c); + // if (tid ==0) printf("target_nof_bm_buckets %u\n",target_nof_bm_buckets); + // if (tid ==0) printf("nof_threads_per_bm %u\n",nof_threads_per_bm); + // Load elements AND do first add of reduction + // Vector now 2x as long as number of threads, so scale i + // int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; + // __syncthreads(); + // if (tid ==0){ + // // printf("t\n"); + // // for (int i=0;i>1;j++) + // // {printf("%u ",temp_buckets[i*(source_nof_bm_buckets>>1)+j].x.x);} + // // printf("\n"); + // // } + // for (int i=0;i 32; s >>= 1) { + for (int s = nof_threads_per_bm/2; s > target_nof_bm_buckets/2; s >>= 1) { + // Each thread does work unless it is further than the stride + // temp_nof_bm_buckets = temp_nof_bm_buckets>>1; + // nof_threads_per_bm = temp_nof_bm_buckets>>1; + // bm_index = tid/nof_threads_per_bm; + // bm_bucket_index = tid%nof_threads_per_bm; + // bucket_index = bm_index*source_nof_bm_buckets + bm_bucket_index; + // if (tid ==9) printf("inds %u %u %u\n",bm_index,bm_bucket_index,bucket_index); + // if (tid < source_nof_bms*s) { + if (threadIdx.x < s) { + temp_buckets[bucket_index] = temp_buckets[bucket_index] + temp_buckets[bucket_index + s]; + } + __syncthreads(); + // if (tid ==0){ + // for (int i=0;i +__global__ void reduce_rectangles_kernel(P *source_buckets,P* 
temp_buckets, P *target_buckets, const unsigned source_c, const unsigned source_nof_bms) { + // Allocate shared memory + // __shared__ int partial_sum[SHMEM_SIZE]; + + // Calculate thread ID + int tid = blockIdx.x * blockDim.x + threadIdx.x; + const unsigned source_nof_buckets = source_nof_bms<>1; + // const unsigned target_nof_buckets = target_nof_bms<>target_c; + unsigned temp_nof_segment_buckets = source_nof_segment_buckets; + unsigned target_nof_bm_buckets = 1<>1;//2^7 + unsigned nof_threads_per_segment = source_nof_segment_buckets>>1; //difference between kernels + // if (tid >= source_nof_buckets>>1) return; //total threads + unsigned bm_index = tid/nof_threads_per_bm; //blockidx + unsigned bm_bucket_index = tid%nof_threads_per_bm; //threadidx + unsigned segment_index = bm_bucket_index/nof_threads_per_segment; + unsigned segment_bucket_index = bm_bucket_index%nof_threads_per_segment; + unsigned bucket_index = bm_index*source_nof_bm_buckets + segment_index*source_nof_segment_buckets + segment_bucket_index; + + // Load elements AND do first add of reduction + // Vector now 2x as long as number of threads, so scale i + // int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x; + + // Store first partial result instead of just the elements + // if (tid ==0){ + // printf("rtar\n"); + // for (int i=0;i 32; s >>= 1) { + for (int s = nof_threads_per_segment/2; s > 0; s >>= 1) { + // Each thread does work unless it is further than the stride + // temp_nof_segment_buckets = temp_nof_segment_buckets>>1; + // nof_threads_per_segment = temp_nof_segment_buckets>>1; + // segment_index = tid/nof_threads_per_segment; + // segment_bucket_index = tid%nof_threads_per_segment; + // bucket_index = segment_index*source_nof_segment_buckets + segment_bucket_index; + // if (tid < source_nof_bms*target_nof_bm_buckets*s) { //nof segments per bm + if (segment_bucket_index < s) { //nof segments per bm + temp_buckets[bucket_index] = temp_buckets[bucket_index] + temp_buckets[bucket_index + s]; + } + __syncthreads(); + // if (tid ==0){ + // for (int i=0;i>= 1) + result++; + return result; +} + +unsigned log2_ceiling(const unsigned value) { return value <= 1 ? 
0 : log2_floor(value - 1) + 1; } + +unsigned get_optimal_log_data_split(const unsigned mpc, const unsigned source_window_bits, const unsigned target_window_bits, + const unsigned target_windows_count) { +#define MAX_THREADS 32 +#define MIN_BLOCKS 12 +const unsigned full_occupancy = mpc * MAX_THREADS * MIN_BLOCKS; +const unsigned target = full_occupancy << 6; +const unsigned unit_threads_count = target_windows_count << target_window_bits; +const unsigned split_target = log2_ceiling(target / unit_threads_count); +const unsigned split_limit = source_window_bits - target_window_bits - 1; +return std::min(split_target, split_limit); +} + +template +static constexpr __device__ __forceinline__ T ld_single(const T *ptr) { +return __ldg(ptr); +}; + +template +static constexpr __device__ __forceinline__ T ld(const T *address, const unsigned offset) { + static_assert(alignof(T) % alignof(U) == 0); + static_assert(sizeof(T) % sizeof(U) == 0); + constexpr size_t count = sizeof(T) / sizeof(U); + T result = {}; + auto pa = reinterpret_cast(address) + offset; + auto pr = reinterpret_cast(&result); +#pragma unroll + for (unsigned i = 0; i < count; i++) { + const auto pai = pa + i * STRIDE; + const auto pri = pr + i; + *pri = ld_single(pai); + } + return result; +} + +template > +static constexpr __device__ __forceinline__ T memory_load(const T *address, const unsigned offset = 0, [[maybe_unused]] uint4 _dummy = {}) { + return ld(address, offset); +}; + +template > +static constexpr __device__ __forceinline__ T memory_load(const T *address, const unsigned offset = 0, [[maybe_unused]] uint2 _dummy = {}) { + return ld(address, offset); +}; + +template > +static constexpr __device__ __forceinline__ T memory_load(const T *address, const unsigned offset = 0, [[maybe_unused]] unsigned _dummy = {}) { + return ld(address, offset); +}; //this kernel performs single scalar multiplication //each thread multilies a single scalar and point @@ -62,42 +439,157 @@ __global__ void initialize_buckets_kernel(P *buckets, unsigned N) { //this kernel splits the scalars into digits of size c //each thread splits a single scalar into nof_bms digits template -__global__ void split_scalars_kernel(unsigned *buckets_indices, unsigned *point_indices, S *scalars, unsigned total_size, unsigned msm_log_size, unsigned nof_bms, unsigned bm_bitsize, unsigned c){ +__global__ void split_scalars_kernel(unsigned *buckets_indices, unsigned *point_indices, S *scalars, unsigned total_size, unsigned msm_log_size, unsigned nof_bms, unsigned bm_bitsize, unsigned c, unsigned top_bm_nof_missing_bits){ + constexpr unsigned sign_mask = 0x80000000; + // constexpr unsigned trash_bucket = 0x80000000; unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; unsigned bucket_index; + unsigned bucket_index2; unsigned current_index; unsigned msm_index = tid >> msm_log_size; + unsigned borrow = 0; if (tid < total_size){ S scalar = scalars[tid]; + // A point = points[tid]; + // if (scalar == S::zero() || scalar == S::one()) return; + // if (scalar == S::zero()) return; + // if (tid == 0) printf("scalar %u", scalar); for (unsigned bm = 0; bm < nof_bms; bm++) { + // bucket_index = scalar.get_scalar_digit(bm, c) + (bm==nof_bms-1? 
((tid&top_bm_nof_missing_bits)<<(c-top_bm_nof_missing_bits)) : 0); bucket_index = scalar.get_scalar_digit(bm, c); + #ifdef SIGNED_DIG + bucket_index += borrow; + borrow = 0; + unsigned sign = 0; + // if (tid == 0) printf("index %u", bucket_index); + if (bucket_index > (1<<(c-1))) { + bucket_index = (1 << c) - bucket_index; + borrow = 1; + sign = sign_mask; + } + #endif + // if (tid == 0) printf("new index %u", bucket_index); + // if (bm==nof_bms-1) { + // bucket_index2 = bucket_index + ((tid&((1< +__global__ void add_ones_kernel(A *points, S* scalars, P* results, const unsigned msm_size, const unsigned run_length){ + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; + const unsigned nof_threads = (msm_size + run_length - 1)/run_length; + if (tid>=nof_threads) { + results[tid] = P::zero(); + return; + } + const unsigned start_index = tid*run_length; + P sum = P::zero(); + for (int i=start_index;i // __global__ void accumulate_buckets_kernel(P *__restrict__ buckets, unsigned *__restrict__ bucket_offsets, - // unsigned *__restrict__ bucket_sizes, unsigned *__restrict__ single_bucket_indices, unsigned *__restrict__ point_indices, A *__restrict__ points, unsigned nof_buckets, unsigned batch_size, unsigned msm_idx_shift){ -__global__ void accumulate_buckets_kernel(P *buckets, unsigned *bucket_offsets, unsigned *bucket_sizes, unsigned *single_bucket_indices, unsigned *point_indices, A *points, unsigned nof_buckets, unsigned *nof_buckets_to_compute, unsigned msm_idx_shift){ + // unsigned *__restrict__ bucket_sizes, unsigned *__restrict__ single_bucket_indices, unsigned *__restrict__ point_indices, A *__restrict__ points, unsigned nof_buckets, unsigned batch_size, unsigned msm_idx_shift){ +template +__global__ void accumulate_buckets_kernel(P *__restrict__ buckets, const unsigned *__restrict__ bucket_offsets, const unsigned *__restrict__ bucket_sizes, const unsigned *__restrict__ single_bucket_indices, const unsigned *__restrict__ point_indices, A *__restrict__ points, const unsigned nof_buckets, const unsigned *nof_buckets_to_compute, const unsigned msm_idx_shift, const unsigned c){ + constexpr unsigned sign_mask = 0x80000000; + // constexpr unsigned trash_bucket = 0x80000000; unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid >= *nof_buckets_to_compute){ + // if (tid>=*nof_buckets_to_compute || tid<11){ + if (tid>=*nof_buckets_to_compute){ return; } + if ((single_bucket_indices[tid]&((1<>msm_idx_shift; + const unsigned bm_index = (single_bucket_indices[tid]&((1<>c; + const unsigned bucket_index = msm_index * nof_buckets + bm_index * ((1<<(c-1))+1) + (single_bucket_indices[tid]&((1<>msm_idx_shift; unsigned bucket_index = msm_index * nof_buckets + (single_bucket_indices[tid]&((1< 10) {printf(">10: %u %u %u\n",tid,single_bucket_indices[tid],single_bucket_indices[tid]&((1<=*nof_buckets_to_compute-10) printf("tid %u size %u\n", tid, bucket_sizes[tid]); + // if (tid==0) return; + // if ((bucket_index>>20)==13) return; + // if (bucket_sizes[tid]==16777216) printf("tid %u size %u bucket %u offset %u\n", tid, bucket_sizes[tid], bucket_index, bucket_offset); + // const unsigned *indexes = point_indices + bucket_offset; + // P bucket = P::zero(); //todo: get rid of init buckets? no.. because what about buckets with no points + P bucket; //todo: get rid of init buckets? no.. 
because what about buckets with no points + // unsigned point_ind; for (unsigned i = 0; i < bucket_sizes[tid]; i++) //add the relevant points starting from the relevant offset up to the bucket size { - buckets[bucket_index] = buckets[bucket_index] + points[point_indices[bucket_offset+i]]; + // unsigned point_ind = *indexes++; + // auto point = memory_load(points + point_ind); + // point_ind = point_indices[bucket_offset+i]; + // bucket = bucket + P::one(); + unsigned point_ind = point_indices[bucket_offset+i]; + #ifdef SIGNED_DIG + unsigned sign = point_ind & sign_mask; + point_ind &= ~sign_mask; + // printf("tid %u sign %u point ind %u \n", tid,sign, point_ind); + A point = points[point_ind]; + if (sign) point = A::neg(point); + #else + A point = points[point_ind]; + #endif + bucket = i? bucket + point : P::from_affine(point); + // const unsigned* pa = reinterpret_cast(points[point_ind]); + // P point; + // Dummy_Scalar scal; + // scal.x = __ldg(pa); + // point.x = scal; + // bucket = bucket + point; + } + // buckets[tid] = bucket; + buckets[bucket_index] = bucket; +} + +template +__global__ void accumulate_buckets_kernel2(P *buckets, A *points, S *scalars, const unsigned c,const unsigned nof_bms, const unsigned size){ + + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid>=size) return; + + S scalar = scalars[tid]; + A point = points[tid]; + unsigned bucket_index; + + for (unsigned bm = 0; bm < nof_bms; bm++) + { + // bucket_index = scalar.get_scalar_digit(bm, c) + (bm==nof_bms-1? ((tid&top_bm_nof_missing_bits)<<(c-top_bm_nof_missing_bits)) : 0); + bucket_index = scalar.get_scalar_digit(bm, c); + buckets[bucket_index] = buckets[bucket_index] + point; } + } //this kernel sums the entire bucket module @@ -106,16 +598,103 @@ template __global__ void big_triangle_sum_kernel(P* buckets, P* final_sums, unsigned nof_bms, unsigned c){ unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid >= nof_bms) return; - P line_sum = buckets[(tid+1)*(1<=nof_bms) return; + #ifdef SIGNED_DIG + unsigned buckets_in_bm = (1<0; i--) + for (unsigned i = buckets_in_bm-2; i >0; i--) { - line_sum = line_sum + buckets[tid*(1< +__global__ void split_windows_kernel_inner(const unsigned source_window_bits_count, const unsigned source_windows_count, + const P *__restrict__ source_buckets, P *__restrict__ target_buckets, const unsigned count) { +const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; +if (gid >= count) //0,1,2,2^8,2^8+1,2^8+2,32*2^8,32*2^8+1,32*2^8+2^8,32*2^8+2^8+1 +return; +const unsigned target_window_bits_count = (source_window_bits_count + 1) >> 1; //8 +const unsigned target_windows_count = source_windows_count << 1; //32 +const unsigned target_partition_buckets_count = target_windows_count << target_window_bits_count; // 32*2^8 +const unsigned target_partitions_count = count / target_partition_buckets_count; //2^7 +const unsigned target_partition_index = gid / target_partition_buckets_count; //*0,0,0,0,0,0,1,1,1,1 +const unsigned target_partition_tid = gid % target_partition_buckets_count; //*0,1,2,2^8,2^8+1,2^8+2,0,1,2^8,2^8+1 +const unsigned target_window_buckets_count = 1 << target_window_bits_count; // 2^8 +const unsigned target_window_index = target_partition_tid / target_window_buckets_count; //* 0,0.0,1,1,1,0,0,1,1 +const unsigned target_window_tid = target_partition_tid % target_window_buckets_count; //* 0,1,2,0,1,2,0,1,0,1,2 +const unsigned split_index = target_window_index & 1; //*0,0,0,1,1,1,0,0,1,1,1 +const unsigned source_window_buckets_per_target = 
source_window_bits_count & 1 // is c odd? +? split_index ? (target_window_tid >> (target_window_bits_count - 1) ? 0 : target_window_buckets_count) //is the target odd? + : 1 << (source_window_bits_count - target_window_bits_count) +: target_window_buckets_count; //2^8 +const unsigned source_window_index = target_window_index >> 1; //*0,0,0,0,0,0,0,0,0,0,0 +const unsigned source_offset = source_window_index << source_window_bits_count; //*0,0,0,0,0,0,0,0,0,0, +const unsigned target_shift = target_window_bits_count * split_index; //*0,0,0,8,8,8,0,0,8,8,8 +const unsigned target_offset = target_window_tid << target_shift;//*0,1,2,0,2^8,2^9,0,1,0,2^8,2*2^8 +const unsigned global_offset = source_offset + target_offset;//*0,1,2,0,2^8,2^9,0,1 +const unsigned index_mask = (1 << target_shift) - 1; //*0,0,0,2^8-1,2^8-1,2^8-1,0,0,2^8-1,2^8-1 +P target_bucket = P::zero(); +#pragma unroll 1 +for (unsigned i = target_partition_index; i < source_window_buckets_per_target; i += target_partitions_count) { //from the partition start(*0,0,0,0,0,0,1,1,1,1), stride 2^7, until 2^8 = loop twice +const unsigned index_offset = i & index_mask | (i & ~index_mask) << target_window_bits_count; //*0 2^15,0 2^15,0 2^15,0 2^15,0 2^15,0 2^15,2^8 2^8+2^15,2^8 2^8+2^15,2^8 2^8+2^15,2^8 2^8+2^15 +const unsigned load_offset = global_offset + index_offset;//*0 2^15,1 2^15+1,2 2^15+2, 0 2^15, 2^8 2^8+2^15, 2^8 2^8+2^15, 2^8+1 2^8+2^15+1 +const auto source_bucket = source_buckets[load_offset]; +target_bucket = i == target_partition_index ? source_bucket : target_bucket + source_bucket; //*0+2^15,1+2^15+1,2+2^15+2,...2^8-1+2^15+2^8-1| 0+2^7, 2^8+2^8+2^7...||2^8+2^8+2^15, 2^8+1+2^8+2^15+1...2^9-1+2^9-1+2^15|1+2^7+1, 2^8+1+2^8+2^7+1... +} +target_buckets[gid] = target_bucket; //0,1,2^8,2^8+1,32*2^8,32*2^8+1 +} + +template +__global__ void reduce_buckets_kernel(P *buckets, const unsigned count) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid >= count) + return; + // buckets += gid; + const auto a = buckets[gid]; + const auto b = buckets[gid+count]; + const P result = a+b; + buckets[gid] = result; +} + +template +__global__ void reduce_buckets_kernel2(P *source, P *target, const unsigned count) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + if (gid >= count) + return; + const auto a = source[gid]; + const auto b = source[gid+count]; + const P result = a+b; + target[gid] = result; +} + +template +__global__ void last_pass_gather_kernel(const unsigned bits_count_pass_one, const P *__restrict__ source, P *__restrict__ target, + const unsigned count) { +const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; +if (gid >= count) +return; +unsigned window_index = gid / bits_count_pass_one; +unsigned window_tid = gid % bits_count_pass_one; +for (unsigned bits_count = bits_count_pass_one; bits_count > 1;) { +bits_count = (bits_count + 1) >> 1; +window_index <<= 1; +if (window_tid >= bits_count) { +window_index++; +window_tid -= bits_count; +} +} +const unsigned sid = (window_index << 1) + 1; +const auto pz = source[sid]; +// const point_jacobian pj = point_xyzz::to_jacobian(pz, f); +target[gid] = pz; +} + //this kernel uses single scalar multiplication to multiply each bucket by its index //each thread deals with a single bucket template @@ -130,10 +709,17 @@ __global__ void ssm_buckets_kernel(P* buckets, unsigned* single_bucket_indices, } +template +__global__ void last_pass_kernel(P*final_buckets, P*final_sums, unsigned num_sums){ + unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if 
(tid>num_sums) return; + final_sums[tid] = final_buckets[2*tid+1]; +} + //this kernel computes the final result using the double and add algorithm //it is done by a single thread template -__global__ void final_accumulation_kernel(P* final_sums, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned c){ +__global__ void final_accumulation_kernel(P* final_sums, P* ones_result, P* final_results, unsigned nof_msms, unsigned nof_bms, unsigned c){ unsigned tid = (blockIdx.x * blockDim.x) + threadIdx.x; if (tid>nof_msms) return; @@ -146,14 +732,168 @@ __global__ void final_accumulation_kernel(P* final_sums, P* final_results, unsig final_result = final_result + final_result; } } - final_results[tid] = final_result + final_sums[tid*nof_bms]; + final_results[tid] = final_result + final_sums[tid*nof_bms] + ones_result[0]; + // final_results[tid] = final_result + final_sums[tid*nof_bms]; + +} + +template +void test_reduce_triangle(P* h_buckets){ + for (int i=0; i>>(buckets,temp,target,4,4); + // reduce_triangles_kernel<<<5,8>>>(buckets,temp,target,4,4); + general_sum_reduction_kernel<<<5,8>>>(buckets,target,4,4,0); + cudaDeviceSynchronize(); + printf("cuda error %u\n",cudaGetLastError()); + + std::vector
<P>
h_target; + h_target.reserve(TEMP_NUM); + cudaMemcpy(h_target.data(), target, sizeof(P) * TEMP_NUM, cudaMemcpyDeviceToHost); + std::cout< h_buckets; + // h_buckets.reserve(nof_buckets); + // cudaMemcpy(h_buckets.data(), buckets, sizeof(P) * nof_buckets, cudaMemcpyDeviceToHost); + // std::cout<<"buckets accumulated"< +void test_reduce_var(P* h_buckets){ + for (int i=0; i>>(buckets,temp,target,4,4); + // single_stage_multi_reduction_kernel<<<1,64>>>(buckets,target,16,8,0); + unsigned h_sizes[10] = {4,4,4,4}; + unsigned h_offsets[10] = {2,2,6,6}; + unsigned *sizes; + unsigned *offsets; + cudaMalloc(&sizes, sizeof(unsigned) * count); + cudaMalloc(&offsets, sizeof(unsigned) * count); + cudaMemcpy(sizes, h_sizes, sizeof(unsigned) * count, cudaMemcpyHostToDevice); + cudaMemcpy(offsets, h_offsets, sizeof(unsigned) * count, cudaMemcpyHostToDevice); + variable_block_multi_reduction_kernel<<<1,4>>>(buckets,target,sizes,offsets,0,0); + + cudaDeviceSynchronize(); + printf("cuda error %u\n",cudaGetLastError()); + std::vector
<P>
h_target; + h_target.reserve(TEMP_NUM); + cudaMemcpy(h_target.data(), target, sizeof(P) * TEMP_NUM, cudaMemcpyDeviceToHost); + std::cout< +void test_reduce_single(P* h_buckets){ + for (int i=0; i>>(buckets,temp,target,4,4); + // single_stage_multi_reduction_kernel<<<1,64>>>(buckets,target,16,8,0); + single_stage_multi_reduction_kernel<<<2,32>>>(buckets,target,2,0,0); + + cudaDeviceSynchronize(); + printf("cuda error %u\n",cudaGetLastError()); + std::vector
<P>
h_target; + h_target.reserve(TEMP_NUM); + cudaMemcpy(h_target.data(), target, sizeof(P) * TEMP_NUM, cudaMemcpyDeviceToHost); + std::cout< +void test_reduce_rectangle(P* h_buckets){ + for (int i=0; i>>(buckets,temp,target,4,4); + general_sum_reduction_kernel<<<20,2>>>(buckets,target,1,4,1); + + cudaDeviceSynchronize(); + printf("cuda error %u\n",cudaGetLastError()); + std::vector
<P>
h_target; + h_target.reserve(TEMP_NUM); + cudaMemcpy(h_target.data(), target, sizeof(P) * TEMP_NUM, cudaMemcpyDeviceToHost); + std::cout< -void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream) { +void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, bool big_triangle, cudaStream_t stream) { + // std::cout<<"points"<>>(buckets, nof_buckets); + // cudaDeviceSynchronize(); +// printf("cuda error %u\n",cudaGetLastError()); + + //accumulate ones + P *ones_results; //fix whole division, in last run in kernel too + const unsigned nof_runs = max(1<<(msm_log_size-6), 16); + const unsigned run_length = (size + nof_runs -1)/nof_runs; + cudaMallocAsync(&ones_results, sizeof(P) * nof_runs, stream); + NUM_THREADS = min(1 << 8,nof_runs); + NUM_BLOCKS = (nof_runs + NUM_THREADS - 1) / NUM_THREADS; + add_ones_kernel<<>>(d_points, d_scalars, ones_results, size, run_length); + // cudaDeviceSynchronize(); +// printf("cuda error ones %u\n",cudaGetLastError()); + + // cudaDeviceSynchronize(); + // std::vector
<P>
h_ones_results; + // h_ones_results.reserve(nof_runs); + // cudaMemcpy(h_ones_results.data(), ones_results, sizeof(P) * nof_runs, cudaMemcpyDeviceToHost); + // std::cout<<"one results"<>1;s>0;s>>=1){ + NUM_THREADS = min(MAX_TH,s); + NUM_BLOCKS = (s + NUM_THREADS - 1) / NUM_THREADS; + single_stage_multi_reduction_kernel<<>>(ones_results,ones_results,s*2,0,0,0); + // cudaDeviceSynchronize(); + // printf("cuda error ones %u\n",cudaGetLastError()); + // cudaDeviceSynchronize(); + // cudaMemcpy(h_ones_results.data(), ones_results, sizeof(P) * nof_runs, cudaMemcpyDeviceToHost); + // std::cout<<"one results"<>>(bucket_indices + size, point_indices + size, d_scalars, size, msm_log_size, - nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later - + nof_bms, bm_bitsize, c, top_bm_nof_missing_bits); //+size - leaving the first bm free for the out of place sort later + // cudaDeviceSynchronize(); + // printf("cuda error %u\n",cudaGetLastError()); + + + // cudaDeviceSynchronize(); + // std::vector h_bucket_ind; + // std::vector h_point_ind; + // h_bucket_ind.reserve(size * (nof_bms+1)); + // h_point_ind.reserve(size * (nof_bms+1)); + // cudaMemcpy(h_bucket_ind.data(), bucket_indices, sizeof(unsigned) * size * (nof_bms+1), cudaMemcpyDeviceToHost); + // cudaMemcpy(h_point_ind.data(), point_indices, sizeof(unsigned) * size * (nof_bms+1), cudaMemcpyDeviceToHost); + // std::cout<>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, point_indices, - d_points, nof_buckets, nof_buckets_to_compute, c+bm_bitsize); + d_points, nof_buckets, nof_buckets_to_compute, c+bm_bitsize, c); +// accumulate_buckets_kernel<<>>(buckets, sorted_bucket_offsets, sorted_bucket_sizes, sorted_single_bucket_indices, point_indices, +// d_points, nof_buckets, nof_buckets_to_compute, c+bm_bitsize, c); + // accumulate_buckets_kernel<<>>(buckets, sorted_bucket_offsets, sorted_bucket_sizes, sorted_single_bucket_indices, point_indices, + // d_points, nof_buckets, nof_buckets_to_compute, c-1+bm_bitsize); + // cudaDeviceSynchronize(); + // printf("cuda error acc %u\n",cudaGetLastError()); +#else +NUM_THREADS = 1 << 8; +// NUM_THREADS = 1 << 5; +NUM_BLOCKS = (size + NUM_THREADS - 1) / NUM_THREADS; +accumulate_buckets_kernel2<<>>(buckets, points, scalars, c, nof_bms, size); +// cudaDeviceSynchronize(); +// printf("cuda error 111%u\n",cudaGetLastError()); +#endif +//reduce top bm +// NUM_THREADS = min(MAX_TH,(source_buckets_count>>(1+j))); +// printf("NUM_THREADS 1 %u \n" ,NUM_THREADS); +// NUM_BLOCKS = ((source_buckets_count>>(1+j)) + NUM_THREADS - 1) / NUM_THREADS; +// printf("NUM_BLOCKS 1 %u \n" ,NUM_BLOCKS); +// single_stage_multi_reduction_kernel<<>>(j==0?source_buckets:temp_buckets1,j==target_bits_count-1? target_buckets: temp_buckets1,1<<(source_bits_count-j),j==target_bits_count-1? 
1<>>(buckets, final_results); #endif - #ifdef BIG_TRIANGLE - P* final_results; + P* final_results; + if (big_triangle){ cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream); //launch the bucket module sum kernel - a thread for each bucket module NUM_THREADS = nof_bms; NUM_BLOCKS = 1; - big_triangle_sum_kernel<<>>(buckets, final_results, nof_bms, c); - #endif + #ifdef SIGNED_DIG + big_triangle_sum_kernel<<>>(buckets, final_results, nof_bms, c-1); //sighed digits + #else + big_triangle_sum_kernel<<>>(buckets, final_results, nof_bms, c); + #endif + // cudaDeviceSynchronize(); + // printf("cuda error %u\n",cudaGetLastError()); + } + #ifdef ZPRIZE + else{ + + unsigned source_bits_count = c; + unsigned source_windows_count = nof_bms; + P *source_buckets = buckets; + buckets = nullptr; + P *target_buckets; + for (unsigned i = 0;; i++) { + const unsigned target_bits_count = (source_bits_count + 1) >> 1; //c/2=8 + const unsigned target_windows_count = source_windows_count << 1; //nof bms*2 = 32 + const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8 + const unsigned log_data_split = + get_optimal_log_data_split(84, source_bits_count, target_bits_count, target_windows_count); //todo - get num of multiprossecors + const unsigned total_buckets_count = target_buckets_count << log_data_split; //32*2^8*2^7 + cudaMallocAsync(&target_buckets, sizeof(P) * total_buckets_count, stream); //32*2^8*2^7 buckets + NUM_THREADS = 32; + NUM_BLOCKS = (total_buckets_count + NUM_THREADS - 1) / NUM_THREADS; + // const unsigned block_dim = total_buckets_count < 32 ? total_buckets_count : 32; + // const unsigned grid_dim = (total_buckets_count - 1) / block_dim.x + 1; + split_windows_kernel_inner<<>>(source_bits_count, source_windows_count, source_buckets, target_buckets, total_buckets_count); + // cudaDeviceSynchronize(); + // printf("cuda error %u\n",cudaGetLastError()); + cudaFreeAsync(source_buckets, stream); + + for (unsigned j = 0; j < log_data_split; j++){ + const unsigned count = total_buckets_count >> (j + 1); + // const unsigned block_dim = count < 32 ? count : 32; + // const unsigned grid_dim = (count - 1) / block_dim.x + 1; + NUM_THREADS = 32; + NUM_BLOCKS = (count + NUM_THREADS - 1) / NUM_THREADS; + reduce_buckets_kernel<<>>(target_buckets, count); + // cudaDeviceSynchronize(); + // printf("cuda error %u\n",cudaGetLastError()); + } + if (target_bits_count == 1) { + // P results; + // // const unsigned result_windows_count = min(fd_q::MBC, windows_count_pass_one * bits_count_pass_one); + const unsigned result_windows_count = bitsize; + // if (copy_results) + // HANDLE_CUDA_ERROR(allocate(results, result_windows_count, pool, stream)); + // HANDLE_CUDA_ERROR(last_pass_gather(bits_count_pass_one, target_buckets, copy_results ? 
results : ec.results, result_windows_count, stream)); + // if (copy_results) { + // HANDLE_CUDA_ERROR(cudaMemcpyAsync(ec.results, results, sizeof(point_jacobian) * result_windows_count, cudaMemcpyDeviceToHost, stream)); + // if (ec.d2h_copy_finished) + // HANDLE_CUDA_ERROR(cudaEventRecord(ec.d2h_copy_finished, stream)); + // if (ec.d2h_copy_finished_callback) + // HANDLE_CUDA_ERROR(cudaLaunchHostFunc(stream, ec.d2h_copy_finished_callback, ec.d2h_copy_finished_callback_data)); + // } + // if (copy_results) + // HANDLE_CUDA_ERROR(free(results, stream)); + // HANDLE_CUDA_ERROR(free(target_buckets, stream)); + nof_bms = bitsize; + cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream); + NUM_THREADS = 32; + NUM_BLOCKS = (result_windows_count + NUM_THREADS - 1) / NUM_THREADS; + // const dim3 block_dim = result_windows_count < 32 ? count : 32; + // const dim3 grid_dim = (result_windows_count - 1) / block_dim.x + 1; + last_pass_gather_kernel<<>>(c, target_buckets, final_results, result_windows_count); + // cudaDeviceSynchronize(); + // printf("cuda error %u\n",cudaGetLastError()); + c = 1; + break; + } + source_buckets = target_buckets; + target_buckets = nullptr; + source_bits_count = target_bits_count; + source_windows_count = target_windows_count; + } +} +#else +else{ + // cudaDeviceSynchronize(); +// printf("cuda erddsdfsdfsror %u\n",cudaGetLastError()); +// cudaDeviceSynchronize(); +// std::vector
<P>
h_buckets; +// h_buckets.reserve(nof_buckets); +// cudaMemcpy(h_buckets.data(), buckets, sizeof(P) * nof_buckets, cudaMemcpyDeviceToHost); +// std::cout<<"buckets accumulated"<>>(source_buckets,source_buckets_padded,1< s_buckets; + // s_buckets.reserve(source_buckets_count); + // cudaMemcpy(s_buckets.data(), source_buckets_padded, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"source_buckets_padded"<> 1; //c/2=8 + // printf("target_bits_count %u \n" ,target_bits_count); + const unsigned target_windows_count = source_windows_count << 1; //nof bms*2 = 32 + // const unsigned target_window_buckets_count = 1 << target_bits_count; // 2^8 + const unsigned target_buckets_count = target_windows_count << target_bits_count; // bms*2^c = 32*2^8 + // const unsigned log_data_split = + // get_optimal_log_data_split(84, source_bits_count, target_bits_count, target_windows_count); //todo - get num of multiprossecors + // const unsigned total_buckets_count = target_buckets_count << log_data_split; //32*2^8*2^7 + cudaMallocAsync(&target_buckets, sizeof(P) * target_buckets_count,stream); //32*2^8*2^7 buckets + cudaMallocAsync(&temp_buckets1, sizeof(P) * source_buckets_count/2,stream); //32*2^8*2^7 buckets + cudaMallocAsync(&temp_buckets2, sizeof(P) * source_buckets_count/2,stream); //32*2^8*2^7 buckets + // const unsigned block_dim = total_buckets_count < 32 ? total_buckets_count : 32; + // const unsigned grid_dim = (total_buckets_count - 1) / block_dim.x + 1; + //input output, streams + // reduce_buckets_kernel<<>>(source_buckets, target_buckets, source_windows_count>>1); + // for (unsigned j = 0; j < target_windows_count-1; j++) //another loop + // reduce_buckets_kernel<<>>(target_buckets, target_buckets, source_windows_count>>(j+2)); + + // cudaStream_t stream2; + // cudaStreamCreate(&streams[0]); + // cudaStreamCreate(&streams[1]); + // cudaStreamCreate(&stream2); + + // if (source_bits_count>8){ + if (source_bits_count>0){ + for(unsigned j=0;j t1_buckets; + // std::vector
<P>
t2_buckets; + // t1_buckets.reserve(source_buckets_count); + // t2_buckets.reserve(source_buckets_count); + // cudaMemcpy(t1_buckets.data(), temp_buckets1, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // cudaMemcpy(t2_buckets.data(), temp_buckets2, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"0 buckets temp1"<>(1+j))); + // printf("NUM_THREADS 1 %u \n" ,NUM_THREADS); + NUM_BLOCKS = ((source_buckets_count>>(1+j)) + NUM_THREADS - 1) / NUM_THREADS; + // printf("NUM_BLOCKS 1 %u \n" ,NUM_BLOCKS); + single_stage_multi_reduction_kernel<<>>(j==0?source_buckets:temp_buckets1,j==target_bits_count-1? target_buckets: temp_buckets1,1<<(source_bits_count-j),j==target_bits_count-1? 1< t1_buckets; + // std::vector
<P>
t2_buckets; + // t1_buckets.reserve(source_buckets_count/2); + // t2_buckets.reserve(source_buckets_count/2); + // cudaMemcpy(t1_buckets.data(), temp_buckets1, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // cudaMemcpy(t2_buckets.data(), temp_buckets2, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"1 buckets temp1"<>(1+j))*((odd_source_c&&j==target_bits_count-1)? 2 :1); + unsigned nof_threads = (source_buckets_count>>(1+j)); + NUM_THREADS = min(MAX_TH,nof_threads); + // printf("NUM_THREADS 2 %u \n" ,NUM_THREADS); + NUM_BLOCKS = (nof_threads + NUM_THREADS - 1) / NUM_THREADS; + // printf("NUM_BLOCKS 2 %u \n" ,NUM_BLOCKS); + single_stage_multi_reduction_kernel<<>>(j==0?source_buckets:temp_buckets2,j==target_bits_count-1? target_buckets: temp_buckets2,1<<(target_bits_count-j),j==target_bits_count-1? 1< t1_buckets; + // std::vector
<P>
t2_buckets; + // t1_buckets.reserve(source_buckets_count/2); + // t2_buckets.reserve(source_buckets_count/2); + // cudaMemcpy(t1_buckets.data(), temp_buckets1, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // cudaMemcpy(t2_buckets.data(), temp_buckets2, sizeof(P) * source_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"2 buckets temp1"< t1_buckets; + // std::vector
<P>
t2_buckets; + // t1_buckets.reserve(source_buckets_count/2); + // t2_buckets.reserve(source_buckets_count/2); + // cudaMemcpy(t1_buckets.data(), target_buckets, sizeof(P) * target_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"2 buckets target"<>1) + NUM_THREADS - 1) / NUM_THREADS; + // NUM_THREADS = 1<<(source_bits_count-1); + // printf("NUM_THREADS %u \n" ,NUM_THREADS); + // NUM_BLOCKS = source_windows_count; + // printf("NUM_BLOCKS %u \n" ,NUM_BLOCKS); + // reduce_triangles_kernel<<>>(source_buckets,temp_buckets1,target_buckets,source_bits_count,source_windows_count); + // for(unsigned j=0;;j++){ + NUM_THREADS = 1<<(source_bits_count-1); + // printf("NUM_THREADS 1 %u \n" ,NUM_THREADS); + NUM_BLOCKS = source_windows_count; + // printf("NUM_BLOCKS 1 %u \n" ,NUM_BLOCKS); + general_sum_reduction_kernel<<>>(source_buckets,target_buckets,1< t_buckets; + // t_buckets.reserve(target_buckets_count); + // cudaMemcpy(t_buckets.data(), target_buckets, sizeof(P) * target_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"buckets target1"<>>(source_buckets,temp_buckets2,target_buckets,source_bits_count,source_windows_count); + NUM_THREADS = 1<<(target_bits_count-1); + // printf("NUM_THREADS 2 %u \n" ,NUM_THREADS); + NUM_BLOCKS = source_windows_count<>>(source_buckets,target_buckets,1,1< t_buckets; + // // t_buckets.reserve(target_buckets_count); + // cudaMemcpy(t_buckets.data(), target_buckets, sizeof(P) * target_buckets_count, cudaMemcpyDeviceToHost); + // std::cout<<"buckets target2"<>1) + NUM_THREADS - 1) / NUM_THREADS; //2^15 + // reduce_buckets_kernel2<<>>(source_buckets+source_offset, temp_buckets1+source_offset, source_window_buckets_count>>1); //same source different target + // for (unsigned k = 0; k < target_bits_count-2; k++){ //0..5 + // NUM_BLOCKS = ((source_window_buckets_count>>(k+2)) + NUM_THREADS - 1) / NUM_THREADS;//2^14..2^9 + // reduce_buckets_kernel2<<>>(temp_buckets1+source_offset, temp_buckets1+source_offset, source_window_buckets_count>>(k+2)); //stream j + // } + // NUM_BLOCKS = ((source_window_buckets_count>>target_bits_count) + NUM_THREADS - 1) / NUM_THREADS;//2^8 + // reduce_buckets_kernel2<<>>(temp_buckets1+source_offset, target_buckets+target_offset, source_window_buckets_count>>target_bits_count); //stream j + // } + + // for (unsigned j = 0; j < source_windows_count*target_window_buckets_count; j++){ //loop on every segment of every source bm // 0..16*2^8-1 + // unsigned source_offset = j*target_window_buckets_count; + // unsigned target_offset = j%target_window_buckets_count+(j/target_window_buckets_count)*target_window_buckets_count*2 + target_window_buckets_count; + // NUM_BLOCKS = ((target_window_buckets_count>>1) + NUM_THREADS - 1) / NUM_THREADS; //2^7 + // reduce_buckets_kernel2<<>>(source_buckets+source_offset, temp_buckets2+source_offset, target_window_buckets_count>>1); //same source different target + // for (unsigned k = 0; k < target_bits_count-2; k++){ //0..5 + // NUM_BLOCKS = ((target_window_buckets_count>>(k+2)) + NUM_THREADS - 1) / NUM_THREADS; //last blocks are single threaded.. //2^6..2^1 + // reduce_buckets_kernel2<<>>(temp_buckets2+source_offset, temp_buckets2+source_offset, target_window_buckets_count>>(k+2));// stream j + source_windows_count + // } + // NUM_BLOCKS = 1; //last blocks are single threaded.. 
// + // reduce_buckets_kernel2<<>>(temp_buckets2+source_offset, target_buckets+target_offset, 1);// stream j + source_windows_count + // } + + // for (int k = 0; k < NUM_STREAMS; ++k) + // { + // cudaStreamSynchronize(streams[k]); + // cudaStreamDestroy(streams[k]); + // } + + // cudaFreeAsync(source_buckets, stream); + if (target_bits_count == 1) { + // P results; + // // const unsigned result_windows_count = min(fd_q::MBC, windows_count_pass_one * bits_count_pass_one); + // const unsigned result_windows_count = bitsize; + // if (copy_results) + // HANDLE_CUDA_ERROR(allocate(results, result_windows_count, pool, stream)); + // HANDLE_CUDA_ERROR(last_pass_gather(bits_count_pass_one, target_buckets, copy_results ? results : ec.results, result_windows_count, stream)); + // if (copy_results) { + // HANDLE_CUDA_ERROR(cudaMemcpyAsync(ec.results, results, sizeof(point_jacobian) * result_windows_count, cudaMemcpyDeviceToHost, stream)); + // if (ec.d2h_copy_finished) + // HANDLE_CUDA_ERROR(cudaEventRecord(ec.d2h_copy_finished, stream)); + // if (ec.d2h_copy_finished_callback) + // HANDLE_CUDA_ERROR(cudaLaunchHostFunc(stream, ec.d2h_copy_finished_callback, ec.d2h_copy_finished_callback_data)); + // } + // if (copy_results) + // HANDLE_CUDA_ERROR(free(results, stream)); + // HANDLE_CUDA_ERROR(free(target_buckets, stream)); + nof_bms = bitsize; + cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream); + NUM_THREADS = 32; + NUM_BLOCKS = (nof_bms + NUM_THREADS - 1) / NUM_THREADS; + last_pass_kernel<<>>(target_buckets,final_results,nof_bms); + // for (int k=0;k h_final_results; + // h_final_results.reserve(nof_bms); + // cudaMemcpy(h_final_results.data(), final_results, sizeof(P) * nof_bms, cudaMemcpyDeviceToHost); + // std::cout<<"buckets summed"<<<<1,1,0,stream>>>(final_results, on_device ? final_result : d_final_result, 1, nof_bms, c); - + final_accumulation_kernel<<<1,1,0,stream>>>(final_results, ones_results, on_device ? 
final_result : d_final_result, 1, nof_bms, c); + // cudaDeviceSynchronize(); +// printf("cuda error %u\n",cudaGetLastError()); //copy final result to host cudaStreamSynchronize(stream); if (!on_device) @@ -288,15 +1560,23 @@ void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsi cudaFreeAsync(d_final_result, stream); } cudaFreeAsync(buckets, stream); + #ifndef PHASE1_TEST cudaFreeAsync(bucket_indices, stream); cudaFreeAsync(point_indices, stream); cudaFreeAsync(single_bucket_indices, stream); cudaFreeAsync(bucket_sizes, stream); cudaFreeAsync(nof_buckets_to_compute, stream); cudaFreeAsync(bucket_offsets, stream); + #endif + // cudaFreeAsync(sorted_bucket_sizes,stream); + // cudaFreeAsync(sorted_bucket_offsets,stream); + // cudaFreeAsync(sorted_single_bucket_indices,stream); cudaFreeAsync(final_results, stream); + cudaFreeAsync(ones_results, stream); cudaStreamSynchronize(stream); + + } //this function computes msm using the bucket method @@ -344,7 +1624,7 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin NUM_THREADS = 1 << 8; NUM_BLOCKS = (total_size * nof_bms + msm_size + NUM_THREADS - 1) / NUM_THREADS; split_scalars_kernel<<>>(bucket_indices + msm_size, point_indices + msm_size, d_scalars, total_size, - msm_log_size, nof_bms, bm_bitsize, c); //+size - leaving the first bm free for the out of place sort later + msm_log_size, nof_bms, bm_bitsize, c,0); //+size - leaving the first bm free for the out of place sort later //sort indices - the indices are sorted from smallest to largest in order to group together the points that belong to each bucket unsigned *sorted_bucket_indices; @@ -395,30 +1675,30 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin NUM_THREADS = 1 << 8; NUM_BLOCKS = (total_nof_buckets + NUM_THREADS - 1) / NUM_THREADS; accumulate_buckets_kernel<<>>(buckets, bucket_offsets, bucket_sizes, single_bucket_indices, sorted_point_indices, - d_points, nof_buckets, total_nof_buckets_to_compute, c+bm_bitsize); + d_points, nof_buckets, total_nof_buckets_to_compute, c+bm_bitsize,c); - #ifdef SSM_SUM - //sum each bucket - NUM_THREADS = 1 << 10; - NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS; - ssm_buckets_kernel<<>>(buckets, single_bucket_indices, nof_buckets, c); + // #ifdef SSM_SUM + // //sum each bucket + // NUM_THREADS = 1 << 10; + // NUM_BLOCKS = (nof_buckets + NUM_THREADS - 1) / NUM_THREADS; + // ssm_buckets_kernel<<>>(buckets, single_bucket_indices, nof_buckets, c); - //sum each bucket module - P* final_results; - cudaMallocAsync(&final_results, sizeof(P) * nof_bms, stream); - NUM_THREADS = 1<>>(buckets, final_results); - #endif - - #ifdef BIG_TRIANGLE + // //sum each bucket module + // P* final_results; + // cudaMalloc(&final_results, sizeof(P) * nof_bms); + // NUM_THREADS = 1<>>(buckets, final_results); + // #endif + + // #ifdef BIG_TRIANGLE P* bm_sums; cudaMallocAsync(&bm_sums, sizeof(P) * nof_bms * batch_size, stream); //launch the bucket module sum kernel - a thread for each bucket module NUM_THREADS = 1<<8; NUM_BLOCKS = (nof_bms*batch_size + NUM_THREADS - 1) / NUM_THREADS; big_triangle_sum_kernel<<>>(buckets, bm_sums, nof_bms*batch_size, c); - #endif + // #endif P* d_final_results; if (!on_device) @@ -427,8 +1707,10 @@ void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *poin //launch the double and add kernel, a single thread for each msm NUM_THREADS = 1<<8; NUM_BLOCKS = (batch_size + NUM_THREADS - 1) / NUM_THREADS; - 
final_accumulation_kernel<<>>(bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c); + final_accumulation_kernel<<>>(bm_sums,bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c); + final_accumulation_kernel<<>>(bm_sums,bm_sums, on_device ? final_results : d_final_results, batch_size, nof_bms, c); + //copy final result to host if (!on_device) cudaMemcpyAsync(final_results, d_final_results, sizeof(P)*batch_size, cudaMemcpyDeviceToHost, stream); @@ -532,18 +1814,19 @@ void reference_msm(S* scalars, A* a_points, unsigned size){ unsigned get_optimal_c(const unsigned size) { if (size < 17) return 1; - // return 15; + // return 17; return ceil(log2(size))-4; } //this function is used to compute msms of size larger than 256 template -void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream){ - unsigned c = get_optimal_c(size); - // unsigned c = 6; - // unsigned bitsize = 32; - unsigned bitsize = 255; - bucket_method_msm(bitsize, c, scalars, points, size, result, on_device, stream); +void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, bool big_triangle, cudaStream_t stream){ + // unsigned c = get_optimal_c(size); + unsigned c = 16; + // unsigned c = 8; + unsigned bitsize = S::NBITS; + // unsigned bitsize = 254; //get from field + bucket_method_msm(bitsize, c, scalars, points, size, result, on_device, big_triangle, stream); } // this function is used to compute a batches of msms of size larger than 256 @@ -555,4 +1838,4 @@ void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_ unsigned bitsize = 255; batched_bucket_method_msm(bitsize, c, scalars, points, batch_size, msm_size, result, on_device, stream); } -#endif +#endif \ No newline at end of file diff --git a/icicle/appUtils/msm/msm.cuh b/icicle/appUtils/msm/msm.cuh index c6e8b0566..7da8fdc65 100644 --- a/icicle/appUtils/msm/msm.cuh +++ b/icicle/appUtils/msm/msm.cuh @@ -3,7 +3,7 @@ #pragma once template -void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, cudaStream_t stream); +void bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned size, P* final_result, bool on_device, bool big_triangle, cudaStream_t stream); template void batched_bucket_method_msm(unsigned bitsize, unsigned c, S *scalars, A *points, unsigned batch_size, unsigned msm_size, P* final_results, bool on_device, cudaStream_t stream); @@ -12,7 +12,7 @@ template void batched_large_msm(S* scalars, A* points, unsigned batch_size, unsigned msm_size, P* result, bool on_device, cudaStream_t stream); template -void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, cudaStream_t stream); +void large_msm(S* scalars, A* points, unsigned size, P* result, bool on_device, bool big_triangle, cudaStream_t stream); template void short_msm(S *h_scalars, A *h_points, unsigned size, P* h_final_result, cudaStream_t stream); diff --git a/icicle/appUtils/msm/tests/msm_test.cu b/icicle/appUtils/msm/tests/msm_test.cu index 5833e9cc3..e12d221d8 100644 --- a/icicle/appUtils/msm/tests/msm_test.cu +++ b/icicle/appUtils/msm/tests/msm_test.cu @@ -5,15 +5,27 @@ #include "../../utils/cuda_utils.cuh" #include "../../primitives/projective.cuh" #include "../../primitives/field.cuh" -#include "../../curves/bls12_381/curve_config.cuh" +// #include "../../curves/bls12_377/curve_config.cuh" +#include "../../curves/bn254/curve_config.cuh" -using namespace 
BLS12_381; +// using namespace BLS12_377; +using namespace BN254; class Dummy_Scalar { public: static constexpr unsigned NBITS = 32; unsigned x; + unsigned p = 10; + // unsigned p = 1<<30; + + static HOST_DEVICE_INLINE Dummy_Scalar zero() { + return {0}; + } + + static HOST_DEVICE_INLINE Dummy_Scalar one() { + return {1}; + } friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Dummy_Scalar& scalar) { os << scalar.x; @@ -25,7 +37,7 @@ class Dummy_Scalar { } friend HOST_DEVICE_INLINE Dummy_Scalar operator+(Dummy_Scalar p1, const Dummy_Scalar& p2) { - return {p1.x+p2.x}; + return {(p1.x+p2.x)%p1.p}; } friend HOST_DEVICE_INLINE bool operator==(const Dummy_Scalar& p1, const Dummy_Scalar& p2) { @@ -36,11 +48,12 @@ class Dummy_Scalar { return (p1.x == p2); } - // static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar &scalar) { - // return {Dummy_Scalar::neg(point.x)}; - // } + static HOST_DEVICE_INLINE Dummy_Scalar neg(const Dummy_Scalar &scalar) { + return {scalar.p-scalar.x}; + } static HOST_INLINE Dummy_Scalar rand_host() { - return {(unsigned)rand()}; + return {(unsigned)rand()%10}; + // return {(unsigned)rand()}; } }; @@ -53,6 +66,10 @@ class Dummy_Projective { return {0}; } + static HOST_DEVICE_INLINE Dummy_Projective one() { + return {1}; + } + static HOST_DEVICE_INLINE Dummy_Projective to_affine(const Dummy_Projective &point) { return {point.x}; } @@ -61,9 +78,9 @@ class Dummy_Projective { return {point.x}; } - // static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective &point) { - // return {Dummy_Scalar::neg(point.x)}; - // } + static HOST_DEVICE_INLINE Dummy_Projective neg(const Dummy_Projective &point) { + return {Dummy_Scalar::neg(point.x)}; + } friend HOST_DEVICE_INLINE Dummy_Projective operator+(Dummy_Projective p1, const Dummy_Projective& p2) { return {p1.x+p2.x}; @@ -103,7 +120,8 @@ class Dummy_Projective { } static HOST_INLINE Dummy_Projective rand_host() { - return {(unsigned)rand()}; + return {(unsigned)rand()%10}; + // return {(unsigned)rand()}; } }; @@ -119,62 +137,99 @@ typedef affine_t test_affine; int main() { - unsigned batch_size = 4; - unsigned msm_size = 1<<15; + unsigned batch_size = 1; +// unsigned msm_size = 1<<21; + unsigned msm_size = 12180757; unsigned N = batch_size*msm_size; test_scalar *scalars = new test_scalar[N]; test_affine *points = new test_affine[N]; for (unsigned i=0;i(scalars, points, N, short_res); - for (unsigned i=0;i(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, false); + // for (unsigned i=0;i(scalars+msm_size*i, points+msm_size*i, msm_size, large_res+i, false); // std::cout<<"final result large"<(scalars, points, batch_size, msm_size, batched_large_res, false); + cudaStream_t stream1; + cudaStream_t stream2; + cudaStreamCreate(&stream1); + cudaStreamCreate(&stream2); + auto begin1 = std::chrono::high_resolution_clock::now(); + large_msm(scalars, points, msm_size, large_res, false, true,stream1); + auto end1 = std::chrono::high_resolution_clock::now(); + auto elapsed1 = std::chrono::duration_cast(end1 - begin1); + printf("Big Triangle : %.3f seconds.\n", elapsed1.count() * 1e-9); + // std::cout<(scalars, points, batch_size, msm_size, batched_large_res, false); - // large_msm(scalars, points, msm_size, large_res, false); + large_msm(scalars_d, points_d, msm_size, large_res_d, true, false,stream2); + // test_reduce_triangle(scalars); + // test_reduce_rectangle(scalars); + // test_reduce_single(scalars); + // test_reduce_var(scalars); auto end = std::chrono::high_resolution_clock::now(); auto 
elapsed = std::chrono::duration_cast(end - begin); - printf("Time measured: %.3f seconds.\n", elapsed.count() * 1e-9); - std::cout<(scalars, points, msm_size); + std::cout<(scalars, points, msm_size); + + // std::cout<<"final results batched large"< __global__ void add_sub_array(E* res, E* in1, E* in2, uint32_t n) { + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < n) { + res[tid] = SUB ? in1[tid] - in2[tid] : in1[tid] + in2[tid]; + } + } + + template + int sub_polys(E* d_out, E* d_in1, E* d_in2, unsigned n, cudaStream_t stream) { + uint32_t NUM_THREADS = MAX_THREADS_BATCH; + uint32_t NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS; + + add_sub_array <<>>(d_out, d_in1, d_in2, n); + + return 0; + } + + template + int add_polys(E* d_out, E* d_in1, E* d_in2, unsigned n, cudaStream_t stream) { + uint32_t NUM_THREADS = MAX_THREADS_BATCH; + uint32_t NUM_BLOCKS = (n + NUM_THREADS - 1) / NUM_THREADS; + + add_sub_array <<>>(d_out, d_in1, d_in2, n); + + return 0; + } + /** * Interpolate a batch of polynomials from their evaluations on the same subgroup. * Note: this function does not preform any bit-reverse permutations on its inputs or outputs. @@ -14,9 +41,9 @@ * @param n Length of `d_domain` array, also equal to the number of evaluations of each polynomial. * @param batch_size The size of the batch; the length of `d_evaluations` is `n` * `batch_size`. */ -template int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) { +template int interpolate_batch(E * d_out, E * d_evaluations, S * d_domain, unsigned n, unsigned batch_size, bool coset, S * coset_powers, cudaStream_t stream) { cudaMemcpyAsync(d_out, d_evaluations, sizeof(E) * n * batch_size, cudaMemcpyDeviceToDevice, stream); - ntt_inplace_batch_template(d_out, d_domain, n, batch_size, true, stream, true); + ntt_inplace_batch_template(d_out, d_domain, n, batch_size, true, coset, coset_powers, stream, true); return 0; } @@ -28,8 +55,8 @@ template int interpolate_batch(E * d_out, E * d_evaluat * @param d_domain Domain on which the polynomial is evaluated. Must be a subgroup. * @param n Length of `d_evaluations` and the size `d_domain` arrays (they should have equal length). 
*/ -template int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n, cudaStream_t stream) { - return interpolate_batch (d_out, d_evaluations, d_domain, n, 1, stream); +template int interpolate(E * d_out, E * d_evaluations, S * d_domain, unsigned n, bool coset, S * coset_powers, cudaStream_t stream) { + return interpolate_batch (d_out, d_evaluations, d_domain, n, 1, coset, coset_powers, stream); } template < typename E > __global__ void fill_array(E * arr, E val, uint32_t n) { @@ -73,8 +100,9 @@ int evaluate_batch(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_ if (coset) batch_vector_mult(coset_powers, d_out, domain_size, batch_size, stream); - - ntt_inplace_batch_template(d_out, d_domain, domain_size, batch_size, false, stream, true); + + S* _null = nullptr; + ntt_inplace_batch_template(d_out, d_domain, domain_size, batch_size, false, false, _null, stream, true); return 0; } @@ -96,22 +124,26 @@ int evaluate(E * d_out, E * d_coefficients, S * d_domain, unsigned domain_size, template int interpolate_scalars(S* d_out, S* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + S* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } template int interpolate_scalars_batch(S* d_out, S* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + S* _null = nullptr; + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } template int interpolate_points(E* d_out, E* d_evaluations, S* d_domain, unsigned n, cudaStream_t stream) { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + S* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } template int interpolate_points_batch(E* d_out, E* d_evaluations, S* d_domain, unsigned n, unsigned batch_size, cudaStream_t stream) { - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + S* _null = nullptr; + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } template @@ -139,6 +171,18 @@ int evaluate_points_batch(E* d_out, E* d_coefficients, S* d_domain, return evaluate_batch(d_out, d_coefficients, d_domain, domain_size, n, batch_size, false, _null, stream); } +template +int interpolate_scalars_on_coset(S* d_out, S* d_evaluations, S* d_domain, + unsigned n, S* coset_powers, cudaStream_t stream) { + return interpolate(d_out, d_evaluations, d_domain, n, true, coset_powers, stream); +} + +template +int interpolate_scalars_on_coset_batch(S* d_out, S* d_evaluations, S* d_domain, + unsigned n, unsigned batch_size, S* coset_powers, cudaStream_t stream) { + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, true, coset_powers, stream); +} + template int evaluate_scalars_on_coset(S* d_out, S* d_coefficients, S* d_domain, unsigned domain_size, unsigned n, S* coset_powers, cudaStream_t stream) { diff --git a/icicle/appUtils/ntt/ntt.cuh b/icicle/appUtils/ntt/ntt.cuh index bb53e97f1..5456911dd 100644 --- a/icicle/appUtils/ntt/ntt.cuh +++ b/icicle/appUtils/ntt/ntt.cuh @@ -3,6 +3,7 @@ #pragma once #include "../../utils/sharedmem.cuh" +#include "../vector_manipulation/ve_mod_mult.cuh" const uint32_t MAX_NUM_THREADS = 1024; const uint32_t MAX_THREADS_BATCH = 512; //TODO: allows 100% occupancy for scalar NTT for sm_86..sm_89 @@ 
-83,19 +84,11 @@ template < typename T > void reverse_order(T* arr, uint32_t n, uint32_t logn, cu } -/** - * Multiply the elements of an input array by a scalar in-place. - * @param arr input array. - * @param n size of arr. - * @param n_inv scalar of type S (scalar). - */ -template < typename E, typename S > __global__ void template_normalize_kernel(E * arr, uint32_t n, S scalar) { - int tid = (blockIdx.x * blockDim.x) + threadIdx.x; - if (tid < n) { - arr[tid] = scalar * arr[tid]; - } -} - +enum Decimation { + NONE = 0, + DIF = 1, + DIT = 2, +}; /** * Cooley-Tuckey NTT. @@ -288,8 +281,16 @@ __global__ void ntt_template_kernel(E *arr, uint32_t n, S *twiddles, uint32_t n_ * @param d_twiddles * @param n Length of `d_twiddles` array * @param batch_size The size of the batch; the length of `d_inout` is `n` * `batch_size`. + * @param inverse true for iNTT + * @param is_coset true for multiplication by coset + * @param coset should be array of lenght n - or in case of lesser than n, right-padded with zeroes + * @param stream CUDA stream + * @param is_sync_needed do perform sync of the supplied CUDA stream at the end of processing */ -template void ntt_inplace_batch_template(E * d_inout, S * d_twiddles, unsigned n, unsigned batch_size, bool inverse, cudaStream_t stream, bool is_sync_needed) { +template void ntt_inplace_batch_template( + E * d_inout, S * d_twiddles, unsigned n, unsigned batch_size, bool inverse, + bool is_coset, S * coset, cudaStream_t stream, bool is_sync_needed) +{ const int logn = int(log(n) / log(2)); bool is_shared_mem_enabled = sizeof(E) <= MAX_SHARED_MEM_ELEMENT_SIZE; const int log2_shmem_elems = is_shared_mem_enabled ? int(log(int(MAX_SHARED_MEM / sizeof(E))) / log(2)) : logn; @@ -309,12 +310,16 @@ template void ntt_inplace_batch_template(E * d_inout, S ntt_template_kernel <<>>(d_inout, n, d_twiddles, n, total_tasks, s, false); } + if (is_coset) batch_vector_mult(coset, d_inout, n, batch_size, stream); + num_threads = min(n / 2, MAX_NUM_THREADS); num_blocks = (n * batch_size + num_threads - 1) / num_threads; template_normalize_kernel <<>> (d_inout, n * batch_size, S::inv_log_size(logn)); } else { + if (is_coset) batch_vector_mult(coset, d_inout, n, batch_size, stream); + for (int s = logn - 1; s >= logn_shmem; s--) // TODO: this loop also can be unrolled { ntt_template_kernel<<>>(d_inout, n, d_twiddles, n, total_tasks, s, true); @@ -353,8 +358,9 @@ template void ntt_inplace_batch_template(E * d_inout, S cudaMemcpyAsync(d_arr, arr, size_E, cudaMemcpyHostToDevice, stream); int NUM_THREADS = MAX_THREADS_BATCH; int NUM_BLOCKS = (batches + NUM_THREADS - 1) / NUM_THREADS; - - ntt_inplace_batch_template(d_arr, d_twiddles, n, batches, inverse, stream, false); + + S* _null = nullptr; + ntt_inplace_batch_template(d_arr, d_twiddles, n, batches, inverse, false, _null, stream, false); cudaMemcpyAsync(arr, d_arr, size_E, cudaMemcpyDeviceToHost, stream); cudaFreeAsync(d_arr, stream); diff --git a/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh b/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh index 6bbf9a40a..236ad0079 100644 --- a/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh +++ b/icicle/appUtils/vector_manipulation/ve_mod_mult.cuh @@ -7,6 +7,19 @@ #define MAX_THREADS_PER_BLOCK 256 +/** + * Multiply the elements of an input array by a scalar in-place. + * @param arr input array. + * @param n size of arr. + * @param n_inv scalar of type S (scalar). 
+ */ + template < typename E, typename S > __global__ void template_normalize_kernel(E * arr, uint32_t n, S scalar) { + int tid = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tid < n) { + arr[tid] = scalar * arr[tid]; + } + } + // TODO: headers for prototypes and .c .cpp .cu files for implementations template __global__ void vectorModMult(S *scalar_vec, E *element_vec, E *result, size_t n_elments) @@ -49,6 +62,18 @@ int vector_mod_mult(S *vec_a, E *vec_b, E *result, size_t n_elments, cudaStream_ return 0; } +template +int vector_mod_mult_device(S *d_vec_a, E *d_vec_b, E *d_result, size_t n_elments) // TODO: in place so no need for third result vector +{ + // Set the grid and block dimensions + int num_blocks = (int)ceil((float)n_elments / MAX_THREADS_PER_BLOCK); + int threads_per_block = MAX_THREADS_PER_BLOCK; + + // Call the kernel to perform element-wise modular multiplication + vectorModMult<<>>(d_vec_a, d_vec_b, d_result, n_elments); + return 0; +} + template __global__ void batchVectorMult(S *scalar_vec, E *element_vec, unsigned n_scalars, unsigned batch_size) { diff --git a/icicle/curves/bls12_377/c_api.h b/icicle/curves/bls12_377/c_api.h new file mode 100644 index 000000000..34d1aa10f --- /dev/null +++ b/icicle/curves/bls12_377/c_api.h @@ -0,0 +1,33 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
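For reference, the element-wise helpers added to icicle/appUtils/vector_manipulation/ve_mod_mult.cuh above (template_normalize_kernel, vectorModMult, vector_mod_mult_device) all use the same one-thread-per-element layout with ceil-division grid sizing. The standalone sketch below reproduces that launch pattern outside the library; uint32_t and the small SKETCH_MODULUS constant are stand-ins for the field types E and S and are not part of the patch.

// Illustrative sketch only: uint32_t and SKETCH_MODULUS replace the field types E/S.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

#define SKETCH_THREADS_PER_BLOCK 256
#define SKETCH_MODULUS 65521u // arbitrary small prime chosen for the example

__global__ void scale_mod_kernel(uint32_t *arr, uint32_t n, uint32_t scalar) {
  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (tid < n) // same shape as template_normalize_kernel: in-place scale of each element
    arr[tid] = (uint32_t)(((uint64_t)arr[tid] * scalar) % SKETCH_MODULUS);
}

int main() {
  const uint32_t n = 1u << 10;
  uint32_t *h_arr = new uint32_t[n];
  for (uint32_t i = 0; i < n; i++) h_arr[i] = i;

  uint32_t *d_arr;
  cudaMalloc(&d_arr, n * sizeof(uint32_t));
  cudaMemcpy(d_arr, h_arr, n * sizeof(uint32_t), cudaMemcpyHostToDevice);

  // Same grid sizing as vector_mod_mult_device: ceil(n / threads_per_block) blocks.
  int num_blocks = (n + SKETCH_THREADS_PER_BLOCK - 1) / SKETCH_THREADS_PER_BLOCK;
  scale_mod_kernel<<<num_blocks, SKETCH_THREADS_PER_BLOCK>>>(d_arr, n, 3u);
  cudaDeviceSynchronize();

  cudaMemcpy(h_arr, d_arr, n * sizeof(uint32_t), cudaMemcpyDeviceToHost);
  printf("arr[5] = %u (expected 15)\n", h_arr[5]);

  cudaFree(d_arr);
  delete[] h_arr;
  return 0;
}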
+ +// Code generated by Ingonyama DO NOT EDIT + + +#include +#include +// c_api.h + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BLS12377_projective_t BLS12377_projective_t; + +bool eq_bls12_377(BLS12377_projective_t *point1, BLS12377_projective_t *point2, size_t device_id); + +#ifdef __cplusplus +} +#endif diff --git a/icicle/curves/bls12_377/curve_config.cuh b/icicle/curves/bls12_377/curve_config.cuh index 1b1c95c4a..367fd061a 100644 --- a/icicle/curves/bls12_377/curve_config.cuh +++ b/icicle/curves/bls12_377/curve_config.cuh @@ -2,6 +2,9 @@ #include "../../primitives/field.cuh" #include "../../primitives/projective.cuh" +#if defined(G2_DEFINED) +#include "../../primitives/extension_field.cuh" +#endif #include "params.cuh" diff --git a/icicle/curves/bls12_377/lde.cu b/icicle/curves/bls12_377/lde.cu index e7e8b15f5..b4fcc80cf 100644 --- a/icicle/curves/bls12_377/lde.cu +++ b/icicle/curves/bls12_377/lde.cu @@ -24,7 +24,7 @@ extern "C" BLS12_377::scalar_t* build_domain_cuda_bls12_377(uint32_t domain_size } } -extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -39,7 +39,7 @@ extern "C" int ntt_cuda_bls12_377(BLS12_377::scalar_t *arr, uint32_t n, bool inv } } -extern "C" int ecntt_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ecntt_cuda_bls12_377(BLS12_377::projective_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -85,7 +85,8 @@ extern "C" int interpolate_scalars_cuda_bls12_377(BLS12_377::scalar_t* d_out, BL { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BLS12_377::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -99,8 +100,9 @@ extern "C" int interpolate_scalars_batch_cuda_bls12_377(BLS12_377::scalar_t* d_o { try { + BLS12_377::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -113,7 +115,8 @@ extern "C" int interpolate_points_cuda_bls12_377(BLS12_377::projective_t* d_out, { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BLS12_377::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -127,8 +130,9 @@ extern "C" int interpolate_points_batch_cuda_bls12_377(BLS12_377::projective_t* { try { + BLS12_377::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -267,7 +271,8 @@ extern "C" int ntt_inplace_batch_cuda_bls12_377(BLS12_377::scalar_t* d_inout, BL try { cudaStreamCreate(&stream); - ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, stream, true); + BLS12_377::scalar_t* _null = nullptr; + ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, false, 
_null, stream, true); return CUDA_SUCCESS; //TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/ } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bls12_377/msm.cu b/icicle/curves/bls12_377/msm.cu index 73332ddbe..77d44a32f 100644 --- a/icicle/curves/bls12_377/msm.cu +++ b/icicle/curves/bls12_377/msm.cu @@ -12,7 +12,7 @@ int msm_cuda_bls12_377(BLS12_377::projective_t *out, BLS12_377::affine_t points[ { try { - large_msm(scalars, points, count, out, false, stream); + large_msm(scalars, points, count, out, false, false, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -53,7 +53,7 @@ extern "C" int msm_batch_cuda_bls12_377(BLS12_377::projective_t* out, BLS12_377: { try { - large_msm(d_scalars, d_points, count, d_out, true, stream); + large_msm(d_scalars, d_points, count, d_out, true, false, stream); cudaStreamSynchronize(stream); return 0; } diff --git a/icicle/curves/bls12_377/msm.h b/icicle/curves/bls12_377/msm.h new file mode 100644 index 000000000..fdfcd7418 --- /dev/null +++ b/icicle/curves/bls12_377/msm.h @@ -0,0 +1,53 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + + +#include +#include +// msm.h + +#ifndef _BLS12377_MSM_H +#define _BLS12377_MSM_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BLS12377 projective and affine structs +typedef struct BLS12377_projective_t BLS12377_projective_t; +typedef struct BLS12377_affine_t BLS12377_affine_t; +typedef struct BLS12377_scalar_t BLS12377_scalar_t; + +int msm_cuda_bls12_377(BLS12377_projective_t* out, BLS12377_affine_t* points, + BLS12377_scalar_t* scalars, size_t count, size_t device_id); + +int msm_batch_cuda_bls12_377(BLS12377_projective_t* out, BLS12377_affine_t* points, + BLS12377_scalar_t* scalars, size_t batch_size, + size_t msm_size, size_t device_id); + +int commit_cuda_bls12_377(BLS12377_projective_t* d_out, BLS12377_scalar_t* d_scalars, + BLS12377_affine_t* d_points, size_t count, size_t device_id); + +int commit_batch_cuda_bls12_377(BLS12377_projective_t* d_out, BLS12377_scalar_t* d_scalars, + BLS12377_affine_t* d_points, size_t count, + size_t batch_size, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12377_MSM_H */ diff --git a/icicle/curves/bls12_377/ntt.h b/icicle/curves/bls12_377/ntt.h new file mode 100644 index 000000000..19842a7f9 --- /dev/null +++ b/icicle/curves/bls12_377/ntt.h @@ -0,0 +1,44 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ntt.h + +#ifndef _BLS12377_NTT_H +#define _BLS12377_NTT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BLS12377 projective and affine structs +typedef struct BLS12377_projective_t BLS12377_projective_t; +typedef struct BLS12377_affine_t BLS12377_affine_t; +typedef struct BLS12377_scalar_t BLS12377_scalar_t; + +int ntt_cuda_bls12_377(BLS12377_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id); +int ntt_batch_cuda_bls12_377(BLS12377_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +int ecntt_cuda_bls12_377(BLS12377_projective_t *arr, uint32_t n, bool inverse, size_t device_id); +int ecntt_batch_cuda_bls12_377(BLS12377_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12377_NTT_H */ diff --git a/icicle/curves/bls12_377/params.cuh b/icicle/curves/bls12_377/params.cuh index a60375592..bd1de1084 100644 --- a/icicle/curves/bls12_377/params.cuh +++ b/icicle/curves/bls12_377/params.cuh @@ -153,7 +153,7 @@ namespace PARAMS_BLS12_377{ static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; // i^2, the square of the imaginary unit for the extension field - static constexpr uint32_t i_squared = 1; + static constexpr uint32_t i_squared = 5; // true if i^2 is negative static constexpr bool i_squared_is_negative = true; // G1 and G2 generators diff --git a/icicle/curves/bls12_377/ve_mod_mult.h b/icicle/curves/bls12_377/ve_mod_mult.h new file mode 100644 index 000000000..0da1817c6 --- /dev/null +++ b/icicle/curves/bls12_377/ve_mod_mult.h @@ -0,0 +1,41 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
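In the same spirit, the add_polys/sub_polys helpers added to icicle/appUtils/ntt/lde.cu earlier in this patch share a single add_sub_array kernel whose boolean template parameter selects subtraction or addition at compile time, so each instantiation carries only one branch. A minimal standalone sketch of that pattern follows; uint32_t stands in for the field element type E and the buffer contents are arbitrary.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// SUB is resolved at compile time, mirroring add_sub_array<E, SUB> in lde.cu.
template <bool SUB>
__global__ void add_sub_sketch(uint32_t *res, const uint32_t *in1, const uint32_t *in2, uint32_t n) {
  int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
  if (tid < n)
    res[tid] = SUB ? in1[tid] - in2[tid] : in1[tid] + in2[tid];
}

int main() {
  const uint32_t n = 1u << 8;
  uint32_t *d_a, *d_b, *d_out;
  cudaMalloc(&d_a, n * sizeof(uint32_t));
  cudaMalloc(&d_b, n * sizeof(uint32_t));
  cudaMalloc(&d_out, n * sizeof(uint32_t));
  cudaMemset(d_a, 0, n * sizeof(uint32_t));
  cudaMemset(d_b, 0, n * sizeof(uint32_t));

  // Same ceil-division launch geometry as the add_polys/sub_polys wrappers.
  const uint32_t threads = 256;
  const uint32_t blocks = (n + threads - 1) / threads;
  add_sub_sketch<false><<<blocks, threads>>>(d_out, d_a, d_b, n); // addition path
  add_sub_sketch<true><<<blocks, threads>>>(d_out, d_a, d_b, n);  // subtraction path
  cudaDeviceSynchronize();
  printf("launch status: %d\n", (int)cudaGetLastError());

  cudaFree(d_a); cudaFree(d_b); cudaFree(d_out);
  return 0;
}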
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ve_mod_mult.h + +#ifndef _BLS12377_VEC_MULT_H +#define _BLS12377_VEC_MULT_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BLS12377_projective_t BLS12377_projective_t; +typedef struct BLS12377_scalar_t BLS12377_scalar_t; + +int32_t vec_mod_mult_point_bls12_377(BLS12377_projective_t *inout, BLS12377_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_scalar_bls12_377(BLS12377_scalar_t *inout, BLS12377_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t matrix_vec_mod_mult_bls12_377(BLS12377_scalar_t *matrix_flattened, BLS12377_scalar_t *input, BLS12377_scalar_t *output, size_t n_elments, size_t device_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12377_VEC_MULT_H */ diff --git a/icicle/curves/bls12_381/c_api.h b/icicle/curves/bls12_381/c_api.h new file mode 100644 index 000000000..605628550 --- /dev/null +++ b/icicle/curves/bls12_381/c_api.h @@ -0,0 +1,32 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// c_api.h + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BLS12381_projective_t BLS12381_projective_t; + +bool eq_bls12_381(BLS12381_projective_t *point1, BLS12381_projective_t *point2, size_t device_id); + +#ifdef __cplusplus +} +#endif diff --git a/icicle/curves/bls12_381/curve_config.cuh b/icicle/curves/bls12_381/curve_config.cuh index c1f6781ea..24951fa56 100644 --- a/icicle/curves/bls12_381/curve_config.cuh +++ b/icicle/curves/bls12_381/curve_config.cuh @@ -2,6 +2,9 @@ #include "../../primitives/field.cuh" #include "../../primitives/projective.cuh" +#if defined(G2_DEFINED) +#include "../../primitives/extension_field.cuh" +#endif #include "params.cuh" diff --git a/icicle/curves/bls12_381/lde.cu b/icicle/curves/bls12_381/lde.cu index a79f4a5be..7bd92f89b 100644 --- a/icicle/curves/bls12_381/lde.cu +++ b/icicle/curves/bls12_381/lde.cu @@ -24,7 +24,7 @@ extern "C" BLS12_381::scalar_t* build_domain_cuda_bls12_381(uint32_t domain_size } } -extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -39,7 +39,7 @@ extern "C" int ntt_cuda_bls12_381(BLS12_381::scalar_t *arr, uint32_t n, bool inv } } -extern "C" int ecntt_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ecntt_cuda_bls12_381(BLS12_381::projective_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -85,7 +85,8 @@ extern "C" int interpolate_scalars_cuda_bls12_381(BLS12_381::scalar_t* d_out, BL { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BLS12_381::scalar_t* 
_null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -99,11 +100,9 @@ extern "C" int interpolate_scalars_batch_cuda_bls12_381(BLS12_381::scalar_t* d_o { try { - cudaStreamCreate(&stream); //TODO: we should avoid creating stream if default (cudaStream_t stream = 0) is passed. - // but default is not working as expected as valgrind still reports errors - auto result_code = interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); - cudaStreamDestroy(stream); //TODO: hotfix for not freeing memory - return result_code; + BLS12_381::scalar_t* _null = nullptr; + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -116,7 +115,8 @@ extern "C" int interpolate_points_cuda_bls12_381(BLS12_381::projective_t* d_out, { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BLS12_381::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -130,10 +130,9 @@ extern "C" int interpolate_points_batch_cuda_bls12_381(BLS12_381::projective_t* { try { + BLS12_381::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - auto result_code = interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); - cudaStreamDestroy(stream); - return result_code; + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -276,7 +275,8 @@ extern "C" int ntt_inplace_batch_cuda_bls12_381(BLS12_381::scalar_t* d_inout, BL try { cudaStreamCreate(&stream); - ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, stream, true); + BLS12_381::scalar_t* _null = nullptr; + ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, false, _null, stream, true); return CUDA_SUCCESS; //TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/ } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bls12_381/msm.cu b/icicle/curves/bls12_381/msm.cu index c32efa45f..4be352e29 100644 --- a/icicle/curves/bls12_381/msm.cu +++ b/icicle/curves/bls12_381/msm.cu @@ -12,7 +12,7 @@ int msm_cuda_bls12_381(BLS12_381::projective_t *out, BLS12_381::affine_t points[ { try { - large_msm(scalars, points, count, out, false, stream); + large_msm(scalars, points, count, out, false, false, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -52,7 +52,7 @@ extern "C" int msm_batch_cuda_bls12_381(BLS12_381::projective_t* out, BLS12_381: { try { - large_msm(d_scalars, d_points, count, d_out, true, stream); + large_msm(d_scalars, d_points, count, d_out, true, false, stream); cudaStreamSynchronize(stream); return 0; } diff --git a/icicle/curves/bls12_381/msm.h b/icicle/curves/bls12_381/msm.h new file mode 100644 index 000000000..2e78a083f --- /dev/null +++ b/icicle/curves/bls12_381/msm.h @@ -0,0 +1,53 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + + +#include +#include +// msm.h + +#ifndef _BLS12381_MSM_H +#define _BLS12381_MSM_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BLS12381 projective and affine structs +typedef struct BLS12381_projective_t BLS12381_projective_t; +typedef struct BLS12381_affine_t BLS12381_affine_t; +typedef struct BLS12381_scalar_t BLS12381_scalar_t; + +int msm_cuda_bls12_381(BLS12381_projective_t* out, BLS12381_affine_t* points, + BLS12381_scalar_t* scalars, size_t count, size_t device_id); + +int msm_batch_cuda_bls12_381(BLS12381_projective_t* out, BLS12381_affine_t* points, + BLS12381_scalar_t* scalars, size_t batch_size, + size_t msm_size, size_t device_id); + +int commit_cuda_bls12_381(BLS12381_projective_t* d_out, BLS12381_scalar_t* d_scalars, + BLS12381_affine_t* d_points, size_t count, size_t device_id); + +int commit_batch_cuda_bls12_381(BLS12381_projective_t* d_out, BLS12381_scalar_t* d_scalars, + BLS12381_affine_t* d_points, size_t count, + size_t batch_size, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12381_MSM_H */ diff --git a/icicle/curves/bls12_381/ntt.h b/icicle/curves/bls12_381/ntt.h new file mode 100644 index 000000000..3e4ac4054 --- /dev/null +++ b/icicle/curves/bls12_381/ntt.h @@ -0,0 +1,44 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ntt.h + +#ifndef _BLS12381_NTT_H +#define _BLS12381_NTT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BLS12381 projective and affine structs +typedef struct BLS12381_projective_t BLS12381_projective_t; +typedef struct BLS12381_affine_t BLS12381_affine_t; +typedef struct BLS12381_scalar_t BLS12381_scalar_t; + +int ntt_cuda_bls12_381(BLS12381_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id); +int ntt_batch_cuda_bls12_381(BLS12381_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +int ecntt_cuda_bls12_381(BLS12381_projective_t *arr, uint32_t n, bool inverse, size_t device_id); +int ecntt_batch_cuda_bls12_381(BLS12381_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12381_NTT_H */ diff --git a/icicle/curves/bls12_381/params.cuh b/icicle/curves/bls12_381/params.cuh index 3589bd577..7de524dcd 100644 --- a/icicle/curves/bls12_381/params.cuh +++ b/icicle/curves/bls12_381/params.cuh @@ -21,6 +21,9 @@ namespace PARAMS_BLS12_381{ // 2*modulus^2 static constexpr storage<2*limbs_count> modulus_squared_2 = {0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc, 0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7}; + // note: doesn't actually fit into 384 bits, and shouldn't be used! is added for compilation + static constexpr storage<2*limbs_count> modulus_squared_4 = {0x00000002, 0xfffffffc, 0xfff96ffd, 0x4efd200f, 0x39b7600b, 0xd315c004, 0xa867ef70, 0x915482bc, + 0x95538cc2, 0x84c23ede, 0xb326943b, 0x1d2b27f2, 0xde59841e, 0xa41827b7, 0xe9784ef0, 0x68fec1e7}; static constexpr unsigned modulus_bit_count = 255; // m = floor(2^(2*modulus_bit_count) / modulus) static constexpr storage m = {0x830358e4, 0x509cde80, 0x2f92eb5c, 0xd9410fad, 0xc1f823b4, 0xe2d772d, 0x7fb78ddf, 0x8d54253b}; diff --git a/icicle/curves/bls12_381/supported_operations.cu b/icicle/curves/bls12_381/supported_operations.cu index 314e9f719..11be2dbda 100644 --- a/icicle/curves/bls12_381/supported_operations.cu +++ b/icicle/curves/bls12_381/supported_operations.cu @@ -2,4 +2,4 @@ #include "lde.cu" #include "msm.cu" #include "ve_mod_mult.cu" -#include "poseidon.cu" \ No newline at end of file +#include "poseidon.cu" diff --git a/icicle/curves/bls12_381/ve_mod_mult.h b/icicle/curves/bls12_381/ve_mod_mult.h new file mode 100644 index 000000000..05627ebc7 --- /dev/null +++ b/icicle/curves/bls12_381/ve_mod_mult.h @@ -0,0 +1,41 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ve_mod_mult.h + +#ifndef _BLS12381_VEC_MULT_H +#define _BLS12381_VEC_MULT_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BLS12381_projective_t BLS12381_projective_t; +typedef struct BLS12381_scalar_t BLS12381_scalar_t; + +int32_t vec_mod_mult_point_bls12_381(BLS12381_projective_t *inout, BLS12381_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_scalar_bls12_381(BLS12381_scalar_t *inout, BLS12381_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t matrix_vec_mod_mult_bls12_381(BLS12381_scalar_t *matrix_flattened, BLS12381_scalar_t *input, BLS12381_scalar_t *output, size_t n_elments, size_t device_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* _BLS12381_VEC_MULT_H */ diff --git a/icicle/curves/bn254/c_api.h b/icicle/curves/bn254/c_api.h new file mode 100644 index 000000000..dde669012 --- /dev/null +++ b/icicle/curves/bn254/c_api.h @@ -0,0 +1,34 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// c_api.h + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_g2_projective_t BN254_g2_projective_t; + +bool eq_bn254(BN254_projective_t *point1, BN254_projective_t *point2); +bool eq_g2_bn254(BN254_g2_projective_t *point1, BN254_g2_projective_t *point2); + +#ifdef __cplusplus +} +#endif diff --git a/icicle/curves/bn254/cuda.h b/icicle/curves/bn254/cuda.h new file mode 100644 index 000000000..fc05e1b1d --- /dev/null +++ b/icicle/curves/bn254/cuda.h @@ -0,0 +1,14752 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef __cuda_cuda_h__ +#define __cuda_cuda_h__ + +#include +#ifdef _MSC_VER +typedef unsigned __int32 cuuint32_t; +typedef unsigned __int64 cuuint64_t; +#else +#include +typedef uint32_t cuuint32_t; +typedef uint64_t cuuint64_t; +#endif + +/** + * CUDA API versioning support + */ +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif + +#if defined(CUDA_FORCE_API_VERSION) + #if (CUDA_FORCE_API_VERSION == 3010) + #define __CUDA_API_VERSION 3010 + #else + #error "Unsupported value of CUDA_FORCE_API_VERSION" + #endif +#else + #define __CUDA_API_VERSION 10010 +#endif /* CUDA_FORCE_API_VERSION */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define __CUDA_API_PER_THREAD_DEFAULT_STREAM + #define __CUDA_API_PTDS(api) api ## _ptds + #define __CUDA_API_PTSZ(api) api ## _ptsz +#else + #define __CUDA_API_PTDS(api) api + #define __CUDA_API_PTSZ(api) api +#endif + +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020 + #define cuDeviceTotalMem cuDeviceTotalMem_v2 + #define cuCtxCreate cuCtxCreate_v2 + #define cuModuleGetGlobal cuModuleGetGlobal_v2 + #define cuMemGetInfo cuMemGetInfo_v2 + #define cuMemAlloc cuMemAlloc_v2 + #define cuMemAllocPitch cuMemAllocPitch_v2 + #define cuMemFree cuMemFree_v2 + #define cuMemGetAddressRange cuMemGetAddressRange_v2 + #define cuMemAllocHost cuMemAllocHost_v2 + #define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 + #define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) + #define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) + #define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) + #define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) + #define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) + #define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) + #define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) + #define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) + #define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) + #define cuMemcpyAtoHAsync 
__CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) + #define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) + #define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) + #define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) + #define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) + #define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) + #define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) + #define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) + #define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) + #define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) + #define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) + #define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2) + #define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) + #define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2) + #define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) + #define cuArrayCreate cuArrayCreate_v2 + #define cuArrayGetDescriptor cuArrayGetDescriptor_v2 + #define cuArray3DCreate cuArray3DCreate_v2 + #define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 + #define cuTexRefSetAddress cuTexRefSetAddress_v2 + #define cuTexRefGetAddress cuTexRefGetAddress_v2 + #define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 3020 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4000 + #define cuCtxDestroy cuCtxDestroy_v2 + #define cuCtxPopCurrent cuCtxPopCurrent_v2 + #define cuCtxPushCurrent cuCtxPushCurrent_v2 + #define cuStreamDestroy cuStreamDestroy_v2 + #define cuEventDestroy cuEventDestroy_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4000 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4010 + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4010 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050 + #define cuLinkCreate cuLinkCreate_v2 + #define cuLinkAddData cuLinkAddData_v2 + #define cuLinkAddFile cuLinkAddFile_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050 + #define cuMemHostRegister cuMemHostRegister_v2 + #define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 10010 + #define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture_v2) +#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture) +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 10010 */ + +#if !defined(__CUDA_API_VERSION_INTERNAL) +#if defined(__CUDA_API_VERSION) && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v2 +#endif /* __CUDA_API_VERSION && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 */ +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define cuMemcpy __CUDA_API_PTDS(cuMemcpy) + #define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync) + #define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer) + #define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync) + #define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer) + #define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync) + #define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync) + + #define 
cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async) + #define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async) + #define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async) + #define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async) + #define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async) + #define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async) + + #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) + #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) + #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx) + #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) + #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture) + #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing) + #define cuStreamGetCaptureInfo __CUDA_API_PTSZ(cuStreamGetCaptureInfo) + #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) + #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) + #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) + #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) + #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) + #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) + #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) + #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) + #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) + + #define cuStreamWriteValue32 __CUDA_API_PTSZ(cuStreamWriteValue32) + #define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32) + #define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64) + #define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64) + #define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp) + + #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) + + #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync) + #define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync) + + #define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch) +#endif + +/** + * \file cuda.h + * \brief Header file for the CUDA Toolkit application programming interface. + * + * \file cudaGL.h + * \brief Header file for the OpenGL interoperability functions of the + * low-level CUDA driver application programming interface. + * + * \file cudaD3D9.h + * \brief Header file for the Direct3D 9 interoperability functions of the + * low-level CUDA driver application programming interface. + */ + +/** + * \defgroup CUDA_TYPES Data types used by CUDA driver + * @{ + */ + +/** + * CUDA API version number + */ +#define CUDA_VERSION 10010 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * CUDA device pointer + * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. 
+ */ +#if __CUDA_API_VERSION >= 3020 + +#if defined(_WIN64) || defined(__LP64__) +typedef unsigned long long CUdeviceptr; +#else +typedef unsigned int CUdeviceptr; +#endif + +#endif /* __CUDA_API_VERSION >= 3020 */ + +typedef int CUdevice; /**< CUDA device */ +typedef struct CUctx_st *CUcontext; /**< CUDA context */ +typedef struct CUmod_st *CUmodule; /**< CUDA module */ +typedef struct CUfunc_st *CUfunction; /**< CUDA function */ +typedef struct CUarray_st *CUarray; /**< CUDA array */ +typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ +typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ +typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ +typedef struct CUevent_st *CUevent; /**< CUDA event */ +typedef struct CUstream_st *CUstream; /**< CUDA stream */ +typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ +typedef unsigned long long CUtexObject; /**< An opaque value that represents a CUDA texture object */ +typedef unsigned long long CUsurfObject; /**< An opaque value that represents a CUDA surface object */ +typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */ +typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */ +typedef struct CUgraph_st *CUgraph; /**< CUDA graph */ +typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */ +typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */ + +#ifndef CU_UUID_HAS_BEEN_DEFINED +#define CU_UUID_HAS_BEEN_DEFINED +typedef struct CUuuid_st { /**< CUDA definition of UUID */ + char bytes[16]; +} CUuuid; +#endif + +#if __CUDA_API_VERSION >= 4010 + +/** + * CUDA IPC handle size + */ +#define CU_IPC_HANDLE_SIZE 64 + +/** + * CUDA IPC event handle + */ +typedef struct CUipcEventHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcEventHandle; + +/** + * CUDA IPC mem handle + */ +typedef struct CUipcMemHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcMemHandle; + +/** + * CUDA Ipc Mem Flags + */ +typedef enum CUipcMem_flags_enum { + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */ +} CUipcMem_flags; + +#endif + +/** + * CUDA Mem Attach Flags + */ +typedef enum CUmemAttach_flags_enum { + CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */ + CU_MEM_ATTACH_HOST = 0x2, /**< Memory cannot be accessed by any stream on any device */ + CU_MEM_ATTACH_SINGLE = 0x4 /**< Memory can only be accessed by a single stream on the associated device */ +} CUmemAttach_flags; + +/** + * Context creation flags + */ +typedef enum CUctx_flags_enum { + CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ + CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling + * \deprecated This flag was deprecated as of CUDA 4.0 + * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. 
*/ + CU_CTX_SCHED_MASK = 0x07, + CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ + CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ + CU_CTX_FLAGS_MASK = 0x1f +} CUctx_flags; + +/** + * Stream creation flags + */ +typedef enum CUstream_flags_enum { + CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ + CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ +} CUstream_flags; + +/** + * Legacy stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with legacy synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_LEGACY ((CUstream)0x1) + +/** + * Per-thread stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with per-thread synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_PER_THREAD ((CUstream)0x2) + +/** + * Event creation flags + */ +typedef enum CUevent_flags_enum { + CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ + CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ + CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ + CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ +} CUevent_flags; + +#if __CUDA_API_VERSION >= 8000 +/** + * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 + */ +typedef enum CUstreamWaitValue_flags_enum { + CU_STREAM_WAIT_VALUE_GEQ = 0x0, /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit + values). Note this is a cyclic comparison which ignores wraparound. + (Default behavior.) */ + CU_STREAM_WAIT_VALUE_EQ = 0x1, /**< Wait until *addr == value. */ + CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */ + CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be + queried with ::cuDeviceGetAttribute() and + ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/ + CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This + means that, if a remote write operation is guaranteed to have reached the + device before the wait can be satisfied, that write is guaranteed to be + visible to downstream device work. The device is permitted to reorder + remote writes internally. For example, this flag would be required if + two remote writes arrive in a defined order, the wait is satisfied by the + second write, and downstream work needs to observe the first write. + Support for this operation is restricted to selected platforms and can be + queried with ::CU_DEVICE_ATTRIBUTE_CAN_USE_WAIT_VALUE_FLUSH.*/ +} CUstreamWaitValue_flags; + +/** + * Flags for ::cuStreamWriteValue32 + */ +typedef enum CUstreamWriteValue_flags_enum { + CU_STREAM_WRITE_VALUE_DEFAULT = 0x0, /**< Default behavior */ + CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 /**< Permits the write to be reordered with writes which were issued + before it, as a performance optimization. Normally, + ::cuStreamWriteValue32 will provide a memory fence before the + write, which has similar semantics to + __threadfence_system() but is scoped to the stream + rather than a CUDA thread. 
*/ +} CUstreamWriteValue_flags; + +/** + * Operations for ::cuStreamBatchMemOp + */ +typedef enum CUstreamBatchMemOpType_enum { + CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, /**< Represents a ::cuStreamWaitValue32 operation */ + CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, /**< Represents a ::cuStreamWriteValue32 operation */ + CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, /**< Represents a ::cuStreamWaitValue64 operation */ + CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, /**< Represents a ::cuStreamWriteValue64 operation */ + CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a + standalone operation. */ +} CUstreamBatchMemOpType; + +/** + * Per-operation parameters for ::cuStreamBatchMemOp + */ +typedef union CUstreamBatchMemOpParams_union { + CUstreamBatchMemOpType operation; + struct CUstreamMemOpWaitValueParams_st { + CUstreamBatchMemOpType operation; + CUdeviceptr address; + union { + cuuint32_t value; + cuuint64_t value64; + }; + unsigned int flags; + CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ + } waitValue; + struct CUstreamMemOpWriteValueParams_st { + CUstreamBatchMemOpType operation; + CUdeviceptr address; + union { + cuuint32_t value; + cuuint64_t value64; + }; + unsigned int flags; + CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ + } writeValue; + struct CUstreamMemOpFlushRemoteWritesParams_st { + CUstreamBatchMemOpType operation; + unsigned int flags; + } flushRemoteWrites; + cuuint64_t pad[6]; +} CUstreamBatchMemOpParams; +#endif /* __CUDA_API_VERSION >= 8000 */ + +/** + * Occupancy calculator flag + */ +typedef enum CUoccupancy_flags_enum { + CU_OCCUPANCY_DEFAULT = 0x0, /**< Default behavior */ + CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ +} CUoccupancy_flags; + +/** + * Array formats + */ +typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ + CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ + CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ + CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ + CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ +} CUarray_format; + +/** + * Texture reference addressing modes + */ +typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ + CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ + CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ + CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ +} CUaddress_mode; + +/** + * Texture reference filtering modes + */ +typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ + CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ +} CUfilter_mode; + +/** + * Device properties + */ +typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ + 
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, 
/**< Peak memory clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture 
height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,/**< Maximum mipmapped 2D texture height */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ + CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ + CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ + CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ + CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ + CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ + CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ + CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. 
*/ + CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ + CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ + CU_DEVICE_ATTRIBUTE_MAX +} CUdevice_attribute; + +/** + * Legacy device properties + */ +typedef struct CUdevprop_st { + int maxThreadsPerBlock; /**< Maximum number of threads per block */ + int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ + int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ + int sharedMemPerBlock; /**< Shared memory available per block in bytes */ + int totalConstantMemory; /**< Constant memory available on device in bytes */ + int SIMDWidth; /**< Warp size in threads */ + int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ + int regsPerBlock; /**< 32-bit registers available per block */ + int clockRate; /**< Clock frequency in kilohertz */ + int textureAlign; /**< Alignment requirement for textures */ +} CUdevprop; + +/** + * Pointer information + */ +typedef enum CUpointer_attribute_enum { + CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ + CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ + CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ + CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ + CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ + CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ + CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, /**< Indicates if the pointer points to managed memory */ + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9 /**< A device ordinal of a device on which a pointer was allocated or registered */ +} CUpointer_attribute; + +/** + * Function properties + */ +typedef enum CUfunction_attribute_enum { + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + + /** + * The size in bytes of statically-allocated shared memory required by + * this function. This does not include dynamically-allocated shared + * memory requested by the user at runtime. + */ + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + + /** + * The size in bytes of user-allocated constant memory required by this + * function. + */ + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + + /** + * The size in bytes of local memory used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + + /** + * The number of registers used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_NUM_REGS = 4, + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. 
+ * Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + */ + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, + + /** + * The binary architecture version for which the function was compiled. + * This value is the major binary version * 10 + the minor binary version, + * so a binary version 1.3 function would return the value 13. Note that + * this will return a value of 10 for legacy cubins that do not have a + * properly-encoded binary architecture version. + */ + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + + /** + * The attribute to indicate whether the function has been compiled with + * user specified option "-Xptxas --dlcm=ca" set . + */ + CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + + /** + * The maximum size in bytes of dynamically-allocated shared memory that can be used by + * this function. If the user-specified dynamic shared memory size is larger than this + * value, the launch will fail. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + + /** + * On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. + * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, + + CU_FUNC_ATTRIBUTE_MAX +} CUfunction_attribute; + +/** + * Function cache configurations + */ +typedef enum CUfunc_cache_enum { + CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ + CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ + CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ + CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ +} CUfunc_cache; + +/** + * Shared memory configurations + */ +typedef enum CUsharedconfig_enum { + CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ + CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ +} CUsharedconfig; + +/** + * Shared memory carveout configurations. 
These may be passed to ::cuFuncSetAttribute + */ +typedef enum CUshared_carveout_enum { + CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, /**< No preference for shared memory or L1 (default) */ + CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, /**< Prefer maximum available shared memory, minimum L1 cache */ + CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 /**< Prefer maximum available L1 cache, minimum shared memory */ +} CUshared_carveout; + +/** + * Memory types + */ +typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ + CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ + CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ + CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ +} CUmemorytype; + +/** + * Compute Modes + */ +typedef enum CUcomputemode_enum { + CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ + CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ + CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ +} CUcomputemode; + +/** + * Memory advise values + */ +typedef enum CUmem_advise_enum { + CU_MEM_ADVISE_SET_READ_MOSTLY = 1, /**< Data will mostly be read and only occasionally be written to */ + CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */ + CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3, /**< Set the preferred location for the data as the specified device */ + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */ + CU_MEM_ADVISE_SET_ACCESSED_BY = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */ + CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ +} CUmem_advise; + +typedef enum CUmem_range_attribute_enum { + CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, /**< Whether the range will mostly be read and only occasionally be written to */ + CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, /**< The preferred location of the range */ + CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */ + CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4 /**< The last location to which the range was prefetched */ +} CUmem_range_attribute; + +/** + * Online compiler and linker options + */ +typedef enum CUjit_option_enum +{ + /** + * Max number of registers that a thread may use.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_MAX_REGISTERS = 0, + + /** + * IN: Specifies minimum number of threads per block to target compilation + * for\n + * OUT: Returns the number of threads the compiler actually targeted. + * This restricts the resource utilization fo the compiler (e.g. max + * registers) such that a block with the given number of threads should be + * able to launch based on register limitations. 
Note, this option does not + * currently take into account any other resource limitations, such as + * shared memory utilization.\n + * Cannot be combined with ::CU_JIT_TARGET.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_THREADS_PER_BLOCK, + + /** + * Overwrites the option value with the total wall clock time, in + * milliseconds, spent in the compiler and linker\n + * Option type: float\n + * Applies to: compiler and linker + */ + CU_JIT_WALL_TIME, + + /** + * Pointer to a buffer in which to print any log messages + * that are informational in nature (the buffer size is specified via + * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + + /** + * Pointer to a buffer in which to print any log messages that + * reflect errors (the buffer size is specified via option + * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + + /** + * Level of optimizations to apply to generated code (0 - 4), with 4 + * being the default and highest level of optimizations.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_OPTIMIZATION_LEVEL, + + /** + * No option value required. Determines the target based on the current + * attached context (default)\n + * Option type: No option value needed\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET_FROM_CUCONTEXT, + + /** + * Target is chosen based on supplied ::CUjit_target. Cannot be + * combined with ::CU_JIT_THREADS_PER_BLOCK.\n + * Option type: unsigned int for enumerated type ::CUjit_target\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET, + + /** + * Specifies choice of fallback strategy if matching cubin is not found. + * Choice is based on supplied ::CUjit_fallback. 
This option cannot be + * used with cuLink* APIs as the linker requires exact matches.\n + * Option type: unsigned int for enumerated type ::CUjit_fallback\n + * Applies to: compiler only + */ + CU_JIT_FALLBACK_STRATEGY, + + /** + * Specifies whether to create debug information in output (-g) + * (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_GENERATE_DEBUG_INFO, + + /** + * Generate verbose log messages (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_LOG_VERBOSE, + + /** + * Generate line number information (-lineinfo) (0: false, default)\n + * Option type: int\n + * Applies to: compiler only + */ + CU_JIT_GENERATE_LINE_INFO, + + /** + * Specifies whether to enable caching explicitly (-dlcm) \n + * Choice is based on supplied ::CUjit_cacheMode_enum.\n + * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n + * Applies to: compiler only + */ + CU_JIT_CACHE_MODE, + + /** + * The below jit options are used for internal purposes only, in this version of CUDA + */ + CU_JIT_NEW_SM3X_OPT, + CU_JIT_FAST_COMPILE, + + /** + * Array of device symbol names that will be relocated to the corresponing + * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n + * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * When loding a device module, driver will relocate all encountered + * unresolved symbols to the host addresses.\n + * It is only allowed to register symbols that correspond to unresolved + * global variables.\n + * It is illegal to register the same device symbol at multiple addresses.\n + * Option type: const char **\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_NAMES, + + /** + * Array of host addresses that will be used to relocate corresponding + * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n + * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * Option type: void **\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_ADDRESSES, + + /** + * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and + * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n + * Option type: unsigned int\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_COUNT, + + CU_JIT_NUM_OPTIONS + +} CUjit_option; + +/** + * Online compilation targets + */ +typedef enum CUjit_target_enum +{ + CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ + CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ + CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ + CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ + CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ + CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ + CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ + CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ + CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ + CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ + CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ + CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ + CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/ + CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/ + CU_TARGET_COMPUTE_75 = 75 /**< Compute device class 7.5.*/ +} CUjit_target; + +/** + * Cubin matching fallback strategies + */ +typedef enum CUjit_fallback_enum +{ + CU_PREFER_PTX = 0, /**< Prefer to compile ptx if exact binary match not found */ + + CU_PREFER_BINARY /**< Prefer to fall back to compatible 
binary code if exact match not found */ + +} CUjit_fallback; + +/** + * Caching modes for dlcm + */ +typedef enum CUjit_cacheMode_enum +{ + CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */ + CU_JIT_CACHE_OPTION_CG, /**< Compile with L1 cache disabled */ + CU_JIT_CACHE_OPTION_CA /**< Compile with L1 cache enabled */ +} CUjit_cacheMode; + +/** + * Device code formats + */ +typedef enum CUjitInputType_enum +{ + /** + * Compiled device-class-specific device code\n + * Applicable options: none + */ + CU_JIT_INPUT_CUBIN = 0, + + /** + * PTX source code\n + * Applicable options: PTX compiler options + */ + CU_JIT_INPUT_PTX, + + /** + * Bundle of multiple cubins and/or PTX of some device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_FATBINARY, + + /** + * Host object with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_OBJECT, + + /** + * Archive of host objects with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_LIBRARY, + + CU_JIT_NUM_INPUT_TYPES +} CUjitInputType; + +#if __CUDA_API_VERSION >= 5050 +typedef struct CUlinkState_st *CUlinkState; +#endif /* __CUDA_API_VERSION >= 5050 */ + +/** + * Flags to register a graphics resource + */ +typedef enum CUgraphicsRegisterFlags_enum { + CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, + CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, + CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, + CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08 +} CUgraphicsRegisterFlags; + +/** + * Flags for mapping and unmapping interop resources + */ +typedef enum CUgraphicsMapResourceFlags_enum { + CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 +} CUgraphicsMapResourceFlags; + +/** + * Array indices for cube faces + */ +typedef enum CUarray_cubemap_face_enum { + CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ +} CUarray_cubemap_face; + +/** + * Limits + */ +typedef enum CUlimit_enum { + CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ + CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ + CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, /**< GPU malloc heap size */ + CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ + CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ + CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). 
This is a hint */ + CU_LIMIT_MAX +} CUlimit; + +/** + * Resource types + */ +typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resource */ + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ + CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ + CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ +} CUresourcetype; + +#ifdef _WIN32 +#define CUDA_CB __stdcall +#else +#define CUDA_CB +#endif + +#if __CUDA_API_VERSION >= 10000 + +/** + * CUDA host function + * \param userData Argument value passed to the function + */ +typedef void (CUDA_CB *CUhostFn)(void *userData); + +/** + * GPU kernel node parameters + */ +typedef struct CUDA_KERNEL_NODE_PARAMS_st { + CUfunction func; /**< Kernel to launch */ + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + void **kernelParams; /**< Array of pointers to kernel parameters */ + void **extra; /**< Extra options */ +} CUDA_KERNEL_NODE_PARAMS; + +/** + * Memset node parameters + */ +typedef struct CUDA_MEMSET_NODE_PARAMS_st { + CUdeviceptr dst; /**< Destination device pointer */ + size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ + unsigned int value; /**< Value to be set */ + unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. */ + size_t width; /**< Width in bytes, of the row */ + size_t height; /**< Number of rows */ +} CUDA_MEMSET_NODE_PARAMS; + +/** + * Host node parameters + */ +typedef struct CUDA_HOST_NODE_PARAMS_st { + CUhostFn fn; /**< The function to call when the node executes */ + void* userData; /**< Argument to pass to the function */ +} CUDA_HOST_NODE_PARAMS; + +/** + * Graph node types + */ +typedef enum CUgraphNodeType_enum { + CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */ + CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */ + CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */ + CU_GRAPH_NODE_TYPE_HOST = 3, /**< Host (executable) node */ + CU_GRAPH_NODE_TYPE_GRAPH = 4, /**< Node which executes an embedded graph */ + CU_GRAPH_NODE_TYPE_EMPTY = 5, /**< Empty (no-op) node */ + CU_GRAPH_NODE_TYPE_COUNT +} CUgraphNodeType; + +/** + * Possible stream capture statuses returned by ::cuStreamIsCapturing + */ +typedef enum CUstreamCaptureStatus_enum { + CU_STREAM_CAPTURE_STATUS_NONE = 0, /**< Stream is not capturing */ + CU_STREAM_CAPTURE_STATUS_ACTIVE = 1, /**< Stream is actively capturing */ + CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2 /**< Stream is part of a capture sequence that + has been invalidated, but not terminated */ +} CUstreamCaptureStatus; + +#endif /* __CUDA_API_VERSION >= 10000 */ + +#if __CUDA_API_VERSION >= 10010 + +/** + * Possible modes for stream capture thread interactions.
For more details see + * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode + */ +typedef enum CUstreamCaptureMode_enum { + CU_STREAM_CAPTURE_MODE_GLOBAL = 0, + CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, + CU_STREAM_CAPTURE_MODE_RELAXED = 2 +} CUstreamCaptureMode; + +#endif /* __CUDA_API_VERSION >= 10010 */ + +/** + * Error codes + */ +typedef enum cudaError_enum { + /** + * The API call returned with no errors. In the case of query calls, this + * also means that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + CUDA_SUCCESS = 0, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + CUDA_ERROR_INVALID_VALUE = 1, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + CUDA_ERROR_OUT_OF_MEMORY = 2, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + CUDA_ERROR_NOT_INITIALIZED = 3, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + CUDA_ERROR_DEINITIALIZED = 4, + + /** + * This indicates profiler is not initialized for this run. This can + * happen when the application is running with external profiling tools + * like visual profiler. + */ + CUDA_ERROR_PROFILER_DISABLED = 5, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to attempt to enable/disable the profiling via ::cuProfilerStart or + * ::cuProfilerStop without initialization. + */ + CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStart() when profiling is already enabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStop() when profiling is already disabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, + + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + CUDA_ERROR_NO_DEVICE = 100, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + CUDA_ERROR_INVALID_DEVICE = 101, + + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + CUDA_ERROR_INVALID_IMAGE = 200, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + CUDA_ERROR_INVALID_CONTEXT = 201, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, + + /** + * This indicates that a map or register operation has failed. 
+ */ + CUDA_ERROR_MAP_FAILED = 205, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + CUDA_ERROR_UNMAP_FAILED = 206, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + CUDA_ERROR_ARRAY_IS_MAPPED = 207, + + /** + * This indicates that the resource is already mapped. + */ + CUDA_ERROR_ALREADY_MAPPED = 208, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + CUDA_ERROR_NO_BINARY_FOR_GPU = 209, + + /** + * This indicates that a resource has already been acquired. + */ + CUDA_ERROR_ALREADY_ACQUIRED = 210, + + /** + * This indicates that a resource is not mapped. + */ + CUDA_ERROR_NOT_MAPPED = 211, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + CUDA_ERROR_ECC_UNCORRECTABLE = 214, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + CUDA_ERROR_UNSUPPORTED_LIMIT = 215, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, + + /** + * This indicates that a PTX JIT compilation failed. + */ + CUDA_ERROR_INVALID_PTX = 218, + + /** + * This indicates an error with OpenGL or DirectX context. + */ + CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, + + /** + * This indicates that an uncorrectable NVLink error was detected during the + * execution. + */ + CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, + + /** + * This indicates that the PTX JIT compiler library was not found. + */ + CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, + + /** + * This indicates that the device kernel source is invalid. + */ + CUDA_ERROR_INVALID_SOURCE = 300, + + /** + * This indicates that the file specified was not found. + */ + CUDA_ERROR_FILE_NOT_FOUND = 301, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, + + /** + * This indicates that initialization of a shared object failed. + */ + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, + + /** + * This indicates that an OS call failed. + */ + CUDA_ERROR_OPERATING_SYSTEM = 304, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + CUDA_ERROR_INVALID_HANDLE = 400, + + /** + * This indicates that a resource required by the API call is not in a + * valid state to perform the requested operation. + */ + CUDA_ERROR_ILLEGAL_STATE = 401, + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names, and surface names. 
+ */ + CUDA_ERROR_NOT_FOUND = 500, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + CUDA_ERROR_NOT_READY = 600, + + /** + * While executing a kernel, the device encountered a + * load or store instruction on an invalid memory address. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_ILLEGAL_ADDRESS = 700, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_LAUNCH_TIMEOUT = 702, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. + */ + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_ASSERT = 710, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_TOO_MANY_PEERS = 711, + + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. 
+ */ + CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, + + /** + * While executing a kernel, the device encountered a stack error. + * This can be due to stack corruption or exceeding the stack size limit. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_HARDWARE_STACK_ERROR = 714, + + /** + * While executing a kernel, the device encountered an illegal instruction. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, + + /** + * While executing a kernel, the device encountered a load or store instruction + * on a memory address which is not aligned. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_MISALIGNED_ADDRESS = 716, + + /** + * While executing a kernel, the device encountered an instruction + * which can only operate on memory locations in certain address spaces + * (global, shared, or local), but was supplied a memory address not + * belonging to an allowed address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, + + /** + * While executing a kernel, the device program counter wrapped its address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_PC = 718, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. Less common cases can be system specific - more + * information about these cases can be found in the system specific user guide. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_LAUNCH_FAILED = 719, + + /** + * This error indicates that the number of blocks launched per grid for a kernel that was + * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice + * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor + * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + */ + CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, + + /** + * This error indicates that the attempted operation is not permitted. + */ + CUDA_ERROR_NOT_PERMITTED = 800, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. 
+ */ + CUDA_ERROR_NOT_SUPPORTED = 801, + + /** + * This error indicates that the system is not yet ready to start any CUDA + * work. To continue using CUDA, verify the system configuration is in a + * valid state and all required driver daemons are actively running. + * More information about this error can be found in the system specific + * user guide. + */ + CUDA_ERROR_SYSTEM_NOT_READY = 802, + + /** + * This error indicates that there is a mismatch between the versions of + * the display driver and the CUDA driver. Refer to the compatibility documentation + * for supported versions. + */ + CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, + + /** + * This error indicates that the system was upgraded to run with forward compatibility + * but the visible hardware detected by CUDA does not support this configuration. + * Refer to the compatibility documentation for the supported hardware matrix or ensure + * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES + * environment variable. + */ + CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804, + + /** + * This error indicates that the operation is not permitted when + * the stream is capturing. + */ + CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, + + /** + * This error indicates that the current capture sequence on the stream + * has been invalidated due to a previous error. + */ + CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, + + /** + * This error indicates that the operation would have resulted in a merge + * of two independent capture sequences. + */ + CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, + + /** + * This error indicates that the capture was not initiated in this stream. + */ + CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, + + /** + * This error indicates that the capture sequence contains a fork that was + * not joined to the primary stream. + */ + CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, + + /** + * This error indicates that a dependency would have been created which + * crosses the capture sequence boundary. Only implicit in-stream ordering + * dependencies are allowed to cross the boundary. + */ + CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, + + /** + * This error indicates a disallowed implicit dependency on a current capture + * sequence from cudaStreamLegacy. + */ + CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, + + /** + * This error indicates that the operation is not permitted on an event which + * was last recorded in a capturing stream. + */ + CUDA_ERROR_CAPTURED_EVENT = 907, + + /** + * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED + * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a + * different thread. + */ + CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908, + + /** + * This indicates that an unknown internal error has occurred. 
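+ *
+ * Whichever of these codes a driver call returns, a minimal reporting sketch
+ * (assuming a current context and <stdio.h>; ::cuGetErrorName and
+ * ::cuGetErrorString are declared later in this header) might look like:
+ * \code
+   CUresult err = cuCtxSynchronize();
+   if (err != CUDA_SUCCESS) {
+       const char *name = NULL, *desc = NULL;
+       cuGetErrorName(err, &name);    /* e.g. "CUDA_ERROR_ILLEGAL_ADDRESS" */
+       cuGetErrorString(err, &desc);  /* human-readable description */
+       fprintf(stderr, "driver call failed: %s (%s)\n", name, desc);
+   }
+ * \endcode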
+ */ + CUDA_ERROR_UNKNOWN = 999 +} CUresult; + +/** + * P2P Attributes + */ +typedef enum CUdevice_P2PAttribute_enum { + CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ + CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enabled */ + CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, /**< Atomic operations over the link are supported */ + CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */ + CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 /**< Accessing CUDA arrays over the link is supported */ +} CUdevice_P2PAttribute; + +/** + * CUDA stream callback + * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. + * \param status ::CUDA_SUCCESS or any persistent error on the stream. + * \param userData User parameter provided at registration. + */ +typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); + +/** + * Block size to per-block dynamic shared memory mapping for a certain + * kernel \param blockSize Block size of the kernel. + * + * \return The dynamic shared memory needed by a block. + */ +typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); + +/** + * If set, host memory is portable between CUDA contexts. + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_PORTABLE 0x01 + +/** + * If set, host memory is mapped into CUDA address space and + * ::cuMemHostGetDevicePointer() may be called on the host pointer. + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_DEVICEMAP 0x02 + +/** + * If set, host memory is allocated as write-combined - fast to write, + * faster to DMA, slow to read except via SSE4 streaming load instruction + * (MOVNTDQA). + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 + +/** + * If set, host memory is portable between CUDA contexts. + * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_PORTABLE 0x01 + +/** + * If set, host memory is mapped into CUDA address space and + * ::cuMemHostGetDevicePointer() may be called on the host pointer. + * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 + +/** + * If set, the passed memory pointer is treated as pointing to some + * memory-mapped I/O space, e.g. belonging to a third-party PCIe device. + * On Windows the flag is a no-op. + * On Linux that memory is marked as non cache-coherent for the GPU and + * is expected to be physically contiguous. It may return + * CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, + * CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. + * On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED + * is returned.
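+ *
+ * For ordinary pageable allocations (not I/O memory), a minimal registration
+ * sketch, assuming <stdlib.h> and a current context, might be:
+ * \code
+   void *buf = malloc(1 << 20);
+   if (cuMemHostRegister(buf, 1 << 20, CU_MEMHOSTREGISTER_DEVICEMAP) == CUDA_SUCCESS) {
+       CUdeviceptr dptr;
+       cuMemHostGetDevicePointer(&dptr, buf, 0);  /* device-visible alias of buf */
+       /* ... use dptr in kernels ... */
+       cuMemHostUnregister(buf);
+   }
+   free(buf);
+ * \endcode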
+ * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_IOMEMORY 0x04 + +#if __CUDA_API_VERSION >= 3020 + +/** + * 2D memory copy parameters + */ +typedef struct CUDA_MEMCPY2D_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + + size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ + size_t Height; /**< Height of 2D memory copy */ +} CUDA_MEMCPY2D; + +/** + * 3D memory copy parameters + */ +typedef struct CUDA_MEMCPY3D_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + size_t srcZ; /**< Source Z */ + size_t srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + size_t dstZ; /**< Destination Z */ + size_t dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ + size_t Height; /**< Height of 3D memory copy */ + size_t Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D; + +/** + * 3D memory cross-context copy parameters + */ +typedef struct CUDA_MEMCPY3D_PEER_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + size_t srcZ; /**< Source Z */ + size_t srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + size_t dstZ; /**< Destination Z */ + size_t dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< 
Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ + size_t Height; /**< Height of 3D memory copy */ + size_t Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D_PEER; + +/** + * Array descriptor + */ +typedef struct CUDA_ARRAY_DESCRIPTOR_st +{ + size_t Width; /**< Width of array */ + size_t Height; /**< Height of array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ +} CUDA_ARRAY_DESCRIPTOR; + +/** + * 3D array descriptor + */ +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st +{ + size_t Width; /**< Width of 3D array */ + size_t Height; /**< Height of 3D array */ + size_t Depth; /**< Depth of 3D array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ +} CUDA_ARRAY3D_DESCRIPTOR; + +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 5000 + +/** + * CUDA Resource descriptor + */ +typedef struct CUDA_RESOURCE_DESC_st +{ + CUresourcetype resType; /**< Resource type */ + + union { + struct { + CUarray hArray; /**< CUDA array */ + } array; + struct { + CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ + } mipmap; + struct { + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t sizeInBytes; /**< Size in bytes */ + } linear; + struct { + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + struct { + int reserved[32]; + } reserved; + } res; + + unsigned int flags; /**< Flags (must be zero) */ +} CUDA_RESOURCE_DESC; + +/** + * Texture descriptor + */ +typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; /**< Address modes */ + CUfilter_mode filterMode; /**< Filter mode */ + unsigned int flags; /**< Flags */ + unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ + CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ + float mipmapLevelBias; /**< Mipmap level bias */ + float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ + float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ + float borderColor[4]; /**< Border Color */ + int reserved[12]; +} CUDA_TEXTURE_DESC; + +/** + * Resource view format + */ +typedef enum CUresourceViewFormat_enum +{ + CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ + CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ + 
CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ + CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ +} CUresourceViewFormat; + +/** + * Resource view descriptor + */ +typedef struct CUDA_RESOURCE_VIEW_DESC_st +{ + CUresourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ + unsigned int reserved[16]; +} CUDA_RESOURCE_VIEW_DESC; + +/** + * GPU Direct v3 tokens + */ +typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { + unsigned long long p2pToken; + unsigned int vaSpaceToken; +} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; + +#endif /* __CUDA_API_VERSION >= 5000 */ + +#if __CUDA_API_VERSION >= 9000 + +/** + * Kernel launch parameters + */ +typedef struct CUDA_LAUNCH_PARAMS_st { + CUfunction function; /**< Kernel to launch */ + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + 
unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + CUstream hStream; /**< Stream identifier */ + void **kernelParams; /**< Array of pointers to kernel parameters */ +} CUDA_LAUNCH_PARAMS; + +#endif /* __CUDA_API_VERSION >= 9000 */ + +#if __CUDA_API_VERSION >= 10000 + +/** + * External memory handle types + */ +typedef enum CUexternalMemoryHandleType_enum { + /** + * Handle is an opaque file descriptor + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + /** + * Handle is an opaque shared NT handle + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + /** + * Handle is a D3D12 heap object + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + /** + * Handle is a D3D12 committed resource + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5 +} CUexternalMemoryHandleType; + +/** + * Indicates that the external memory object is a dedicated resource + */ +#define CUDA_EXTERNAL_MEMORY_DEDICATED 0x1 + +/** + * External memory handle descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { + /** + * Type of the handle + */ + CUexternalMemoryHandleType type; + union { + /** + * File descriptor referencing the memory object. Valid + * when type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD + */ + int fd; + /** + * Win32 handle referencing the semaphore object. Valid when + * type is one of the following: + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid memory object. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + } handle; + /** + * Size of the memory allocation + */ + unsigned long long size; + /** + * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_HANDLE_DESC; + +/** + * External memory buffer descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { + /** + * Offset into the memory object where the buffer's base is + */ + unsigned long long offset; + /** + * Size of the buffer + */ + unsigned long long size; + /** + * Flags reserved for future use. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_BUFFER_DESC; + +/** + * External memory mipmap descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { + /** + * Offset into the memory object where the base level of the + * mipmap chain is. 
+ */ + unsigned long long offset; + /** + * Format, dimension and type of base level of the mipmap chain + */ + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + /** + * Total number of levels in the mipmap chain + */ + unsigned int numLevels; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; + +/** + * External semaphore handle types + */ +typedef enum CUexternalSemaphoreHandleType_enum { + /** + * Handle is an opaque file descriptor + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + /** + * Handle is an opaque shared NT handle + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + /** + * Handle is a shared NT handle referencing a D3D12 fence object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4 +} CUexternalSemaphoreHandleType; + +/** + * External semaphore handle descriptor + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { + /** + * Type of the handle + */ + CUexternalSemaphoreHandleType type; + union { + /** + * File descriptor referencing the semaphore object. Valid + * when type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD + */ + int fd; + /** + * Win32 handle referencing the semaphore object. Valid when + * type is one of the following: + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid synchronization primitive. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + } handle; + /** + * Flags reserved for the future. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; + +/** + * External semaphore signal parameters + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be signaled + */ + unsigned long long value; + } fence; + unsigned int reserved[16]; + } params; + /** + * Flags reserved for the future. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; + +/** + * External semaphore wait parameters + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be waited on + */ + unsigned long long value; + } fence; + unsigned int reserved[16]; + } params; + /** + * Flags reserved for the future. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; + + +#endif /* __CUDA_API_VERSION >= 10000 */ + +/** + * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only + * waits for prior work in the stream corresponding to that GPU to complete before the + * kernel begins execution. + */ +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 + +/** + * If set, any subsequent work pushed in a stream that participated in a call to + * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on + * the GPU corresponding to that stream to complete before it begins execution. 
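+ *
+ * Both launch-sync flags are passed in the \p flags argument of
+ * ::cuLaunchCooperativeKernelMultiDevice; a hypothetical call, with
+ * \p launchParams (an array of ::CUDA_LAUNCH_PARAMS, one entry per device)
+ * and \p numDevices prepared by the caller, might be:
+ * \code
+   cuLaunchCooperativeKernelMultiDevice(launchParams, numDevices,
+       CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC |
+       CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC);
+ * \endcode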
+ */ +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 + +/** + * If set, the CUDA array is a collection of layers, where each layer is either a 1D + * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number + * of layers, not the depth of a 3D array. + */ +#define CUDA_ARRAY3D_LAYERED 0x01 + +/** + * Deprecated, use CUDA_ARRAY3D_LAYERED + */ +#define CUDA_ARRAY3D_2DARRAY 0x01 + +/** + * This flag must be set in order to bind a surface reference + * to the CUDA array + */ +#define CUDA_ARRAY3D_SURFACE_LDST 0x02 + +/** + * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The + * width of such a CUDA array must be equal to its height, and Depth must be six. + * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps + * and Depth must be a multiple of six. + */ +#define CUDA_ARRAY3D_CUBEMAP 0x04 + +/** + * This flag must be set in order to perform texture gather operations + * on a CUDA array. + */ +#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 + +/** + * This flag if set indicates that the CUDA + * array is a DEPTH_TEXTURE. + */ +#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 + +/** + * This flag indicates that the CUDA array may be bound as a color target + * in an external graphics API + */ +#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 + +/** + * Override the texref format with a format inferred from the array. + * Flag for ::cuTexRefSetArray() + */ +#define CU_TRSA_OVERRIDE_FORMAT 0x01 + +/** + * Read the texture as integers rather than promoting the values to floats + * in the range [0,1]. + * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_READ_AS_INTEGER 0x01 + +/** + * Use normalized texture coordinates in the range [0,1) instead of [0,dim). + * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_NORMALIZED_COORDINATES 0x02 + +/** + * Perform sRGB->linear conversion during texture read. + * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_SRGB 0x10 + +/** + * End of array terminator for the \p extra parameter to + * ::cuLaunchKernel + */ +#define CU_LAUNCH_PARAM_END ((void*)0x00) + +/** + * Indicator that the next value in the \p extra parameter to + * ::cuLaunchKernel will be a pointer to a buffer containing all kernel + * parameters used for launching kernel \p f. This buffer needs to + * honor all alignment/padding requirements of the individual parameters. + * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the + * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no + * effect. + */ +#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) + +/** + * Indicator that the next value in the \p extra parameter to + * ::cuLaunchKernel will be a pointer to a size_t which contains the + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER. + * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified + * in the \p extra array if the value associated with + * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. + */ +#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) + +/** + * For texture references loaded into the module, use default texunit from + * texture reference. 
+ */ +#define CU_PARAM_TR_DEFAULT -1 + +/** + * Device that represents the CPU + */ +#define CU_DEVICE_CPU ((CUdevice)-1) + +/** + * Device that represents an invalid device + */ +#define CU_DEVICE_INVALID ((CUdevice)-2) + +/** @} */ /* END CUDA_TYPES */ + +#ifdef _WIN32 +#define CUDAAPI __stdcall +#else +#define CUDAAPI +#endif + +/** + * \defgroup CUDA_ERROR Error Handling + * + * ___MANBRIEF___ error handling functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the error handling functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Gets the string description of an error code + * + * Sets \p *pStr to the address of a NULL-terminated string description + * of the error code \p error. + * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE + * will be returned and \p *pStr will be set to the NULL address. + * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::CUresult, + * ::cudaGetErrorString + */ +CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); + +/** + * \brief Gets the string representation of an error code enum name + * + * Sets \p *pStr to the address of a NULL-terminated string representation + * of the name of the enum error code \p error. + * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE + * will be returned and \p *pStr will be set to the NULL address. + * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::CUresult, + * ::cudaGetErrorName + */ +CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); + +/** @} */ /* END CUDA_ERROR */ + +/** + * \defgroup CUDA_INITIALIZE Initialization + * + * ___MANBRIEF___ initialization functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the initialization functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Initialize the CUDA driver API + * + * Initializes the driver API and must be called before any other function from + * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit() + * has not been called, any function from the driver API will return + * ::CUDA_ERROR_NOT_INITIALIZED. + * + * \param Flags - Initialization flag for CUDA. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, + * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE + * \notefnerr + */ +CUresult CUDAAPI cuInit(unsigned int Flags); + +/** @} */ /* END CUDA_INITIALIZE */ + +/** + * \defgroup CUDA_VERSION Version Management + * + * ___MANBRIEF___ version management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the version management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns the latest CUDA version supported by driver + * + * Returns in \p *driverVersion the version of CUDA supported by + * the driver. The version is returned as + * (1000 × major + 10 × minor). For example, CUDA 9.2 + * would be represented by 9020. 
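+ *
+ * A minimal sketch of decoding that value (assuming the call succeeds):
+ * \code
+   int v = 0;
+   if (cuDriverGetVersion(&v) == CUDA_SUCCESS) {
+       int major = v / 1000;          /* 9020 -> 9 */
+       int minor = (v % 1000) / 10;   /* 9020 -> 2 */
+   }
+ * \endcode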
+ * + * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if + * \p driverVersion is NULL. + * + * \param driverVersion - Returns the CUDA driver version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cudaDriverGetVersion, + * ::cudaRuntimeGetVersion + */ +CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); + +/** @} */ /* END CUDA_VERSION */ + +/** + * \defgroup CUDA_DEVICE Device Management + * + * ___MANBRIEF___ device management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device handle given an ordinal in the range [0, + * ::cuDeviceGetCount()-1]. + * + * \param device - Returned device handle + * \param ordinal - Device number to get handle for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceTotalMem + */ +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); + +/** + * \brief Returns the number of compute-capable devices + * + * Returns in \p *count the number of devices with compute capability greater + * than or equal to 2.0 that are available for execution. If there is no such + * device, ::cuDeviceGetCount() returns 0. + * + * \param count - Returned number of compute-capable devices + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaGetDeviceCount + */ +CUresult CUDAAPI cuDeviceGetCount(int *count); + +/** + * \brief Returns an identifier string for the device + * + * Returns an ASCII string identifying the device \p dev in the NULL-terminated + * string pointed to by \p name. \p len specifies the maximum length of the + * string that may be returned. + * + * \param name - Returned identifier string for the device + * \param len - Maximum length of string to store in \p name + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceGetCount, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); + +#if __CUDA_API_VERSION >= 9020 +/** + * \brief Return an UUID for the device + * + * Returns 16-octets identifing the device \p dev in the structure + * pointed by the \p uuid. 
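+ *
+ * A short sketch of printing the identifier (assuming ::CUuuid exposes a
+ * 16-byte \p bytes member, as defined earlier in this header, and <stdio.h>):
+ * \code
+   CUuuid id;
+   if (cuDeviceGetUuid(&id, dev) == CUDA_SUCCESS) {
+       for (int i = 0; i < 16; ++i)
+           printf("%02x", (unsigned char)id.bytes[i]);
+   }
+ * \endcode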
+ * + * \param uuid - Returned UUID + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetLuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); +#endif + +#if defined(_WIN32) && __CUDA_API_VERSION >= 10000 +/** + * \brief Return an LUID and device node mask for the device + * + * Return identifying information (\p luid and \p deviceNodeMask) to allow + * matching device with graphics APIs. + * + * \param luid - Returned LUID + * \param deviceNodeMask - Returned device node mask + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); +#endif + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Returns the total amount of memory on the device + * + * Returns in \p *bytes the total amount of memory available on the device + * \p dev in bytes. + * + * \param bytes - Returned memory available on device in bytes + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cudaMemGetInfo + */ +CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Returns information about the device + * + * Returns in \p *pi the integer value of the attribute \p attrib on device + * \p dev. 
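+ *
+ * A minimal query sketch (assuming \p dev is a valid device handle), e.g. for
+ * the multiprocessor count and the per-block thread limit:
+ * \code
+   int smCount = 0, maxThreads = 0;
+   cuDeviceGetAttribute(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+   cuDeviceGetAttribute(&maxThreads, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+ * \endcode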
The supported attributes are: + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per + * block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of + * shared memory available to a thread block in bytes; + * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for + * __constant__ variables in a CUDA C kernel in bytes; + * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; + * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the + * memory copy functions that involve memory regions allocated through + * ::cuMemAllocPitch(); + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width + * for a 1D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum + * mipmapped 1D texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D + * texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width + * for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height + * for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch + * in bytes for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum + * mipmapped 2D texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum + * mipmapped 2D texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D + * texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D + * texture depth; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: + * Alternate maximum 3D texture width, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: + * Alternate maximum 3D texture height, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: + * Alternate maximum 3D texture depth, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: + * Maximum cubemap texture width or height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: + * Maximum 1D layered texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: + * Maximum 2D layered texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: + * Maximum 2D layered texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: + 
* Maximum cubemap layered texture width or height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: + * Maximum 1D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: + * Maximum 2D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: + * Maximum 2D surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: + * Maximum 3D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: + * Maximum 3D surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: + * Maximum 3D surface depth; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: + * Maximum 1D layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: + * Maximum 2D layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: + * Maximum 2D layered surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: + * Maximum cubemap surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: + * Maximum cubemap layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit + * registers available to a thread block; + * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz; + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture + * base addresses aligned to ::textureAlign bytes do not need an offset + * applied to texture fetches; + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement + * for 2D texture references bound to pitched memory; + * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy + * memory between host and device while executing a kernel, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on + * the device; + * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit + * for kernels executed on the device, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the + * memory subsystem, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host + * memory into the CUDA address space, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently + * in. Available modes are as follows: + * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and + * can have multiple CUDA contexts present at a single time. + * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is + * prohibited from creating new CUDA contexts. + * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device + * can have only one context used by a single process at a time. + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports + * executing multiple kernels within the same context simultaneously, or 0 if + * not. 
It is not guaranteed that multiple kernels will be resident + * on the device concurrently so this feature should not be relied upon for + * correctness; + * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the + * device, 0 if error correction is disabled or not supported by the device; + * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device; + * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier + * of the device; + * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device + * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC + * is only available on Tesla hardware running Windows Vista or later; + * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz; + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits; + * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache; + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor; + * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with + * the host, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number; + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number; + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals + * in L1 cache, 0 if caching globals in L1 cache is not supported by the device; + * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals + * in L1 cache, 0 if caching locals in L1 cache is not supported by the device; + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of + * shared memory available to a multiprocessor in bytes; this amount is shared + * by all thread blocks simultaneously resident on a multiprocessor; + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit + * registers available to a multiprocessor; this number is shared by all thread + * blocks simultaneously resident on a multiprocessor; + * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory + * on this system, 0 if allocating managed memory is not supported by the device on this system. + * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. + * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices + * associated with the same board. Devices on the same multi-GPU board will share the same identifier. + * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host + * supports native atomic operations. + * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance + * (in floating-point operations per second) to double precision performance. + * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing + * pageable memory without calling cudaHostRegister on it. + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory + * concurrently with the CPU. + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption. 
+ * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered + * memory at the same virtual address as the CPU. + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size + * supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call. + * For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES + * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's + * page tables. + * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. + * + * \param pi - Returned device attribute value + * \param attrib - Device attribute to query + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaDeviceGetAttribute, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); + +/** @} */ /* END CUDA_DEVICE */ + +/** + * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated device management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns properties for a selected device + * + * \deprecated + * + * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute(). + * + * Returns in \p *prop the properties of device \p dev. The ::CUdevprop + * structure is defined as: + * + * \code + typedef struct CUdevprop_st { + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int sharedMemPerBlock; + int totalConstantMemory; + int SIMDWidth; + int memPitch; + int regsPerBlock; + int clockRate; + int textureAlign + } CUdevprop; + * \endcode + * where: + * + * - ::maxThreadsPerBlock is the maximum number of threads per block; + * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block; + * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid; + * - ::sharedMemPerBlock is the total amount of shared memory available per + * block in bytes; + * - ::totalConstantMemory is the total amount of constant memory available on + * the device in bytes; + * - ::SIMDWidth is the warp size; + * - ::memPitch is the maximum pitch allowed by the memory copy functions that + * involve memory regions allocated through ::cuMemAllocPitch(); + * - ::regsPerBlock is the total number of registers available per block; + * - ::clockRate is the clock frequency in kilohertz; + * - ::textureAlign is the alignment requirement; texture base addresses that + * are aligned to ::textureAlign bytes do not need an offset applied to + * texture fetches. 
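+ *
+ * As an illustrative sketch (editorial, not part of the official documentation),
+ * the equivalent query through the non-deprecated ::cuDeviceGetAttribute() above
+ * might look as follows, assuming ::cuInit() and ::cuDeviceGet() declared
+ * earlier in this header:
+ * \code
+ int maxThreadsPerBlock = 0;
+ CUdevice dev;
+ cuInit(0);                /* initialize the driver API once per process */
+ cuDeviceGet(&dev, 0);     /* handle for the first CUDA device */
+ cuDeviceGetAttribute(&maxThreadsPerBlock,
+                      CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+ * \endcode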
+ * + * \param prop - Returned properties of device + * \param dev - Device to get properties for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); + +/** + * \brief Returns the compute capability of the device + * + * \deprecated + * + * This function was deprecated as of CUDA 5.0 and its functionality superseded + * by ::cuDeviceGetAttribute(). + * + * Returns in \p *major and \p *minor the major and minor revision numbers that + * define the compute capability of the device \p dev. + * + * \param major - Major revision number + * \param minor - Minor revision number + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); + +/** @} */ /* END CUDA_DEVICE_DEPRECATED */ + +/** + * \defgroup CUDA_PRIMARY_CTX Primary Context Management + * + * ___MANBRIEF___ primary context management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the primary context management functions of the low-level + * CUDA driver application programming interface. + * + * The primary context is unique per device and shared with the CUDA runtime API. + * These functions allow integration with other libraries using CUDA. + * + * @{ + */ + +#if __CUDA_API_VERSION >= 7000 + +/** + * \brief Retain the primary context on the GPU + * + * Retains the primary context on the device, creating it if necessary, + * increasing its usage count. The caller must call + * ::cuDevicePrimaryCtxRelease() when done using the context. + * Unlike ::cuCtxCreate() the newly created context is not pushed onto the stack. + * + * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of + * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() + * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode + * of the device. + * The nvidia-smi tool can be used to set the compute mode for + * devices. Documentation for nvidia-smi can be obtained by passing a + * -h option to it. + * + * Please note that the primary context always supports pinned allocations. Other + * flags can be specified by ::cuDevicePrimaryCtxSetFlags(). 
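+ *
+ * A minimal retain/release sketch (illustrative only; \p dev is assumed to hold
+ * a valid ::CUdevice handle):
+ * \code
+ CUcontext primary;
+ if (cuDevicePrimaryCtxRetain(&primary, dev) == CUDA_SUCCESS) {
+   cuCtxSetCurrent(primary);         /* make it current; nothing is pushed */
+   /* ... issue work on this device ... */
+   cuDevicePrimaryCtxRelease(dev);   /* balance the retain when finished */
+ }
+ * \endcode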
+ * + * \param pctx - Returned context handle of the new context + * \param dev - Device for which primary context is requested + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRelease, + * ::cuDevicePrimaryCtxSetFlags, + * ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); + +/** + * \brief Release the primary context on the GPU + * + * Releases the primary context interop on the device by decreasing the usage + * count by 1. If the usage drops to 0 the primary context of device \p dev + * will be destroyed regardless of how many threads it is current to. + * + * Please note that unlike ::cuCtxDestroy() this method does not pop the context + * from stack in any circumstances. + * + * \param dev - Device which primary context is released + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); + +/** + * \brief Set flags for the primary context + * + * Sets the flags for the primary context on the device overwriting previously + * set ones. If the primary context is already created + * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE is returned. + * + * The three LSBs of the \p flags parameter can be used to control how the OS + * thread, which owns the CUDA context at the time of an API call, interacts + * with the OS scheduler when waiting for results from the GPU. Only one of + * the scheduling flags can be set when creating a context. + * + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can decrease latency when waiting for the GPU, + * but may lower the performance of CPU threads if they are performing work in + * parallel with the CUDA thread. + * + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + * results from the GPU. This can increase latency when waiting for the GPU, + * but can increase the performance of CPU threads performing work in parallel + * with the GPU. + * + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + * + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + * uses a heuristic based on the number of active CUDA contexts in the + * process \e C and the number of logical processors in the system \e P. If + * \e C > \e P, then CUDA will yield to other OS threads when waiting for + * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + * for low-powered devices. + * + * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory + * after resizing local memory for a kernel. This can prevent thrashing by + * local memory allocations when launching many kernels with high local + * memory usage at the cost of potentially increased memory usage. + * + * \param dev - Device for which the primary context flags are set + * \param flags - New flags for the device + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuDevicePrimaryCtxGetState, + * ::cuCtxCreate, + * ::cuCtxGetFlags, + * ::cudaSetDeviceFlags + */ +CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); + +/** + * \brief Get the state of the primary context + * + * Returns in \p *flags the flags for the primary context of \p dev, and in + * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag + * values. + * + * \param dev - Device to get primary context flags for + * \param flags - Pointer to store flags + * \param active - Pointer to store context state; 0 = inactive, 1 = active + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa + * ::cuDevicePrimaryCtxSetFlags, + * ::cuCtxGetFlags, + * ::cudaGetDeviceFlags + */ +CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active); + +/** + * \brief Destroy all allocations and reset all state on the primary context + * + * Explicitly destroys and cleans up all resources associated with the current + * device in the current process. + * + * Note that it is responsibility of the calling function to ensure that no + * other module in the process is using the device any more. For that reason + * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. + * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() + * even after resetting the device. 
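+ *
+ * A brief sketch (illustrative only; \p dev is assumed) combining the flag and
+ * state queries described above:
+ * \code
+ unsigned int flags;
+ int active;
+ cuDevicePrimaryCtxSetFlags(dev, CU_CTX_SCHED_BLOCKING_SYNC); /* before the primary context is created */
+ cuDevicePrimaryCtxGetState(dev, &flags, &active);            /* active stays 0 until the context is retained */
+ * \endcode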
+ * + * \param dev - Device for which primary context is destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuDevicePrimaryCtxRelease, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceReset + */ +CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); + +#endif /* __CUDA_API_VERSION >= 7000 */ + +/** @} */ /* END CUDA_PRIMARY_CTX */ + + +/** + * \defgroup CUDA_CTX Context Management + * + * ___MANBRIEF___ context management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the context management functions of the low-level + * CUDA driver application programming interface. + * + * Please note that some functions are described in + * \ref CUDA_PRIMARY_CTX "Primary Context Management" section. + * + * @{ + */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Create a CUDA context + * + * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain. + * + * Creates a new CUDA context and associates it with the calling thread. The + * \p flags parameter is described below. The context is created with a usage + * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() + * when done using the context. If a context is already current to the thread, + * it is supplanted by the newly created context and may be restored by a subsequent + * call to ::cuCtxPopCurrent(). + * + * The three LSBs of the \p flags parameter can be used to control how the OS + * thread, which owns the CUDA context at the time of an API call, interacts + * with the OS scheduler when waiting for results from the GPU. Only one of + * the scheduling flags can be set when creating a context. + * + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can decrease latency when waiting for the GPU, + * but may lower the performance of CPU threads if they are performing work in + * parallel with the CUDA thread. + * + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + * results from the GPU. This can increase latency when waiting for the GPU, + * but can increase the performance of CPU threads performing work in parallel + * with the GPU. + * + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + * + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + * uses a heuristic based on the number of active CUDA contexts in the + * process \e C and the number of logical processors in the system \e P. If + * \e C > \e P, then CUDA will yield to other OS threads when waiting for + * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + * for low-powered devices. + * + * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. + * This flag must be set in order to allocate pinned host memory that is + * accessible to the GPU. + * + * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory + * after resizing local memory for a kernel. This can prevent thrashing by + * local memory allocations when launching many kernels with high local + * memory usage at the cost of potentially increased memory usage. + * + * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of + * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() + * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the + * compute mode of the device. The nvidia-smi tool can be used to set + * the compute mode for * devices. + * Documentation for nvidia-smi can be obtained by passing a + * -h option to it. + * + * \param pctx - Returned context handle of the new context + * \param flags - Context creation flags + * \param dev - Device to create context on + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Destroy a CUDA context + * + * Destroys the CUDA context specified by \p ctx. The context \p ctx will be + * destroyed regardless of how many threads it is current to. + * It is the responsibility of the calling function to ensure that no API + * call issues using \p ctx while ::cuCtxDestroy() is executing. + * + * If \p ctx is current to the calling thread then \p ctx will also be + * popped from the current thread's context stack (as though ::cuCtxPopCurrent() + * were called). If \p ctx is current to other threads, then \p ctx will + * remain current to those threads, and attempting to access \p ctx from + * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. 
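+ *
+ * A minimal lifecycle sketch (editorial illustration; \p dev is assumed to hold
+ * a valid device handle):
+ * \code
+ CUcontext ctx;
+ if (cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev) == CUDA_SUCCESS) {
+   /* ... the new context is current on this thread ... */
+   cuCtxDestroy(ctx);  /* every cuCtxCreate() is balanced by a cuCtxDestroy() */
+ }
+ * \endcode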
+ * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Pushes a context on the current CPU thread + * + * Pushes the given context \p ctx onto the CPU thread's stack of current + * contexts. The specified context becomes the CPU thread's current context, so + * all CUDA functions that operate on the current context are affected. + * + * The previous current context may be made current again by calling + * ::cuCtxDestroy() or ::cuCtxPopCurrent(). + * + * \param ctx - Context to push + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); + +/** + * \brief Pops the current CUDA context from the current CPU thread. + * + * Pops the current CUDA context from the CPU thread and passes back the + * old context handle in \p *pctx. That context may then be made current + * to a different CPU thread by calling ::cuCtxPushCurrent(). + * + * If a context was current to the CPU thread before ::cuCtxCreate() or + * ::cuCtxPushCurrent() was called, this function makes that context current to + * the CPU thread again. + * + * \param pctx - Returned new context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); + +/** + * \brief Binds the specified CUDA context to the calling CPU thread + * + * Binds the specified CUDA context to the calling CPU thread. + * If \p ctx is NULL then the CUDA context previously bound to the + * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. + * + * If there exists a CUDA context stack on the calling CPU thread, this + * will replace the top of that stack with \p ctx. + * If \p ctx is NULL then this will be equivalent to popping the top + * of the calling CPU thread's CUDA context stack (or a no-op if the + * calling CPU thread's CUDA context stack is empty). 
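+ *
+ * A push/pop sketch (illustrative only; \p ctx is assumed to be a context
+ * created elsewhere):
+ * \code
+ CUcontext popped;
+ cuCtxPushCurrent(ctx);      /* ctx becomes current on this thread */
+ /* ... issue work against ctx ... */
+ cuCtxPopCurrent(&popped);   /* previous context is restored; popped == ctx */
+ * \endcode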
+ * + * \param ctx - Context to bind to the calling CPU thread + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa + * ::cuCtxGetCurrent, + * ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cudaSetDevice + */ +CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); + +/** + * \brief Returns the CUDA context bound to the calling CPU thread. + * + * Returns in \p *pctx the CUDA context bound to the calling CPU thread. + * If no context is bound to the calling CPU thread then \p *pctx is + * set to NULL and ::CUDA_SUCCESS is returned. + * + * \param pctx - Returned context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * \notefnerr + * + * \sa + * ::cuCtxSetCurrent, + * ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cudaGetDevice + */ +CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** + * \brief Returns the device ID for the current context + * + * Returns in \p *device the ordinal of the current context's device. + * + * \param device - Returned device ID for the current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaGetDevice + */ +CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); + +#if __CUDA_API_VERSION >= 7000 +/** + * \brief Returns the flags for the current context + * + * Returns in \p *flags the flags of the current context. See ::cuCtxCreate + * for flag values. + * + * \param flags - Pointer to store flags of current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetCurrent, + * ::cuCtxGetDevice + * ::cuCtxGetLimit, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxGetStreamPriorityRange, + * ::cudaGetDeviceFlags + */ +CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); +#endif /* __CUDA_API_VERSION >= 7000 */ + +/** + * \brief Block for a context's tasks to complete + * + * Blocks until the device has completed all preceding requested tasks. + * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. + * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the + * CPU thread will block until the GPU context has finished its work. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cudaDeviceSynchronize + */ +CUresult CUDAAPI cuCtxSynchronize(void); + +/** + * \brief Set resource limits + * + * Setting \p limit to \p value is a request by the application to update + * the current limit maintained by the context. 
The driver is free to + * modify the requested value to meet h/w requirements (this could be + * clamping to minimum or maximum values, rounding up to nearest element + * size, etc). The application can use ::cuCtxGetLimit() to find out exactly + * what the limit has been set to. + * + * Setting each ::CUlimit has its own specific restrictions, so each is + * discussed here. + * + * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. + * Note that the CUDA driver will set the \p limit to the maximum of \p value + * and what the kernel function requires. + * + * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used + * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE + * must be performed before launching any kernel that uses the ::printf() + * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used + * by the ::malloc() and ::free() device system calls. Setting + * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel + * that uses the ::malloc() or ::free() device system calls, otherwise + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of + * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting + * this limit must be performed before any launch of a kernel that uses the + * device runtime and calls ::cudaDeviceSynchronize() above the default sync + * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail + * with error code ::cudaErrorSyncDepthExceeded if the limitation is + * violated. This limit can be set smaller than the default or up the maximum + * launch depth of 24. When setting this limit, keep in mind that additional + * levels of sync depth require the driver to reserve large amounts of device + * memory which can no longer be used for user allocations. If these + * reservations of device memory fail, ::cuCtxSetLimit will return + * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability 3.5 and + * higher. Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being + * returned. + * + * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of + * outstanding device runtime launches that can be made from the current + * context. A grid is outstanding from the point of launch up until the grid + * is known to have been completed. Device runtime launches which violate + * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when + * ::cudaGetLastError() is called after launch. If more pending launches than + * the default (2048 launches) are needed for a module using the device + * runtime, this limit can be increased. Keep in mind that being able to + * sustain additional pending launches will require the driver to reserve + * larger amounts of device memory upfront which can no longer be used for + * allocations. If these reservations fail, ::cuCtxSetLimit will return + * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability 3.5 and + * higher. 
Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being + * returned. + * + * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity. + * Values can range from 0B to 128B. This is purely a performance hint and + * it can be ignored or clamped depending on the platform. + * + * \param limit - Limit to set + * \param value - Size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSynchronize, + * ::cudaDeviceSetLimit + */ +CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); + +/** + * \brief Returns resource limits + * + * Returns in \p *pvalue the current size of \p limit. The supported + * ::CUlimit values are: + * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. + * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the + * ::printf() device system call. + * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the + * ::malloc() and ::free() device system calls. + * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread + * can issue the device runtime call ::cudaDeviceSynchronize() to wait on + * child grid launches to complete. + * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding + * device runtime launches that can be made from this context. + * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity. + * + * \param limit - Limit to query + * \param pvalue - Returned size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceGetLimit + */ +CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); + +/** + * \brief Returns the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this function returns through \p pconfig the preferred cache configuration + * for the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute functions. + * + * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices + * where the size of the L1 cache and shared memory are fixed. 
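+ *
+ * Before the cache-configuration values below, a brief sketch (editorial, not
+ * part of the official documentation) of the limit functions above:
+ * \code
+ size_t stackSize = 0;
+ cuCtxSetLimit(CU_LIMIT_STACK_SIZE, 16 * 1024);   /* request a 16 KiB per-thread stack */
+ cuCtxGetLimit(&stackSize, CU_LIMIT_STACK_SIZE);  /* read back the value actually set */
+ * \endcode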
+ * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param pconfig - Returned cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceGetCacheConfig + */ +CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); + +/** + * \brief Sets the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p config the preferred cache configuration for + * the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute the function. Any function preference + * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide + * setting. Setting the context-wide cache configuration to + * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer + * to not change the cache configuration unless required to launch the kernel. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param config - Requested cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceSetCacheConfig + */ +CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); + +#if __CUDA_API_VERSION >= 4020 +/** + * \brief Returns the current shared memory configuration for the current context. + * + * This function will return in \p pConfig the current size of shared memory banks + * in the current context. On devices with configurable shared memory banks, + * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all + * subsequent kernel launches will by default use the new bank size. 
When + * ::cuCtxGetSharedMemConfig is called on devices without configurable shared + * memory, it will return the fixed bank size of the hardware. + * + * The returned bank configurations can be either: + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is + * four bytes. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width will + * eight bytes. + * + * \param pConfig - returned shared memory configuration + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuCtxGetSharedMemConfig, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceGetSharedMemConfig + */ +CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig); + +/** + * \brief Sets the shared memory configuration for the current context. + * + * On devices with configurable shared memory banks, this function will set + * the context's shared memory bank size which is used for subsequent kernel + * launches. + * + * Changed the shared memory configuration between launches may insert a device + * side synchronization point between those launches. + * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * The supported bank configurations are: + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial + * setting (currently, four bytes). + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to + * be natively four bytes. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to + * be natively eight bytes. + * + * \param config - requested shared memory configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuCtxGetSharedMemConfig, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceSetSharedMemConfig + */ +CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config); +#endif + +/** + * \brief Gets the context's API version. + * + * Returns a version number in \p version corresponding to the capabilities of + * the context (e.g. 3010 or 3020), which library developers can use to direct + * callers to a specific API version. If \p ctx is NULL, returns the API version + * used to create the currently bound context. + * + * Note that new API versions are only introduced when context capabilities are + * changed that break binary compatibility, so the API version and driver version + * may be different. 
For example, it is valid for the API version to be 3020 while + * the driver version is 4020. + * + * \param ctx - Context to check + * \param version - Pointer to version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); + +/** + * \brief Returns numerical values that correspond to the least and + * greatest stream priorities. + * + * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond + * to the least and greatest stream priorities respectively. Stream priorities + * follow a convention where lower numbers imply greater priorities. The range of + * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. + * If the user attempts to create a stream with a priority value that is + * outside the meaningful range as specified by this API, the priority is + * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority + * respectively. See ::cuStreamCreateWithPriority for details on creating a + * priority stream. + * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value + * is not desired. + * + * This function will return '0' in both \p *leastPriority and \p *greatestPriority if + * the current context's device does not support stream priorities + * (see ::cuDeviceGetAttribute). + * + * \param leastPriority - Pointer to an int in which the numerical value for least + * stream priority is returned + * \param greatestPriority - Pointer to an int in which the numerical value for greatest + * stream priority is returned + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceGetStreamPriorityRange + */ +CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); + +/** @} */ /* END CUDA_CTX */ + +/** + * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated context management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated context management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Increment a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Increments the usage count of the context and passes back a context handle + * in \p *pctx that must be passed to ::cuCtxDetach() when the application is + * done with the context. ::cuCtxAttach() fails if there is no context current + * to the thread. + * + * Currently, the \p flags parameter must be 0. 
+ * + * \param pctx - Returned context handle of the current context + * \param flags - Context attach flags (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxDetach, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); + +/** + * \brief Decrement a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Decrements the usage count of the context \p ctx, and destroys the context + * if the usage count goes to 0. The context must be a handle that was passed + * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the + * calling thread. + * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx); + +/** @} */ /* END CUDA_CTX_DEPRECATED */ + + +/** + * \defgroup CUDA_MODULE Module Management + * + * ___MANBRIEF___ module management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the module management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Loads a compute module + * + * Takes a filename \p fname and loads the corresponding module \p module into + * the current context. The CUDA driver API does not attempt to lazily + * allocate the resources needed by a module; if the memory for functions and + * data (constant and global) needed by the module cannot be allocated, + * ::cuModuleLoad() fails. The file should be a \e cubin file as output by + * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or + * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. + * + * \param module - Returned module + * \param fname - Filename of module to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_FILE_NOT_FOUND, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); + +/** + * \brief Load a module's data + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. 
The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. + * + * \param module - Returned module + * \param image - Module data to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); + +/** + * \brief Load a module's data with options + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. Options are passed as + * an array via \p options and any corresponding parameters are passed in + * \p optionValues. The number of total options is supplied via \p numOptions. + * Any outputs will be returned via \p optionValues. + * + * \param module - Returned module + * \param image - Module data to load + * \param numOptions - Number of options + * \param options - Options for JIT + * \param optionValues - Option values for JIT + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Load a module's data + * + * Takes a pointer \p fatCubin and loads the corresponding module \p module + * into the current context. The pointer represents a fat binary object, + * which is a collection of different \e cubin and/or \e PTX files, all + * representing the same device code, but compiled and optimized for different + * architectures. + * + * Prior to CUDA 4.0, there was no documented API for constructing and using + * fat binary objects by programmers. Starting with CUDA 4.0, fat binary + * objects can be constructed by providing the -fatbin option to \b nvcc. + * More information can be found in the \b nvcc document. 
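+ *
+ * A module-loading sketch (illustrative only; the file name "my_kernels.ptx"
+ * and kernel name "my_kernel" are placeholders):
+ * \code
+ CUmodule mod;
+ CUfunction fn;
+ if (cuModuleLoad(&mod, "my_kernels.ptx") == CUDA_SUCCESS) {
+   cuModuleGetFunction(&fn, mod, "my_kernel");
+   /* ... launch fn ... */
+   cuModuleUnload(mod);
+ }
+ * \endcode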
+ * + * \param module - Returned module + * \param fatCubin - Fat binary to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); + +/** + * \brief Unloads a module + * + * Unloads a module \p hmod from the current context. + * + * \param hmod - Module to unload + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary + */ +CUresult CUDAAPI cuModuleUnload(CUmodule hmod); + +/** + * \brief Returns a function handle + * + * Returns in \p *hfunc the handle of the function of name \p name located in + * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() + * returns ::CUDA_ERROR_NOT_FOUND. + * + * \param hfunc - Returned function handle + * \param hmod - Module to retrieve function from + * \param name - Name of function to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Returns a global pointer from a module + * + * Returns in \p *dptr and \p *bytes the base pointer and size of the + * global of name \p name located in module \p hmod. If no variable of that name + * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both + * parameters \p dptr and \p bytes are optional. If one of them is + * NULL, it is ignored. 
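+ *
+ * A short sketch (illustrative only; \p mod is an already-loaded module and the
+ * symbol name "scalars" is a placeholder):
+ * \code
+ CUdeviceptr dptr;
+ size_t bytes;
+ if (cuModuleGetGlobal(&dptr, &bytes, mod, "scalars") == CUDA_SUCCESS) {
+   /* dptr and bytes now describe the module's global named "scalars" */
+ }
+ * \endcode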
+ * + * \param dptr - Returned global device pointer + * \param bytes - Returned global size in bytes + * \param hmod - Module to retrieve global from + * \param name - Name of global to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetSymbolAddress, + * ::cudaGetSymbolSize + */ +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Returns a handle to a texture reference + * + * Returns in \p *pTexRef the handle of the texture reference of name \p name + * in the module \p hmod. If no texture reference of that name exists, + * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference + * handle should not be destroyed, since it will be destroyed when the module + * is unloaded. + * + * \param pTexRef - Returned texture reference + * \param hmod - Module to retrieve texture reference from + * \param name - Name of texture reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetSurfRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetTextureReference + */ +CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); + +/** + * \brief Returns a handle to a surface reference + * + * Returns in \p *pSurfRef the handle of the surface reference of name \p name + * in the module \p hmod. If no surface reference of that name exists, + * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. + * + * \param pSurfRef - Returned surface reference + * \param hmod - Module to retrieve surface reference from + * \param name - Name of surface reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetSurfaceReference + */ +CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); + +#if __CUDA_API_VERSION >= 5050 + +/** + * \brief Creates a pending JIT linker invocation. + * + * If the call is successful, the caller owns the returned CUlinkState, which + * should eventually be destroyed with ::cuLinkDestroy. The + * device code machine size (32 or 64 bit) will match the calling application. + * + * Both linker and compiler options may be specified. Compiler options will + * be applied to inputs to this linker action which must be compiled from PTX. + * The options ::CU_JIT_WALL_TIME, + * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES + * will accumulate data until the CUlinkState is destroyed. 
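+ *
+ * An end-to-end sketch (editorial illustration; "kernels.ptx" is a placeholder
+ * path) combining the linker entry points in this group:
+ * \code
+ CUlinkState link;
+ CUmodule mod;
+ void *cubin;
+ size_t cubinSize;
+ cuLinkCreate(0, NULL, NULL, &link);
+ cuLinkAddFile(link, CU_JIT_INPUT_PTX, "kernels.ptx", 0, NULL, NULL);
+ cuLinkComplete(link, &cubin, &cubinSize);
+ cuModuleLoadData(&mod, cubin);   /* load before the link state is destroyed */
+ cuLinkDestroy(link);
+ * \endcode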
+ * + * \p optionValues must remain valid for the life of the CUlinkState if output + * options are used. No other references to inputs are maintained after this + * call returns. + * + * \param numOptions Size of options arrays + * \param options Array of linker and compiler options + * \param optionValues Array of option values, each cast to void * + * \param stateOut On success, this will contain a CUlinkState to specify + * and complete this action + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuLinkAddData, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); + +/** + * \brief Add an input to a pending linker invocation + * + * Ownership of \p data is retained by the caller. No reference is retained to any + * inputs after this call returns. + * + * This method accepts only compiler options, which are used if the data must + * be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * \param state A pending linker action. + * \param type The type of the input data. + * \param data The input data. PTX must be NULL-terminated. + * \param size The length of the input data. + * \param name An optional name for this input in log messages. + * \param numOptions Size of options. + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). + * \param optionValues Array of option values, each cast to void *. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Add a file input to a pending linker invocation + * + * No reference is retained to any inputs after this call returns. + * + * This method accepts only compiler options, which are used if the input + * must be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * This method is equivalent to invoking ::cuLinkAddData on the contents + * of the file. 
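+ *
+ * For example (illustrative only; \p lState is a previously created
+ * linker state and the file path is hypothetical):
+ * \code
+   cuLinkAddFile(lState, CU_JIT_INPUT_PTX, "kernels.ptx", 0, NULL, NULL);
+ * \endcode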
+ * + * \param state A pending linker action + * \param type The type of the input data + * \param path Path to the input file + * \param numOptions Size of options + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) + * \param optionValues Array of option values, each cast to void * + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_FILE_NOT_FOUND + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddData, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Complete a pending linker invocation + * + * Completes the pending linker action and returns the cubin image for the linked + * device code, which can be used with ::cuModuleLoadData. The cubin is owned by + * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. + * This call does not destroy \p state. + * + * \param state A pending linker invocation + * \param cubinOut On success, this will point to the output image + * \param sizeOut Optional parameter to receive the size of the generated image + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuLinkCreate, + * ::cuLinkAddData, + * ::cuLinkAddFile, + * ::cuLinkDestroy, + * ::cuModuleLoadData + */ +CUresult CUDAAPI +cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); + +/** + * \brief Destroys state for a JIT linker invocation. + * + * \param state State object for the linker invocation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE + * + * \sa ::cuLinkCreate + */ +CUresult CUDAAPI +cuLinkDestroy(CUlinkState state); + +#endif /* __CUDA_API_VERSION >= 5050 */ + +/** @} */ /* END CUDA_MODULE */ + + +/** + * \defgroup CUDA_MEM Memory Management + * + * ___MANBRIEF___ memory management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the memory management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Gets free and total memory + * + * Returns in \p *free and \p *total respectively, the free and total amount of + * memory available for allocation by the CUDA context, in bytes. 
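+ *
+ * For example (illustrative only; error handling omitted):
+ * \code
+   size_t freeBytes, totalBytes;
+   cuMemGetInfo(&freeBytes, &totalBytes);
+   // freeBytes and totalBytes now hold the context's free and total memory.
+ * \endcode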
+ * + * \param free - Returned free memory in bytes + * \param total - Returned total memory in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemGetInfo + */ +CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total); + +/** + * \brief Allocates device memory + * + * Allocates \p bytesize bytes of linear memory on the device and returns in + * \p *dptr a pointer to the allocated memory. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p bytesize + * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE. + * + * \param dptr - Returned device pointer + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMalloc + */ +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); + +/** + * \brief Allocates pitched device memory + * + * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on + * the device and returns in \p *dptr a pointer to the allocated memory. The + * function may pad the allocation to ensure that corresponding pointers in + * any given row will continue to meet the alignment requirements for + * coalescing as the address is updated from row to row. \p ElementSizeBytes + * specifies the size of the largest reads and writes that will be performed + * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced + * memory transactions are not possible on other data sizes). If + * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, + * the kernel will run correctly, but possibly at reduced speed. The pitch + * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the + * allocation. 
The intended usage of pitch is as a separate parameter of the + * allocation, used to compute addresses within the 2D array. Given the row + * and column of an array element of type \b T, the address is computed as: + * \code + T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; + * \endcode + * + * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with + * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is + * recommended that programmers consider performing pitch allocations using + * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is + * especially true if the application will be performing 2D memory copies + * between different regions of device memory (whether linear memory or CUDA + * arrays). + * + * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed + * to match or exceed the alignment requirement for texture binding with + * ::cuTexRefSetAddress2D(). + * + * \param dptr - Returned device pointer + * \param pPitch - Returned pitch of allocation in bytes + * \param WidthInBytes - Requested allocation width in bytes + * \param Height - Requested allocation height in rows + * \param ElementSizeBytes - Size of largest reads/writes for range + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocPitch + */ +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); + +/** + * \brief Frees device memory + * + * Frees the memory space pointed to by \p dptr, which must have been returned + * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). 
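+ *
+ * A minimal allocate/free sketch (illustrative only; error handling
+ * omitted):
+ * \code
+   CUdeviceptr dBuf;
+   cuMemAlloc(&dBuf, 1024 * sizeof(float));
+   // ... use dBuf ...
+   cuMemFree(dBuf);
+ * \endcode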
+ * + * \param dptr - Pointer to memory to free + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFree + */ +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); + +/** + * \brief Get information on memory allocations + * + * Returns the base address in \p *pbase and size in \p *psize of the + * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input + * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one + * of them is NULL, it is ignored. + * + * \param pbase - Returned base address + * \param psize - Returned size of device memory allocation + * \param dptr - Device pointer to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and + * accessible to the device. The driver tracks the virtual memory ranges + * allocated with this function and automatically accelerates calls to + * functions such as ::cuMemcpy(). Since the memory can be accessed directly by + * the device, it can be read or written with much higher bandwidth than + * pageable memory obtained with functions such as ::malloc(). Allocating + * excessive amounts of memory with ::cuMemAllocHost() may degrade system + * performance, since it reduces the amount of memory available to the system + * for paging. As a result, this function is best used sparingly to allocate + * staging areas for data exchange between host and device. 
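+ *
+ * A minimal pinned staging-buffer sketch (illustrative only; error
+ * handling omitted):
+ * \code
+   void *hStaging;
+   cuMemAllocHost(&hStaging, 1 << 20);   // 1 MiB of page-locked memory
+   // ... fill hStaging, then copy with e.g. cuMemcpyHtoD() ...
+   cuMemFreeHost(hStaging);
+ * \endcode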
+ * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * The device pointer that may be used to access this host memory from those + * contexts is always equal to the returned host pointer \p *pp. + * See \ref CUDA_UNIFIED for additional details. + * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocHost + */ +CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Frees page-locked host memory + * + * Frees the memory space pointed to by \p p, which must have been returned by + * a previous call to ::cuMemAllocHost(). + * + * \param p - Pointer to memory to free + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFreeHost + */ +CUresult CUDAAPI cuMemFreeHost(void *p); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, + * it can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. 
As a result, this function is + * best used sparingly to allocate staging areas for data exchange between + * host and device. + * + * The \p Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address + * space. The device pointer to the memory may be obtained by calling + * ::cuMemHostGetDevicePointer(). + * + * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined + * (WC). WC memory can be transferred across the PCI Express bus more + * quickly on some system configurations, but cannot be read efficiently by + * most CPUs. WC memory is a good option for buffers that will be written by + * the CPU and read by the GPU via mapped pinned memory or host->device + * transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in + * order for the ::CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect. + * + * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. + * + * The memory allocated by this function must be freed with ::cuMemFreeHost(). + * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer + * that may be used to access this host memory from those contexts is always equal + * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED + * is specified, then the function ::cuMemHostGetDevicePointer() must be used + * to query the device pointer, even if the context supports unified addressing. + * See \ref CUDA_UNIFIED for additional details. 
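+ *
+ * A minimal mapped, pinned allocation sketch (illustrative only; error
+ * handling is omitted and the context is assumed to have been created
+ * with ::CU_CTX_MAP_HOST):
+ * \code
+   void *hBuf;
+   CUdeviceptr dBuf;
+   cuMemHostAlloc(&hBuf, 1 << 20, CU_MEMHOSTALLOC_DEVICEMAP);
+   cuMemHostGetDevicePointer(&dBuf, hBuf, 0);
+   // dBuf may now be passed to device code; hBuf is accessible on the host.
+   cuMemFreeHost(hBuf);
+ * \endcode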
+ * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * \param Flags - Flags for allocation request + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaHostAlloc + */ +CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Passes back device pointer of mapped pinned memory + * + * Passes back the device pointer \p pdptr corresponding to the mapped, pinned + * host buffer \p p allocated by ::cuMemHostAlloc. + * + * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP + * flag was not specified at the time the memory was allocated, or if the + * function is called on a GPU that does not support mapped pinned memory. + * + * For devices that have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory + * can also be accessed from the device using the host pointer \p p. + * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not + * match the original host pointer \p p and depends on the devices visible to the + * application. If all devices visible to the application have a non-zero value for the + * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() + * will match the original pointer \p p. If any device visible to the application + * has a zero value for the device attribute, the device pointer returned by + * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, + * but it will be suitable for use on all devices provided Unified Virtual Addressing + * is enabled. In such systems, it is valid to access the memory using either pointer + * on devices that have a non-zero value for the device attribute. Note however that + * such devices should access the memory using only of the two pointers and not both. + * + * \p Flags provides for future releases. For now, it must be set to 0. 
+ * + * \param pdptr - Returned device pointer + * \param p - Host pointer + * \param Flags - Options (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaHostGetDevicePointer + */ +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Passes back flags that were used for a pinned allocation + * + * Passes back the flags \p pFlags that were specified when allocating + * the pinned host buffer \p p allocated by ::cuMemHostAlloc. + * + * ::cuMemHostGetFlags() will fail if the pointer does not reside in + * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). + * + * \param pFlags - Returned flags word + * \param p - Host pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuMemAllocHost, + * ::cuMemHostAlloc, + * ::cudaHostGetFlags + */ +CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); + +#if __CUDA_API_VERSION >= 6000 + +/** + * \brief Allocates memory that will be automatically managed by the Unified Memory system + * + * Allocates \p bytesize bytes of managed memory on the device and returns in + * \p *dptr a pointer to the allocated memory. If the device doesn't support + * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support + * for managed memory can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p bytesize + * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer + * is valid on the CPU and on all GPUs in the system that support managed memory. + * All accesses to this pointer must obey the Unified Memory programming model. + * + * \p flags specifies the default stream association for this allocation. + * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If + * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from + * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the + * allocation should not be accessed from devices that have a zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to + * ::cuStreamAttachMemAsync will be required to enable access on such devices. + * + * If the association is later changed via ::cuStreamAttachMemAsync to + * a single stream, the default association as specified during ::cuMemAllocManaged + * is restored when that stream is destroyed. 
For __managed__ variables, the + * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a + * stream is an asynchronous operation, and as a result, the change to default + * association won't happen until all work in the stream has completed. + * + * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree. + * + * Device memory oversubscription is possible for GPUs that have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on + * such GPUs may be evicted from device memory to host memory at any time by the Unified + * Memory driver in order to make room for other allocations. + * + * In a multi-GPU system where all GPUs have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this + * API returns and instead may be populated on access. In such systems, managed memory can + * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to + * maintain data locality and prevent excessive page faults to the extent possible. The application + * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application + * can also explicitly migrate memory to a desired processor's memory via + * ::cuMemPrefetchAsync. + * + * In a multi-GPU system where all of the GPUs have a zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support + * with each other, the physical storage for managed memory is created on the GPU which is active + * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced + * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate + * memory among such GPUs. + * + * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and + * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + * is zero for at least one of those GPUs, the location chosen for physical storage of managed + * memory is system-dependent. + * - On Linux, the location chosen will be device memory as long as the current set of active + * contexts are on devices that either have peer-to-peer support with each other or have a + * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * If there is an active context on a GPU that does not have a non-zero value for that device + * attribute and it does not have peer-to-peer support with the other devices that have active + * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. + * Note that this means that managed memory that is located in device memory is migrated to + * host memory if a new context is created on a GPU that doesn't have a non-zero value for + * the device attribute and does not support peer-to-peer with at least one of the other devices + * that has an active context. This in turn implies that context creation may fail if there is + * insufficient host memory to migrate all managed allocations. + * - On Windows, the physical storage is always created in 'zero-copy' or host memory. + * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these + * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to + * restrict CUDA to only use those GPUs that have peer-to-peer support. 
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a + * non-zero value to force the driver to always use device memory for physical storage. + * When this environment variable is set to a non-zero value, all contexts created in + * that process on devices that support managed memory have to be peer-to-peer compatible + * with each other. Context creation will fail if a context is created on a device that + * supports managed memory and is not peer-to-peer compatible with any of the other + * managed memory supporting devices on which contexts were previously created, even if + * those contexts have been destroyed. These environment variables are described + * in the CUDA programming guide under the "CUDA environment variables" section. + * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. + * + * \param dptr - Returned device pointer + * \param bytesize - Requested allocation size in bytes + * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync, + * ::cudaMallocManaged + */ +CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); + +#endif /* __CUDA_API_VERSION >= 6000 */ + +#if __CUDA_API_VERSION >= 4010 + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device handle given a PCI bus ID string. + * + * \param dev - Returned device handle + * + * \param pciBusId - String in one of the following forms: + * [domain]:[bus]:[device].[function] + * [domain]:[bus]:[device] + * [bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGet, + * ::cuDeviceGetAttribute, + * ::cuDeviceGetPCIBusId, + * ::cudaDeviceGetByPCIBusId + */ +CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId); + +/** + * \brief Returns a PCI Bus Id string for the device + * + * Returns an ASCII string identifying the device \p dev in the NULL-terminated + * string pointed to by \p pciBusId. \p len specifies the maximum length of the + * string that may be returned. + * + * \param pciBusId - Returned identifier string for the device in the following format + * [domain]:[bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. 
+ * pciBusId should be large enough to store 13 characters including the NULL-terminator. + * + * \param len - Maximum length of string to store in \p name + * + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGet, + * ::cuDeviceGetAttribute, + * ::cuDeviceGetByPCIBusId, + * ::cudaDeviceGetPCIBusId + */ +CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); + +/** + * \brief Gets an interprocess handle for a previously allocated event + * + * Takes as input a previously allocated event. This event must have been + * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING + * flags set. This opaque handle may be copied into other processes and + * opened with ::cuIpcOpenEventHandle to allow efficient hardware + * synchronization between GPU work in different processes. + * + * After the event has been opened in the importing process, + * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and + * ::cuEventQuery may be used in either process. Performing operations + * on the imported event after the exported event has been freed + * with ::cuEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param pHandle - Pointer to a user allocated CUipcEventHandle + * in which to return the opaque event handle + * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and + * ::CU_EVENT_DISABLE_TIMING flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, + * ::cuEventSynchronize, + * ::cuEventQuery, + * ::cuStreamWaitEvent, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcGetEventHandle + */ +CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); + +/** + * \brief Opens an interprocess event handle for use in the current process + * + * Opens an interprocess event handle exported from another process with + * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like + * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. + * This event must be freed with ::cuEventDestroy. + * + * Performing operations on the imported event after the exported event has + * been freed with ::cuEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. 
+ * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param phEvent - Returns the imported event + * \param handle - Interprocess handle to open + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, + * ::cuEventSynchronize, + * ::cuEventQuery, + * ::cuStreamWaitEvent, + * ::cuIpcGetEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcOpenEventHandle + */ +CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); + +/** + * \brief Gets an interprocess memory handle for an existing device memory + * allocation + * + * Takes a pointer to the base of an existing device memory allocation created + * with ::cuMemAlloc and exports it for use in another process. This is a + * lightweight operation and may be called multiple times on an allocation + * without adverse effects. + * + * If a region of memory is freed with ::cuMemFree and a subsequent call + * to ::cuMemAlloc returns memory with the same device address, + * ::cuIpcGetMemHandle will return a unique handle for the + * new memory. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return + * the handle in. + * \param dptr - Base pointer to previously allocated device memory + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcGetMemHandle + */ +CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); + +/** + * \brief Opens an interprocess memory handle exported from another process + * and returns a device pointer usable in the local process. + * + * Maps memory exported from another process with ::cuIpcGetMemHandle into + * the current device address space. For contexts on different devices + * ::cuIpcOpenMemHandle can attempt to enable peer access between the + * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is + * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. + * ::cuDeviceCanAccessPeer can determine if a mapping is possible. + * + * ::cuIpcOpenMemHandle can open handles to devices that may not be visible + * in the process calling the API. + * + * Contexts that may open ::CUipcMemHandles are restricted in the following way. + * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened + * by one ::CUcontext per ::CUdevice per other process. + * + * Memory returned from ::cuIpcOpenMemHandle must be freed with + * ::cuIpcCloseMemHandle. + * + * Calling ::cuMemFree on an exported memory region before calling + * ::cuIpcCloseMemHandle in the importing context will result in undefined + * behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. 
+ * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param pdptr - Returned device pointer + * \param handle - ::CUipcMemHandle to open + * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_TOO_MANY_PEERS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \note No guarantees are made about the address returned in \p *pdptr. + * In particular, multiple processes may not receive the same address for the same \p handle. + * + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcCloseMemHandle, + * ::cuCtxEnablePeerAccess, + * ::cuDeviceCanAccessPeer, + * ::cudaIpcOpenMemHandle + */ +CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); + +/** + * \brief Close memory mapped with ::cuIpcOpenMemHandle + * + * Unmaps memory returned by ::cuIpcOpenMemHandle. The original allocation + * in the exporting process as well as imported mappings in other processes + * will be unaffected. + * + * Any resources used to enable peer access will be freed if this is the + * last mapping using them. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cudaIpcCloseMemHandle + */ +CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); + +#endif /* __CUDA_API_VERSION >= 4010 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Registers an existing host memory range for use by CUDA + * + * Page-locks the memory range specified by \p p and \p bytesize and maps it + * for the device(s) as specified by \p Flags. This memory range also is added + * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed + * directly by the device, it can be read or written with much higher bandwidth + * than pageable memory that has not been registered. Page-locking excessive + * amounts of memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to register staging areas for data exchange between + * host and device. + * + * This function has limited support on Mac OS X. OS 10.7 or higher is required. + * + * The \p Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address + * space. The device pointer to the memory may be obtained by calling + * ::cuMemHostGetDevicePointer(). 
+ * + * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some + * I/O memory space, e.g. the PCI Express resource of a 3rd party device. + * + * All of these flags are orthogonal to one another: a developer may page-lock + * memory that is portable or mapped with no restrictions. + * + * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in + * order for the ::CU_MEMHOSTREGISTER_DEVICEMAP flag to have any effect. + * + * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag. + * + * For devices that have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory + * can also be accessed from the device using the host pointer \p p. + * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not + * match the original host pointer \p ptr and depends on the devices visible to the + * application. If all devices visible to the application have a non-zero value for the + * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() + * will match the original pointer \p ptr. If any device visible to the application + * has a zero value for the device attribute, the device pointer returned by + * ::cuMemHostGetDevicePointer() will not match the original host pointer \p ptr, + * but it will be suitable for use on all devices provided Unified Virtual Addressing + * is enabled. In such systems, it is valid to access the memory using either pointer + * on devices that have a non-zero value for the device attribute. Note however that + * such devices should access the memory using only of the two pointers and not both. + * + * The memory page-locked by this function must be unregistered with + * ::cuMemHostUnregister(). + * + * \param p - Host pointer to memory to page-lock + * \param bytesize - Size in bytes of the address range to page-lock + * \param Flags - Flags for allocation request + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED, + * ::CUDA_ERROR_NOT_PERMITTED, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa + * ::cuMemHostUnregister, + * ::cuMemHostGetFlags, + * ::cuMemHostGetDevicePointer, + * ::cudaHostRegister + */ +CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); + +/** + * \brief Unregisters a memory range that was registered with cuMemHostRegister. + * + * Unmaps the memory range whose base address is specified by \p p, and makes + * it pageable again. + * + * The base address must be the same one specified to ::cuMemHostRegister(). + * + * \param p - Host pointer to memory to unregister + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + * \notefnerr + * + * \sa + * ::cuMemHostRegister, + * ::cudaHostUnregister + */ +CUresult CUDAAPI cuMemHostUnregister(void *p); + +/** + * \brief Copies memory + * + * Copies data between two pointers. 
+ * \p dst and \p src are base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * Note that this function infers the type of the transfer (host to host, host to + * device, device to device, or device to host) from the pointer values. This + * function is only allowed in contexts which support unified addressing. + * + * \param dst - Destination unified virtual address space pointer + * \param src - Source unified virtual address space pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); + +/** + * \brief Copies device memory between two contexts + * + * Copies from device memory in one context to device memory in another + * context. \p dstDevice is the base device pointer of the destination memory + * and \p dstContext is the destination context. \p srcDevice is the base + * device pointer of the source memory and \p srcContext is the source pointer. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param dstContext - Destination context + * \param srcDevice - Source device pointer + * \param srcContext - Source context + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpyPeer + */ +CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); + +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Copies memory from Host to Device + * + * Copies from host memory to device memory. \p dstDevice and \p srcHost are + * the base addresses of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. 
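+ *
+ * A minimal host-to-device copy sketch (illustrative only; error handling
+ * omitted):
+ * \code
+   float hData[256];
+   CUdeviceptr dData;
+   // ... fill hData ...
+   cuMemAlloc(&dData, sizeof(hData));
+   cuMemcpyHtoD(dData, hData, sizeof(hData));
+   // ... launch work, then copy back with cuMemcpyDtoH() if needed ...
+   cuMemFree(dData);
+ * \endcode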
+ * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol + */ +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. 
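+ *
+ * For example (illustrative only; \p nBytes is hypothetical and error
+ * handling is omitted):
+ * \code
+   CUdeviceptr dSrc, dDst;
+   cuMemAlloc(&dSrc, nBytes);
+   cuMemAlloc(&dDst, nBytes);
+   // ... fill dSrc ...
+   cuMemcpyDtoD(dDst, dSrc, nBytes);
+ * \endcode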
+ * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Array + * + * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting index of the destination data. + * \p srcDevice specifies the base pointer of the source. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyToArray + */ +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Array to Device + * + * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the + * base pointer of the destination and must be naturally aligned with the CUDA + * array elements. \p srcArray and \p srcOffset specify the CUDA array handle + * and the offset in bytes into the array where the copy is to begin. + * \p ByteCount specifies the number of bytes to copy and must be evenly + * divisible by the array element size. 
+ * + * \param dstDevice - Destination device pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyFromArray + */ +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); + +/** + * \brief Copies memory from Host to Array + * + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting offset in bytes of the destination + * data. \p pSrc specifies the base address of the source. \p ByteCount specifies + * the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyToArray + */ +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); + +/** + * \brief Copies memory from Array to Host + * + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA + * array handle and starting offset in bytes of the source data. + * \p ByteCount specifies the number of bytes to copy. 
+ *
+ * \param dstHost - Destination host pointer
+ * \param srcArray - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyFromArray
+ */
+CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Array
+ *
+ * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
+ * specify the handles of the destination and source CUDA arrays for the copy,
+ * respectively. \p dstOffset and \p srcOffset specify the destination and
+ * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
+ * bytes to be copied. The CUDA arrays need not have the same format, but their
+ * elements must be the same size, and \p ByteCount must be evenly divisible by
+ * that element size.
+ *
+ * \param dstArray - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcArray - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyArrayToArray
+ */
+CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. 
+ * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * + * \par + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run + * significantly slower in the cases where ::cuMemcpy2D() would have returned + * an error code. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, + * ::cudaMemcpy2DFromArray + */ +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
+ * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. 
+ * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * + * \par + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run + * significantly slower in the cases where ::cuMemcpy2D() would have returned + * an error code. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, + * ::cudaMemcpy2DFromArray + */ +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); + +/** + * \brief Copies memory for 3D arrays + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. 
The ::CUDA_MEMCPY3D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY3D_st { + + unsigned int srcXInBytes, srcY, srcZ; + unsigned int srcLOD; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; // ignored when src is array + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 + + unsigned int dstXInBytes, dstY, dstZ; + unsigned int dstLOD; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; // ignored when dst is array + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 + + unsigned int WidthInBytes; + unsigned int Height; + unsigned int Depth; + } CUDA_MEMCPY3D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and + * ::srcHeight specify the (host) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and + * ::srcHeight specify the (device) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and + * ::srcHeight are ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data, the bytes per row, + * and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data, the bytes per + * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and + * ::dstHeight are ignored. + * + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source + * data for the copy. 
+ * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes, ::dstY and ::dstZ specify the base address of the + * destination data for the copy. + * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height + * and depth of the 3D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * - If specified, ::srcHeight must be greater than or equal to ::Height + + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. + * + * \par + * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). + * + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be + * set to 0. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy3D + */ +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Copies memory between contexts + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure + * for documentation of its parameters. 
+ * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpy3DPeer + */ +CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); + +/** + * \brief Copies memory asynchronously + * + * Copies data between two pointers. + * \p dst and \p src are base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * Note that this function infers the type of the transfer (host to host, host to + * device, device to device, or device to host) from the pointer values. This + * function is only allowed in contexts which support unified addressing. + * + * \param dst - Destination unified virtual address space pointer + * \param src - Source unified virtual address space pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies device memory between two contexts asynchronously. + * + * Copies from device memory in one context to device memory in another + * context. \p dstDevice is the base device pointer of the destination memory + * and \p dstContext is the destination context. \p srcDevice is the base + * device pointer of the source memory and \p srcContext is the source pointer. + * \p ByteCount specifies the number of bytes to copy. 
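+ *
+ * \par
+ * A minimal sketch, assuming dSrc and dDst were allocated in the previously
+ * created contexts srcCtx and dstCtx respectively, and that hStream belongs to
+ * the current context; error handling is omitted and the names are placeholders:
+ * \code
+   size_t bytes = 1 << 20;                                        // 1 MiB
+   cuMemcpyPeerAsync(dDst, dstCtx, dSrc, srcCtx, bytes, hStream); // queued, non-blocking
+   cuStreamSynchronize(hStream);                                  // wait only when the data is needed
+ * \endcode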
+ * + * \param dstDevice - Destination device pointer + * \param dstContext - Destination context + * \param srcDevice - Source device pointer + * \param srcContext - Source context + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpyPeerAsync + */ +CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Copies memory from Host to Device + * + * Copies from host memory to device memory. \p dstDevice and \p srcHost are + * the base addresses of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync + */ +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. 
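+ *
+ * \par
+ * A minimal stream-ordered round trip, assuming a current context; page-locked
+ * host buffers (from ::cuMemAllocHost) are used so the copies can overlap with
+ * other work, and error checks are omitted:
+ * \code
+   float *hIn, *hOut;
+   cuMemAllocHost((void **)&hIn,  1024 * sizeof(float));
+   cuMemAllocHost((void **)&hOut, 1024 * sizeof(float));
+
+   CUdeviceptr dBuf;
+   cuMemAlloc(&dBuf, 1024 * sizeof(float));
+
+   CUstream stream;
+   cuStreamCreate(&stream, 0);                                   // default flags
+
+   cuMemcpyHtoDAsync(dBuf, hIn, 1024 * sizeof(float), stream);   // host -> device
+   // ... kernels working on dBuf could be launched on the same stream here ...
+   cuMemcpyDtoHAsync(hOut, dBuf, 1024 * sizeof(float), stream);  // device -> host
+   cuStreamSynchronize(stream);                                  // hOut is valid after this
+ * \endcode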
+ * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Host to Array + * + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting offset in bytes of the + * destination data. \p srcHost specifies the base address of the source. + * \p ByteCount specifies the number of bytes to copy. 
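+ *
+ * \par
+ * A brief sketch (current context assumed, no error checks): queueing an upload
+ * into a 1D array on a stream, where arr, pinnedSrc (page-locked host memory),
+ * nBytes and stream stand for objects created earlier:
+ * \code
+   cuMemcpyHtoAAsync(arr, 0, pinnedSrc, nBytes, stream);
+   cuStreamSynchronize(stream);   // pinnedSrc may be reused after this returns
+ * \endcode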
+ * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyToArrayAsync + */ +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Array to Host + * + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA + * array handle and starting offset in bytes of the source data. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstHost - Destination pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyFromArrayAsync + */ +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
+ * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. 
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+ void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+ CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ * the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ * ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ * ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch().
+ *
+ * \param pCopy - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync
+ */
+CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+
+/**
+ * \brief Copies memory for 3D arrays
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy.
The ::CUDA_MEMCPY3D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY3D_st { + + unsigned int srcXInBytes, srcY, srcZ; + unsigned int srcLOD; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; // ignored when src is array + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 + + unsigned int dstXInBytes, dstY, dstZ; + unsigned int dstLOD; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; // ignored when dst is array + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 + + unsigned int WidthInBytes; + unsigned int Height; + unsigned int Depth; + } CUDA_MEMCPY3D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and + * ::srcHeight specify the (host) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and + * ::srcHeight specify the (device) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and + * ::srcHeight are ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data, the bytes per row, + * and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data, the bytes per + * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and + * ::dstHeight are ignored. + * + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source + * data for the copy. 
+ * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes, ::dstY and ::dstZ specify the base address of the + * destination data for the copy. + * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height + * and depth of the 3D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * - If specified, ::srcHeight must be greater than or equal to ::Height + + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. + * + * \par + * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). + * + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be + * set to 0. + * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpy3DAsync + */ +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Copies memory between contexts asynchronously. + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure + * for documentation of its parameters. 
+ * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpy3DPeerAsync + */ +CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. + * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. 
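+ *
+ * \par
+ * A small sketch, assuming a current context and no error checking; note that
+ * \p N counts 16-bit elements, not bytes:
+ * \code
+   CUdeviceptr dBuf;
+   cuMemAlloc(&dBuf, 1024);               // 1024 bytes
+   cuMemsetD8(dBuf, 0x00, 1024);          // clear all 1024 bytes
+   cuMemsetD16(dBuf, 0xABCD, 1024 / 2);   // refill the same range as 512 16-bit values
+ * \endcode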
+ * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. + * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). 
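+ *
+ * \par
+ * One possible pairing with ::cuMemAllocPitch (current context assumed, no
+ * error checks); the pitch returned by the allocation is the row stride handed
+ * back to the memset:
+ * \code
+   CUdeviceptr dImg;
+   size_t pitch;
+   cuMemAllocPitch(&dImg, &pitch, 640, 480, 4);   // 640 bytes per row, 480 rows
+   cuMemsetD2D8(dImg, pitch, 0xFF, 640, 480);     // set every byte of the 640x480 region
+ * \endcode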
+ * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. 
The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param ui - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. + * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. 
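+ *
+ * \par
+ * A short sketch, assuming dBuf, hostDst, nElems and stream were set up earlier
+ * and that the context is current; the memset is ordered before the copy on the
+ * same stream:
+ * \code
+   cuMemsetD16Async(dBuf, 0, nElems, stream);
+   cuMemcpyDtoHAsync(hostDst, dBuf, nElems * sizeof(unsigned short), stream);
+   cuStreamSynchronize(stream);   // hostDst now holds the cleared values
+ * \endcode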
+ * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. + * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). 
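+ *
+ * A minimal usage sketch combining this call with ::cuMemAllocPitch() (assuming a context
+ * is already current; error checking is omitted and the dimensions are placeholders):
+ * \code
+      CUdeviceptr d_img;
+      size_t      pitch;
+      size_t      widthBytes = 640, height = 480;              // placeholder dimensions
+      CUstream    stream;
+      cuStreamCreate(&stream, CU_STREAM_DEFAULT);
+      cuMemAllocPitch(&d_img, &pitch, widthBytes, height, 4);  // driver-selected pitch
+      cuMemsetD2D8Async(d_img, pitch, 0, widthBytes, height, stream); // zero each row
+      cuStreamSynchronize(stream);
+      cuMemFree(d_img);
+      cuStreamDestroy(stream);
+ * \endcode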
+ * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. 
\p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param ui - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Creates a 1D or 2D CUDA array + * + * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. 
+ * The ::CUDA_ARRAY_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + CUarray_format Format; + unsigned int NumChannels; + } CUDA_ARRAY_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, and \p Height are the width, and height of the CUDA array (in + * elements); the CUDA array is one-dimensional if height is 0, two-dimensional + * otherwise; + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * Here are examples of CUDA array descriptions: + * + * Description for a CUDA array of 2048 floats: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 2048; + desc.Height = 1; + * \endcode + * + * Description for a 64 x 64 CUDA array of floats: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 64; + desc.Height = 64; + * \endcode + * + * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit + * float16's: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.FormatFlags = CU_AD_FORMAT_HALF; + desc.NumChannels = 4; + desc.Width = width; + desc.Height = height; + * \endcode + * + * Description for a \p width x \p height CUDA array of 16-bit elements, each + * of which is two 8-bit unsigned chars: + * \code + CUDA_ARRAY_DESCRIPTOR arrayDesc; + desc.FormatFlags = CU_AD_FORMAT_UNSIGNED_INT8; + desc.NumChannels = 2; + desc.Width = width; + desc.Height = height; + * \endcode + * + * \param pHandle - Returned array + * \param pAllocateArray - Array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocArray + */ +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); + +/** + * \brief Get a 1D or 2D CUDA array descriptor + * + * Returns in \p *pArrayDescriptor a descriptor containing information on the + * format and dimensions of the CUDA array \p hArray. It is useful for + * subroutines that have been passed a CUDA array, but need to know the CUDA + * array parameters for validation or other purposes. 
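+ *
+ * A minimal usage sketch (assuming a context is already current; error checking is omitted):
+ * \code
+      CUarray               hArray;
+      CUDA_ARRAY_DESCRIPTOR desc, queried;
+      desc.Format      = CU_AD_FORMAT_FLOAT;
+      desc.NumChannels = 1;
+      desc.Width       = 2048;
+      desc.Height      = 1;
+      cuArrayCreate(&hArray, &desc);
+      cuArrayGetDescriptor(&queried, hArray);   // queried now reports Width == 2048, Format == CU_AD_FORMAT_FLOAT
+      cuArrayDestroy(hArray);
+ * \endcode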
+ * + * \param pArrayDescriptor - Returned array descriptor + * \param hArray - Array to get descriptor of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaArrayGetInfo + */ +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +#endif /* __CUDA_API_VERSION >= 3020 */ + + +/** + * \brief Destroys a CUDA array + * + * Destroys the CUDA array \p hArray. + * + * \param hArray - Array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFreeArray + */ +CUresult CUDAAPI cuArrayDestroy(CUarray hArray); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Creates a 3D CUDA array + * + * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D array is allocated if only \p Depth extent is zero. + * - A 3D array is allocated if all three extents are non-zero. + * - A 1D layered CUDA array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. 
+ * - A 2D layered CUDA array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * of layers is determined by the depth extent. + * - A cubemap CUDA array is allocated if all three extents are non-zero and the + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six + * layers in memory is the same as that listed in ::CUarray_cubemap_face. + * - A cubemap layered CUDA array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. The first six layers represent the first + * cubemap, the next six layers form the second cubemap, and so on. + * + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * - ::Flags may be set to + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, + * \p Depth specifies the number of layers, not the depth of a 3D array. + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. + * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array + * to a surface reference. + * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be + * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, + * then \p Depth must be a multiple of six. + * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. + * Texture gather can only be performed on 2D CUDA arrays. + * + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute + * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH. + * + * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag + * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH + * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case. + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ * Valid extents per CUDA array type, given as {(width range in elements), (height range),
+ * (depth range)}; the first set must always be met, the second applies when
+ * ::CUDA_ARRAY3D_SURFACE_LDST is set:
+ *
+ * - 1D:
+ *     always:                           { (1,TEXTURE1D_WIDTH), 0, 0 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE1D_WIDTH), 0, 0 }
+ * - 2D:
+ *     always:                           { (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
+ * - 3D:
+ *     always:                           { (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ *                                       OR { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }
+ * - 1D Layered:
+ *     always:                           { (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }
+ * - 2D Layered:
+ *     always:                           { (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }
+ * - Cubemap:
+ *     always:                           { (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }
+ * - Cubemap Layered:
+ *     always:                           { (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }
+ * + * Here are examples of CUDA array descriptions: + * + * Description for a CUDA array of 2048 floats: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 2048; + desc.Height = 0; + desc.Depth = 0; + * \endcode + * + * Description for a 64 x 64 CUDA array of floats: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 64; + desc.Height = 64; + desc.Depth = 0; + * \endcode + * + * Description for a \p width x \p height x \p depth CUDA array of 64-bit, + * 4x16-bit float16's: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.FormatFlags = CU_AD_FORMAT_HALF; + desc.NumChannels = 4; + desc.Width = width; + desc.Height = height; + desc.Depth = depth; + * \endcode + * + * \param pHandle - Returned array + * \param pAllocateArray - 3D array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMalloc3DArray + */ +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); + +/** + * \brief Get a 3D CUDA array descriptor + * + * Returns in \p *pArrayDescriptor a descriptor containing information on the + * format and dimensions of the CUDA array \p hArray. It is useful for + * subroutines that have been passed a CUDA array, but need to know the CUDA + * array parameters for validation or other purposes. + * + * This function may be called on 1D and 2D arrays, in which case the \p Height + * and/or \p Depth members of the descriptor struct will be set to 0. 
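+ *
+ * A minimal usage sketch (assuming a context is already current; error checking is
+ * omitted and the extents are placeholders):
+ * \code
+      CUarray                 hArray;
+      CUDA_ARRAY3D_DESCRIPTOR desc, queried;
+      desc.Width       = 256;
+      desc.Height      = 256;
+      desc.Depth       = 8;                        // number of layers for a layered array
+      desc.Format      = CU_AD_FORMAT_FLOAT;
+      desc.NumChannels = 1;
+      desc.Flags       = CUDA_ARRAY3D_LAYERED;
+      cuArray3DCreate(&hArray, &desc);
+      cuArray3DGetDescriptor(&queried, hArray);    // recovers the extents, format and flags
+      cuArrayDestroy(hArray);
+ * \endcode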
+ * + * \param pArrayDescriptor - Returned 3D array descriptor + * \param hArray - 3D array to get descriptor of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaArrayGetInfo + */ +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 5000 + +/** + * \brief Creates a CUDA mipmapped array + * + * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. + * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is + * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. + * + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D mipmapped array is allocated if only \p Depth extent is zero. + * - A 3D mipmapped array is allocated if all three extents are non-zero. + * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. + * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * of layers is determined by the depth extent. + * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six + * layers in memory is the same as that listed in ::CUarray_cubemap_face. + * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. 
+ * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. The first six layers represent the first + * cubemap, the next six layers form the second cubemap, and so on. + * + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * - ::Flags may be set to + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, + * \p Depth specifies the number of layers, not the depth of a 3D array. + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of + * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to + * bind a mipmap level of the CUDA mipmapped array to a surface reference. + * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be + * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, + * then \p Depth must be a multiple of six. + * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. + * Texture gather can only be performed on 2D CUDA mipmapped arrays. + * + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute + * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ * Valid extents per CUDA array type, given as {(width range in elements), (height range),
+ * (depth range)}; the first set must always be met, the second applies when
+ * ::CUDA_ARRAY3D_SURFACE_LDST is set:
+ *
+ * - 1D:
+ *     always:                           { (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE1D_WIDTH), 0, 0 }
+ * - 2D:
+ *     always:                           { (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }
+ * - 3D:
+ *     always:                           { (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ *                                       OR { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), (1,TEXTURE3D_DEPTH_ALTERNATE) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }
+ * - 1D Layered:
+ *     always:                           { (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }
+ * - 2D Layered:
+ *     always:                           { (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), (1,TEXTURE2D_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), (1,SURFACE2D_LAYERED_LAYERS) }
+ * - Cubemap:
+ *     always:                           { (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }
+ * - Cubemap Layered:
+ *     always:                           { (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_LAYERS) }
+ *     with CUDA_ARRAY3D_SURFACE_LDST:   { (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_LAYERS) }
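+ *
+ * A minimal usage sketch (assuming a context is already current; error checking is
+ * omitted and the extents are placeholders):
+ * \code
+      CUmipmappedArray        hMipmapped;
+      CUarray                 hLevel0;
+      CUDA_ARRAY3D_DESCRIPTOR desc;
+      desc.Width       = 1024;
+      desc.Height      = 1024;
+      desc.Depth       = 0;                              // plain 2D mipmapped array
+      desc.Format      = CU_AD_FORMAT_FLOAT;
+      desc.NumChannels = 4;
+      desc.Flags       = 0;
+      cuMipmappedArrayCreate(&hMipmapped, &desc, 11);    // 11 == 1 + log2(1024) levels
+      cuMipmappedArrayGetLevel(&hLevel0, hMipmapped, 0); // CUarray view of the base level
+      cuMipmappedArrayDestroy(hMipmapped);
+ * \endcode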
+ * + * + * \param pHandle - Returned mipmapped array + * \param pMipmappedArrayDesc - mipmapped array descriptor + * \param numMipmapLevels - Number of mipmap levels + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuMipmappedArrayDestroy, + * ::cuMipmappedArrayGetLevel, + * ::cuArrayCreate, + * ::cudaMallocMipmappedArray + */ +CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); + +/** + * \brief Gets a mipmap level of a CUDA mipmapped array + * + * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level + * of the CUDA mipmapped array \p hMipmappedArray. + * + * If \p level is greater than the maximum number of levels in this mipmapped array, + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param pLevelArray - Returned mipmap level CUDA array + * \param hMipmappedArray - CUDA mipmapped array + * \param level - Mipmap level + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::cuMipmappedArrayCreate, + * ::cuMipmappedArrayDestroy, + * ::cuArrayCreate, + * ::cudaGetMipmappedArrayLevel + */ +CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); + +/** + * \brief Destroys a CUDA mipmapped array + * + * Destroys the CUDA mipmapped array \p hMipmappedArray. + * + * \param hMipmappedArray - Mipmapped array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa + * ::cuMipmappedArrayCreate, + * ::cuMipmappedArrayGetLevel, + * ::cuArrayCreate, + * ::cudaFreeMipmappedArray + */ +CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); + +#endif /* __CUDA_API_VERSION >= 5000 */ + +/** @} */ /* END CUDA_MEM */ + +/** + * \defgroup CUDA_UNIFIED Unified Addressing + * + * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the unified addressing functions of the + * low-level CUDA driver application programming interface. + * + * @{ + * + * \section CUDA_UNIFIED_overview Overview + * + * CUDA devices can share a unified address space with the host. + * For these devices there is no distinction between a device + * pointer and a host pointer -- the same pointer value may be + * used to access memory from the host program and from a kernel + * running on the device (with exceptions enumerated below). + * + * \section CUDA_UNIFIED_support Supported Platforms + * + * Whether or not a device supports unified addressing may be + * queried by calling ::cuDeviceGetAttribute() with the device + * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. + * + * Unified addressing is automatically enabled in 64-bit processes + * + * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values + * + * It is possible to look up information about the memory which backs a + * pointer value. 
For instance, one may want to know if a pointer points + * to host or device memory. As another example, in the case of device + * memory, one may want to know on which CUDA device the memory + * resides. These properties may be queried using the function + * ::cuPointerGetAttribute() + * + * Since pointers are unique, it is not necessary to specify information + * about the pointers specified to the various copy functions in the + * CUDA API. The function ::cuMemcpy() may be used to perform a copy + * between two pointers, ignoring whether they point to host or device + * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() + * unnecessary for devices supporting unified addressing). For + * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be + * used to specify that the CUDA driver should infer the location of the + * pointer from its value. + * + * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory + * + * All host memory allocated in all contexts using ::cuMemAllocHost() and + * ::cuMemHostAlloc() is always directly accessible from all contexts on + * all devices that support unified addressing. This is the case regardless + * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and + * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. + * + * The pointer value through which allocated host memory may be accessed + * in kernels on all devices that support unified addressing is the same + * as the pointer value through which that memory is accessed on the host, + * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device + * pointer for these allocations. + * + * Note that this is not the case for memory allocated using the flag + * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. + * + * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory + * + * Upon enabling direct access from a context that supports unified addressing + * to another peer context that supports unified addressing using + * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using + * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible + * by the current context. The device pointer value through + * which any peer memory may be accessed in the current context + * is the same pointer value through which that memory may be + * accessed in the peer context. + * + * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing + * + * Not all memory may be accessed on devices through the same pointer + * value through which they are accessed on the host. These exceptions + * are host memory registered using ::cuMemHostRegister() and host memory + * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these + * exceptions, there exists a distinct host and device address for the + * memory. The device address is guaranteed to not overlap any valid host + * pointer range and is guaranteed to have the same value across all + * contexts that support unified addressing. + * + * This device address may be queried using ::cuMemHostGetDevicePointer() + * when a context using unified addressing is current. Either the host + * or the unified device pointer value may be used to refer to this memory + * through ::cuMemcpy() and similar functions using the + * ::CU_MEMORYTYPE_UNIFIED memory type. 
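+ *
+ * A minimal sketch of the lookup described above (assuming a 64-bit process with a
+ * context current on a device that supports unified addressing; includes and error
+ * checking are omitted and the buffer size is a placeholder):
+ * \code
+      CUdeviceptr   d_buf;
+      unsigned char h_buf[4096];
+      unsigned int  memType = 0;
+      cuMemAlloc(&d_buf, sizeof(h_buf));
+      cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, d_buf);
+      // memType is now CU_MEMORYTYPE_DEVICE
+      cuMemcpy(d_buf, (CUdeviceptr)(uintptr_t)h_buf, sizeof(h_buf)); // direction inferred from the pointer values
+      cuMemFree(d_buf);
+ * \endcode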
+ * + */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Returns information about a pointer + * + * The supported attributes are: + * + * - ::CU_POINTER_ATTRIBUTE_CONTEXT: + * + * Returns in \p *data the ::CUcontext in which \p ptr was allocated or + * registered. + * The type of \p data must be ::CUcontext *. + * + * If \p ptr was not allocated by, mapped by, or registered with + * a ::CUcontext which uses unified virtual addressing then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: + * + * Returns in \p *data the physical memory type of the memory that + * \p ptr addresses as a ::CUmemorytype enumerated value. + * The type of \p data must be unsigned int. + * + * If \p ptr addresses device memory then \p *data is set to + * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the + * memory resides is the ::CUdevice of the ::CUcontext returned by the + * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. + * + * If \p ptr addresses host memory then \p *data is set to + * ::CU_MEMORYTYPE_HOST. + * + * If \p ptr was not allocated by, mapped by, or registered with + * a ::CUcontext which uses unified virtual addressing then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * If the current ::CUcontext does not support unified virtual + * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER: + * + * Returns in \p *data the device pointer value through which + * \p ptr may be accessed by kernels running in the current + * ::CUcontext. + * The type of \p data must be CUdeviceptr *. + * + * If there exists no device pointer value through which + * kernels running in the current ::CUcontext may access + * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * If there is no current ::CUcontext then + * ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * Except in the exceptional disjoint addressing cases discussed + * below, the value returned in \p *data will equal the input + * value \p ptr. + * + * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER: + * + * Returns in \p *data the host pointer value through which + * \p ptr may be accessed by by the host program. + * The type of \p data must be void **. + * If there exists no host pointer value through which + * the host program may directly access \p ptr then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Except in the exceptional disjoint addressing cases discussed + * below, the value returned in \p *data will equal the input + * value \p ptr. + * + * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS: + * + * Returns in \p *data two tokens for use with the nv-p2p.h Linux + * kernel interface. \p data must be a struct of type + * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS. + * + * \p ptr must be a pointer to memory obtained from :cuMemAlloc(). + * Note that p2pToken and vaSpaceToken are only valid for the + * lifetime of the source allocation. A subsequent allocation at + * the same address may return completely different tokens. + * Querying this attribute has a side effect of setting the attribute + * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that + * \p ptr points to. + * + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: + * + * A boolean attribute which when set, ensures that synchronous memory operations + * initiated on the region of memory that \p ptr points to will always synchronize. 
+ * See further documentation in the section titled "API synchronization behavior" + * to learn more about cases when synchronous memory operations can + * exhibit asynchronous behavior. + * + * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID: + * + * Returns in \p *data a buffer ID which is guaranteed to be unique within the process. + * \p data must point to an unsigned long long. + * + * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API. + * Every memory allocation from any of the CUDA memory allocation APIs will + * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs + * from previous freed allocations. IDs are only unique within a single process. + * + * + * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: + * + * Returns in \p *data a boolean that indicates whether the pointer points to + * managed memory or not. + * + * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: + * + * Returns in \p *data an integer representing a device ordinal of a device against + * which the memory was allocated or registered. + * + * \par + * + * Note that for most allocations in the unified virtual address space + * the host and device pointer for accessing the allocation will be the + * same. The exceptions to this are + * - user memory registered using ::cuMemHostRegister + * - host memory allocated using ::cuMemHostAlloc with the + * ::CU_MEMHOSTALLOC_WRITECOMBINED flag + * For these types of allocation there will exist separate, disjoint host + * and device addresses for accessing the allocation. In particular + * - The host address will correspond to an invalid unmapped device address + * (which will result in an exception if accessed from the device) + * - The device address will correspond to an invalid unmapped host address + * (which will result in an exception if accessed from the host). + * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host + * and device addresses from either address. + * + * \param data - Returned pointer attribute value + * \param attribute - Pointer attribute to query + * \param ptr - Pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuPointerSetAttribute, + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuMemAllocHost, + * ::cuMemFreeHost, + * ::cuMemHostAlloc, + * ::cuMemHostRegister, + * ::cuMemHostUnregister, + * ::cudaPointerGetAttributes + */ +CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 8000 +/** + * \brief Prefetches memory to the specified destination device + * + * Prefetches memory to the specified destination device. \p devPtr is the + * base device pointer of the memory to be prefetched and \p dstDevice is the + * destination device. \p count specifies the number of bytes to copy. \p hStream + * is the stream in which the operation is enqueued. The memory range must refer + * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + * + * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If + * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + * must be non-zero. 
Additionally, \p hStream must be associated with a device that has a + * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * + * The start address and end address of the memory range will be rounded down and rounded up + * respectively to be aligned to CPU page size before the prefetch operation is enqueued + * in the stream. + * + * If no physical memory has been allocated for this region, then this memory region + * will be populated and mapped on the destination device. If there's insufficient + * memory to prefetch the desired region, the Unified Memory driver may evict pages from other + * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + * + * By default, any mappings to the previous location of the migrated pages are removed and + * mappings for the new location are only setup on \p dstDevice. The exact behavior however + * also depends on the settings applied to this memory range via ::cuMemAdvise as described + * below: + * + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + * then that subset will create a read-only copy of the pages on \p dstDevice. + * + * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the + * preferred location of any pages in the memory range. + * + * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + * then mappings to those pages from all the appropriate processors are updated to + * refer to the new location if establishing such a mapping is possible. Otherwise, + * those mappings are cleared. + * + * Note that this API is not required for functionality and only serves to improve performance + * by allowing the application to migrate data to a suitable location before it is accessed. + * Memory accesses to this range are always coherent and are allowed even when the data is + * actively being migrated. + * + * Note that this function is asynchronous with respect to the host and all work + * on other devices. + * + * \param devPtr - Pointer to be prefetched + * \param count - Size in bytes + * \param dstDevice - Destination device to prefetch to + * \param hStream - Stream to enqueue prefetch operation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, + * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, + * ::cudaMemPrefetchAsync + */ +CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); + +/** + * \brief Advise about the usage of a given memory range + * + * Advise the Unified Memory subsystem about the usage pattern for the memory range + * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + * range will be rounded down and rounded up respectively to be aligned to CPU page size before the + * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + * memory provided it represents a valid, host-accessible region of memory and all additional constraints + * imposed by \p advice as outlined below are also satisfied. 
Specifying an invalid system-allocated pageable + * memory range results in an error being returned. + * + * The \p advice parameter can take the following values: + * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + * from and only occasionally written to. Any read accesses from any processor to this region will create a + * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + * is called on this region, it will create a read-only copy of the data on the destination processor. + * If any processor writes to this region, all copies of the corresponding page will be invalidated + * except for the one where the write occurred. The \p device argument is ignored for this advice. + * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * Also, if a context is created on a device that does not have the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + * all such contexts are destroyed. + * If the memory region refers to valid system-allocated pageable memory, then the accessing device must + * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + * will not create a read-only copy when that device accesses this memory region. + * + * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + * copies of the data will be collapsed into a single copy. The location for the collapsed + * copy will be the preferred location if the page has a preferred location and one of the read-duplicated + * copies was resident at that location. Otherwise, the location chosen is arbitrary. + * + * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location + * does not cause data to migrate to that location immediately. Instead, it guides the migration policy + * when a fault occurs on that memory region. If the data is already in its preferred location and the + * faulting processor can establish a mapping without requiring the data to be migrated, then + * data migration will be avoided. On the other hand, if the data is not in its preferred location + * or if a direct mapping cannot be established, then it will be migrated to the processor accessing + * it. It is important to note that setting the preferred location does not prevent data prefetching + * done using ::cuMemPrefetchAsync. + * Having a preferred location can override the page thrash detection and resolution logic in the Unified + * Memory driver. 
Normally, if a page is detected to be constantly thrashing between for example host and device + * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + * policies associated with that advice will override the policies of this advice, unless read accesses from + * \p device will not result in a read-only copy being created on that device as outlined in description for + * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. Note however that this behavior may change in the future. + * + * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + * and changes the preferred location to none. + * + * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then + * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + * This advice does not cause data migration and has no impact on the location of the data per se. Instead, + * it causes the data to always be mapped in the specified processor's page tables, as long as the + * location of the data permits a mapping to be established. If the data gets migrated for any reason, + * the mappings are updated accordingly. + * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + * over to the other GPUs is not as important because the accesses are infrequent and the overhead of + * migration may be too high. But preventing faults can still help improve performance, and so having + * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + * page in host memory. + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + * policies associated with that advice will override the policies of this advice. Additionally, if the + * preferred location of this memory region or any subset of it is also \p device, then the policies + * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. 
Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. + * + * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. + * + * \param devPtr - Pointer to memory to set the advice for + * \param count - Size in bytes of the memory range + * \param advice - Advice to be applied for the specified memory range + * \param device - Device to apply the advice for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, + * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, + * ::cudaMemAdvise + */ +CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); + +/** + * \brief Query an attribute of a given memory range + * + * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The + * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via + * __managed__ variables. + * + * The \p attribute parameter can take the following values: + * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted + * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given + * memory range have read-duplication enabled, or 0 otherwise. + * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be + * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device + * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU + * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID + * if either all the pages don't have the same preferred location or some of the pages don't have a + * preferred location at all. Note that the actual location of the pages in the memory range at the time of + * the query may be different from the preferred location. + * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted + * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned + * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range. + * If any device does not have that advice set for the entire memory range, that device will not be included. + * If \p data is larger than the number of devices that have that advice set for that memory range, + * CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12 + * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be + * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. 
If \p data is smaller than the number of devices that have + * that advice set, then only as many devices will be returned as can fit in the array. There is no + * guarantee on which specific devices will be returned, however. + * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be + * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location + * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be + * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU + * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not + * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the + * last location that the applicaton requested to prefetch the memory range to. It gives no indication as to + * whether the prefetch operation to that location has completed or even begun. + * + * \param data - A pointers to a memory location where the result + * of each attribute query will be written to. + * \param dataSize - Array containing the size of data + * \param attribute - The attribute to query + * \param devPtr - Start of the range to query + * \param count - Size of the range to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync, + * ::cuMemAdvise, + * ::cudaMemRangeGetAttribute + */ +CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count); + +/** + * \brief Query attributes of a given memory range. + * + * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The + * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via + * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes + * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries. + * The results of the query will be stored in \p data. + * + * The list of supported attributes are given below. Please refer to ::cuMemRangeGetAttribute for + * attribute descriptions and restrictions. + * + * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY + * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION + * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY + * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION + * + * \param data - A two-dimensional array containing pointers to memory + * locations where the result of each attribute query will be written to. 
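/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * how the memory-advice and range-attribute APIs documented above are
 * typically combined. Assumes a CUDA context is already current and that the
 * device supports managed memory; error handling elided, N is a placeholder
 * buffer size.
 */
#include <cuda.h>

static void advise_and_query(CUdevice dev, size_t N)
{
    CUdeviceptr buf;
    cuMemAllocManaged(&buf, N, CU_MEM_ATTACH_GLOBAL);

    /* Prefer to keep the pages resident on `dev` and read-duplicate them. */
    cuMemAdvise(buf, N, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, dev);
    cuMemAdvise(buf, N, CU_MEM_ADVISE_SET_READ_MOSTLY, dev);

    /* Query the preferred location back; CU_DEVICE_CPU or CU_DEVICE_INVALID
     * are also possible results, as described above. */
    int preferred = 0;
    cuMemRangeGetAttribute(&preferred, sizeof(preferred),
                           CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, buf, N);

    cuMemFree(buf);
}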
+ * \param dataSizes - Array containing the sizes of each result + * \param attributes - An array of attributes to query + * (numAttributes and the number of attributes in this array should match) + * \param numAttributes - Number of attributes to query + * \param devPtr - Start of the range to query + * \param count - Size of the range to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise + * ::cuMemPrefetchAsync, + * ::cudaMemRangeGetAttributes + */ +CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count); +#endif /* __CUDA_API_VERSION >= 8000 */ + +#if __CUDA_API_VERSION >= 6000 +/** + * \brief Set attributes on a previously allocated memory region + * + * The supported attributes are: + * + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: + * + * A boolean attribute that can either be set (1) or unset (0). When set, + * the region of memory that \p ptr points to is guaranteed to always synchronize + * memory operations that are synchronous. If there are some previously initiated + * synchronous memory operations that are pending when this attribute is set, the + * function does not return until those memory operations are complete. + * See further documentation in the section titled "API synchronization behavior" + * to learn more about cases when synchronous memory operations can + * exhibit asynchronous behavior. + * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set. + * + * \param value - Pointer to memory containing the value to be set + * \param attribute - Pointer attribute to set + * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuPointerGetAttribute, + * ::cuPointerGetAttributes, + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuMemAllocHost, + * ::cuMemFreeHost, + * ::cuMemHostAlloc, + * ::cuMemHostRegister, + * ::cuMemHostUnregister + */ +CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); +#endif /* __CUDA_API_VERSION >= 6000 */ + +#if __CUDA_API_VERSION >= 7000 +/** + * \brief Returns information about a pointer. + * + * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): + * + * - ::CU_POINTER_ATTRIBUTE_CONTEXT + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE + * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER + * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS + * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID + * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED + * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL + * + * \param numAttributes - Number of attributes to query + * \param attributes - An array of attributes to query + * (numAttributes and the number of attributes in this array should match) + * \param data - A two-dimensional array containing pointers to memory + * locations where the result of each attribute query will be written to. 
+ * \param ptr - Pointer to query + * + * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr + * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values + * and CUDA_SUCCESS is returned. + * + * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA + * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuPointerGetAttribute, + * ::cuPointerSetAttribute, + * ::cudaPointerGetAttributes + */ +CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); +#endif /* __CUDA_API_VERSION >= 7000 */ + +/** @} */ /* END CUDA_UNIFIED */ + +/** + * \defgroup CUDA_STREAM Stream Management + * + * ___MANBRIEF___ stream management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Create a stream + * + * Creates a stream and returns a handle in \p phStream. The \p Flags argument + * determines behaviors of the stream. Valid values for \p Flags are: + * - ::CU_STREAM_DEFAULT: Default stream creation flag. + * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created + * stream may run concurrently with work in stream 0 (the NULL stream), and that + * the created stream should perform no implicit synchronization with stream 0. + * + * \param phStream - Returned newly created stream + * \param Flags - Parameters for stream creation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags + */ +CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); + +/** + * \brief Create a stream with the given priority + * + * Creates a stream with the specified priority and returns a handle in \p phStream. + * This API alters the scheduler priority of work in the stream. Work in a higher + * priority stream may preempt work already executing in a low priority stream. + * + * \p priority follows a convention where lower numbers represent higher priorities. + * '0' represents default priority. The range of meaningful numerical priorities can + * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is + * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * it will automatically be clamped to the lowest or the highest number in the range. + * + * \param phStream - Returned newly created stream + * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of + * valid flags + * \param priority - Stream priority. Lower numbers represent higher priorities. + * See ::cuCtxGetStreamPriorityRange for more information about + * meaningful stream priorities that can be passed. 
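/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * creating a default stream and a high-priority, non-blocking stream with the
 * APIs documented above. Assumes a current context; errors are ignored for
 * brevity.
 */
#include <cuda.h>

static void make_streams(CUstream *normal, CUstream *high_prio)
{
    cuStreamCreate(normal, CU_STREAM_DEFAULT);

    /* Lower numbers mean higher priority; query the meaningful range first. */
    int least = 0, greatest = 0;
    cuCtxGetStreamPriorityRange(&least, &greatest);
    cuStreamCreateWithPriority(high_prio, CU_STREAM_NON_BLOCKING, greatest);
}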
+ * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \note Stream priorities are supported only on GPUs + * with compute capability 3.5 or higher. + * + * \note In the current implementation, only compute kernels launched in + * priority streams are affected by the stream's priority. Stream priorities have + * no effect on host-to-device and device-to-host memory operations. + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreateWithPriority + */ +CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); + + +/** + * \brief Query the priority of a given stream + * + * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the priority in \p priority. Note that if the stream was created with a + * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * this function returns the clamped priority. + * See ::cuStreamCreateWithPriority for details about priority clamping. + * + * \param hStream - Handle to the stream to be queried + * \param priority - Pointer to a signed integer in which the stream's priority is returned + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamCreateWithPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags, + * ::cudaStreamGetPriority + */ +CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + +/** + * \brief Query the flags of a given stream + * + * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the flags in \p flags. + * + * \param hStream - Handle to the stream to be queried + * \param flags - Pointer to an unsigned integer in which the stream's flags are returned + * The value returned in \p flags is a logical 'OR' of all flags that + * were used while creating this stream. See ::cuStreamCreate for the list + * of valid flags + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cudaStreamGetFlags + */ +CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + +#if __CUDA_API_VERSION >= 9020 + +/** + * \brief Query the context associated with a stream + * + * Returns the CUDA context that the stream is associated with. + * + * The stream handle \p hStream can refer to any of the following: + *

+ * - a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
+ *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   The returned context is the context that was active in the calling thread when the
+ *   stream was created. Passing an invalid handle will result in undefined behavior.
+ * - any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+ *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+ *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
+ *   Specifying any of the special handles will return the context current to the
+ *   calling thread. If no context is current to the calling thread,
+ *   ::CUDA_ERROR_INVALID_CONTEXT is returned.
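/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * recovering the context a stream belongs to via ::cuStreamGetCtx, e.g. when a
 * stream handle is handed in by another library. Assumes CUDA >= 9.2 and a
 * valid stream; errors are ignored.
 */
#include <cuda.h>

static CUcontext context_of(CUstream s)
{
    CUcontext ctx = NULL;
    cuStreamGetCtx(s, &ctx);   /* for the NULL/legacy/per-thread streams this
                                  returns the context current to this thread */
    return ctx;
}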
+ * + * \param hStream - Handle to the stream to be queried + * \param pctx - Returned context associated with the stream + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags + */ +CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); + +#endif /* __CUDA_API_VERSION >= 9020 */ + +/** + * \brief Make a compute stream wait on an event + * + * Makes all future work submitted to \p hStream wait for all work captured in + * \p hEvent. See ::cuEventRecord() for details on what is captured by an event. + * The synchronization will be performed efficiently on the device when applicable. + * \p hEvent may be from a different context or device than \p hStream. + * + * \param hStream - Stream to wait + * \param hEvent - Event to wait on (may not be NULL) + * \param Flags - Parameters for the operation (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuEventRecord, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cuStreamDestroy, + * ::cudaStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + +/** + * \brief Add a callback to a compute stream + * + * \note This function is slated for eventual deprecation and removal. If + * you do not require the callback to execute in case of a device error, + * consider using ::cuLaunchHostFunc. Additionally, this function is not + * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike + * ::cuLaunchHostFunc. + * + * Adds a callback to be called on the host after all currently enqueued + * items in the stream have completed. For each + * cuStreamAddCallback call, the callback will be executed exactly once. + * The callback will block later work in the stream until it is finished. + * + * The callback may be passed ::CUDA_SUCCESS or an error code. In the event + * of a device error, all subsequently executed callbacks will receive an + * appropriate ::CUresult. + * + * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API + * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any + * synchronization that may depend on outstanding device work or other callbacks + * that are not mandated to run earlier. Callbacks without a mandated order + * (in independent streams) execute in undefined order and may be serialized. + * + * For the purposes of Unified Memory, callback execution makes a number of + * guarantees: + *
+ * - The callback stream is considered idle for the duration of the
+ *   callback. Thus, for example, a callback may always use memory attached
+ *   to the callback stream.
+ * - The start of execution of a callback has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the callback. It thus synchronizes streams which have been "joined"
+ *   prior to the callback.
+ * - Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed. Thus, for example, a callback might use global attached
+ *   memory even if work has been added to another stream, if the work has
+ *   been ordered behind the callback with an event.
+ * - Completion of a callback does not cause a stream to become
+ *   active except as described above. The callback stream will remain idle
+ *   if no device work follows the callback, and will remain idle across
+ *   consecutive callbacks without device work in between. Thus, for example,
+ *   stream synchronization can be done by signaling from a callback at the
+ *   end of the stream.
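/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * a minimal host callback enqueued with ::cuStreamAddCallback. The callback
 * must not call into the CUDA API; here it only reports the status. Assumes a
 * valid stream; error handling elided.
 */
#include <cuda.h>
#include <stdio.h>

static void CUDA_CB on_done(CUstream hStream, CUresult status, void *userData)
{
    (void)hStream;
    (void)userData;
    printf("preceding stream work finished with CUresult %d\n", (int)status);
}

static void enqueue_notification(CUstream stream)
{
    /* Runs on the host once all previously enqueued work in `stream` is done. */
    cuStreamAddCallback(stream, on_done, NULL, 0);
}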
+ * + * \param hStream - Stream to add callback to + * \param callback - The function to call once preceding stream operations are complete + * \param userData - User specified data to be passed to the callback function + * \param flags - Reserved for future use, must be 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cuStreamAttachMemAsync, + * ::cuStreamLaunchHostFunc, + * ::cudaStreamAddCallback + */ +CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + +#if __CUDA_API_VERSION >= 10000 + +/** + * \brief Begins graph capture on a stream + * + * Begin graph capture on \p hStream. When a stream is in capture mode, all operations + * pushed into the stream will not be executed, but will instead be captured into + * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated + * if \p stream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which + * it was initiated, and it may only be initiated if the stream is not already in capture + * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id + * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo. + * + * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be + * called on this stream from the same thread. + * + * \param hStream - Stream in which to initiate capture + * \param mode - Controls the interaction of this capture sequence with other API + * calls that are potentially unsafe. For more details see + * ::cuThreadExchangeStreamCaptureMode. + * + * \note Kernels captured using this API must not use texture and surface references. + * Reading or writing through any texture or surface reference is undefined + * behavior. This restriction does not apply to texture and surface objects. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamIsCapturing, + * ::cuStreamEndCapture, + * ::cuThreadExchangeStreamCaptureMode + */ +CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); + +#endif /* __CUDA_API_VERSION >= 10000 */ +#if __CUDA_API_VERSION >= 10010 + +/** + * \brief Swaps the stream capture interaction mode for a thread + * + * Sets the calling thread's stream capture interaction mode to the value contained + * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To + * facilitate deterministic behavior across function or module boundaries, callers + * are encouraged to use this API in a push-pop fashion: \code + CUstreamCaptureMode mode = desiredMode; + cuThreadExchangeStreamCaptureMode(&mode); + ... + cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode + * \endcode + * + * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call + * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is + * not enqueued asynchronously to a stream, and is not observed by stream capture. 
+ * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture + * depended on the allocation being replayed whenever the graph is launched, the + * captured graph would be invalid. + * + * Therefore, stream capture places restrictions on API calls that can be made within + * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This + * behavior can be controlled via this API and flags to ::cuStreamBeginCapture. + * + * A thread's mode is one of the following: + * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has + * an ongoing capture sequence that was not initiated with + * \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread + * has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, + * this thread is prohibited from potentially unsafe API calls. + * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture + * sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited + * from potentially unsafe API calls. Concurrent capture sequences in other threads + * are ignored. + * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially + * unsafe API calls. Note that the thread is still prohibited from API calls which + * necessarily conflict with stream capture, for example, attempting ::cuEventQuery + * on an event that was last recorded inside a capture sequence. + * + * \param mode - Pointer to mode value to swap with the current mode + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuStreamBeginCapture + */ +CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode); + +#endif /* __CUDA_API_VERSION >= 10010 */ +#if __CUDA_API_VERSION >= 10000 + +/** + * \brief Ends capture on a stream, returning the captured graph + * + * End capture on \p hStream, returning the captured graph via \p phGraph. + * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture. + * If capture was invalidated, due to a violation of the rules of stream capture, then + * a NULL graph will be returned. + * + * If the \p mode argument to ::cuStreamBeginCapture was not + * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as + * ::cuStreamBeginCapture. + * + * \param hStream - Stream to query + * \param phGraph - The captured graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamBeginCapture, + * ::cuStreamIsCapturing + */ +CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); + +/** + * \brief Returns a stream's capture status + * + * Return the capture status of \p hStream via \p captureStatus. After a successful + * call, \p *captureStatus will contain one of the following: + * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing. + * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing. + * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error + * has invalidated the capture sequence. The capture sequence must be terminated + * with ::cuStreamEndCapture on the stream where it was initiated in order to + * continue using \p hStream. 
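/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * recording asynchronously enqueued work into a graph with
 * ::cuStreamBeginCapture / ::cuStreamEndCapture (CUDA >= 10.0). Instantiating
 * and launching the captured graph is out of scope here. Assumes a
 * non-default stream and valid device pointers; errors are ignored.
 */
#include <cuda.h>

static CUgraph capture_copy(CUstream stream, CUdeviceptr dst, CUdeviceptr src, size_t bytes)
{
    CUgraph graph = NULL;
    cuStreamBeginCapture(stream, CU_STREAM_CAPTURE_MODE_GLOBAL);

    /* Everything enqueued here is captured into the graph, not executed. */
    cuMemcpyAsync(dst, src, bytes, stream);

    CUstreamCaptureStatus st;
    cuStreamIsCapturing(stream, &st);      /* expected: CU_STREAM_CAPTURE_STATUS_ACTIVE */

    cuStreamEndCapture(stream, &graph);    /* NULL if the capture was invalidated */
    return graph;
}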
+ * + * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while + * a blocking stream in the same context is capturing, it will return + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified + * after the call. The blocking stream capture is not invalidated. + * + * When a blocking stream is capturing, the legacy stream is in an + * unusable state until the blocking stream capture is terminated. The legacy + * stream is not supported for stream capture, but attempted use would have an + * implicit dependency on the capturing stream(s). + * + * \param hStream - Stream to query + * \param captureStatus - Returns the stream's capture status + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamBeginCapture, + * ::cuStreamEndCapture + */ +CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); + +#endif /* __CUDA_API_VERSION >= 10000 */ + +#if __CUDA_API_VERSION >= 10010 + +/** + * \brief Query capture status of a stream + * + * Query the capture status of a stream and and get an id for + * the capture sequence, which is unique over the lifetime of the process. + * + * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created + * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT. + * + * A valid id is returned only if both of the following are true: + * - the call returns CUDA_SUCCESS + * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT + * \notefnerr + * + * \sa + * ::cuStreamBeginCapture, + * ::cuStreamIsCapturing + */ + CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus, cuuint64_t *id); + +#endif /* __CUDA_API_VERSION >= 10010 */ + +#if __CUDA_API_VERSION >= 6000 + +/** + * \brief Attach memory to a stream asynchronously + * + * Enqueues an operation in \p hStream to specify stream association of + * \p length bytes of memory starting from \p dptr. This function is a + * stream-ordered operation, meaning that it is dependent on, and will + * only take effect when, previous work in stream has completed. Any + * previous association is automatically replaced. + * + * \p dptr must point to one of the following types of memories: + * - managed memory declared using the __managed__ keyword or allocated with + * ::cuMemAllocManaged. + * - a valid host-accessible region of system-allocated pageable memory. This + * type of memory may only be specified if the device associated with the + * stream reports a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + * + * For managed allocations, \p length must be either zero or the entire + * allocation's size. Both indicate that the entire allocation's stream + * association is being changed. Currently, it is not possible to change stream + * association for a portion of a managed allocation. + * + * For pageable host allocations, \p length must be non-zero. + * + * The stream association is specified using \p flags which must be + * one of ::CUmemAttach_flags. + * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed + * by any stream on any device. 
+ * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee + * that it won't access the memory on the device from any stream on a device that + * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with + * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, + * the program makes a guarantee that it will only access the memory on the device + * from \p hStream. It is illegal to attach singly to the NULL stream, because the + * NULL stream is a virtual global stream and not a specific stream. An error will + * be returned in this case. + * + * When memory is associated with a single stream, the Unified Memory system will + * allow CPU access to this memory region so long as all operations in \p hStream + * have completed, regardless of whether other streams are active. In effect, + * this constrains exclusive ownership of the managed memory region by + * an active GPU to per-stream activity instead of whole-GPU activity. + * + * Accessing memory on the device from streams that are not associated with + * it will produce undefined results. No error checking is performed by the + * Unified Memory system to ensure that kernels launched into other streams + * do not access this region. + * + * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync + * via events, synchronization or other means to ensure legal access to memory + * at all times. Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p hStream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cuMemAllocManaged. For __managed__ variables, the default + * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. + * + * \param hStream - Stream in which to enqueue the attach operation + * \param dptr - Pointer to memory (must be a pointer to managed memory or + * to a valid host-accessible region of system-allocated + * pageable memory) + * \param length - Length of memory + * \param flags - Must be one of ::CUmemAttach_flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cudaStreamAttachMemAsync + */ +CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + +#endif /* __CUDA_API_VERSION >= 6000 */ + +/** + * \brief Determine status of a compute stream + * + * Returns ::CUDA_SUCCESS if all operations in the stream specified by + * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. + * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuStreamSynchronize(). 
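/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * associating a managed allocation with a single stream via
 * ::cuStreamAttachMemAsync so the CPU may touch it as soon as that stream is
 * idle, which matters most on devices without concurrent managed access.
 * Error handling elided.
 */
#include <cuda.h>

static void attach_to_stream(CUstream stream, size_t bytes)
{
    CUdeviceptr managed;
    cuMemAllocManaged(&managed, bytes, CU_MEM_ATTACH_GLOBAL);

    /* length 0 means "the entire allocation" for managed memory. */
    cuStreamAttachMemAsync(stream, managed, 0, CU_MEM_ATTACH_SINGLE);

    /* ... enqueue kernels that use `managed` into `stream` ... */

    cuStreamSynchronize(stream);   /* after this the host may access `managed` */
    cuMemFree(managed);
}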
+ * + * \param hStream - Stream to query status of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamQuery + */ +CUresult CUDAAPI cuStreamQuery(CUstream hStream); + +/** + * \brief Wait until a stream's tasks are completed + * + * Waits until the device has completed all operations in the stream specified + * by \p hStream. If the context was created with the + * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the + * stream is finished with all of its tasks. + * + * \param hStream - Stream to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamDestroy, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamAddCallback, + * ::cudaStreamSynchronize + */ +CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Destroys a stream + * + * Destroys the stream specified by \p hStream. + * + * In case the device is still doing work in the stream \p hStream + * when ::cuStreamDestroy() is called, the function will return immediately + * and the resources associated with \p hStream will be released automatically + * once the device has completed all work in \p hStream. + * + * \param hStream - Stream to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamDestroy + */ +CUresult CUDAAPI cuStreamDestroy(CUstream hStream); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** @} */ /* END CUDA_STREAM */ + + +/** + * \defgroup CUDA_EVENT Event Management + * + * ___MANBRIEF___ event management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the event management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates an event + * + * Creates an event *phEvent for the current context with the flags specified via + * \p Flags. Valid flags include: + * - ::CU_EVENT_DEFAULT: Default event creation flag. + * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking + * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on + * an event created with this flag will block until the event has actually + * been recorded. + * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best + * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). + * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an + * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must + * be specified along with ::CU_EVENT_DISABLE_TIMING. 
+ * + * \param phEvent - Returns newly created event + * \param Flags - Event creation flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventCreate, + * ::cudaEventCreateWithFlags + */ +CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); + +/** + * \brief Records an event + * + * Captures in \p hEvent the contents of \p hStream at the time of this call. + * \p hEvent and \p hStream must be from the same context. + * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then + * examine or wait for completion of the work that was captured. Uses of + * \p hStream after this call do not modify \p hEvent. See note on default + * stream behavior for what is captured in the default case. + * + * ::cuEventRecord() can be called multiple times on the same event and + * will overwrite the previously captured state. Other APIs such as + * ::cuStreamWaitEvent() use the most recently captured state at the time + * of the API call, and are not affected by later calls to + * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an + * event represents an empty set of work, so for example ::cuEventQuery() + * would return ::CUDA_SUCCESS. + * + * \param hEvent - Event to record + * \param hStream - Stream to record event for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \note_null_stream + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuStreamWaitEvent, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventRecord + */ +CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + +/** + * \brief Queries an event's status + * + * Queries the status of all work currently captured by \p hEvent. See + * ::cuEventRecord() for details on what is captured by an event. + * + * Returns ::CUDA_SUCCESS if all captured work has been completed, or + * ::CUDA_ERROR_NOT_READY if any captured work is incomplete. + * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuEventSynchronize(). + * + * \param hEvent - Event to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_READY + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventQuery + */ +CUresult CUDAAPI cuEventQuery(CUevent hEvent); + +/** + * \brief Waits for an event to complete + * + * Waits until the completion of all work currently captured in \p hEvent. + * See ::cuEventRecord() for details on what is captured by an event. + * + * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC + * flag will cause the calling CPU thread to block until the event has + * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has + * not been set, then the CPU thread will busy-wait until the event has + * been completed by the device. 
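/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * timing a region of stream work with the event APIs documented above and
 * ::cuEventElapsedTime (documented just below). Assumes a valid stream and
 * that the caller enqueues the work to be timed between the two records;
 * errors are ignored.
 */
#include <cuda.h>

static float time_region_ms(CUstream stream)
{
    CUevent start, stop;
    cuEventCreate(&start, CU_EVENT_DEFAULT);
    cuEventCreate(&stop, CU_EVENT_DEFAULT);

    cuEventRecord(start, stream);
    /* ... enqueue the kernels / copies to be timed into `stream` ... */
    cuEventRecord(stop, stream);

    cuEventSynchronize(stop);          /* block until `stop` has completed */

    float ms = 0.0f;
    cuEventElapsedTime(&ms, start, stop);

    cuEventDestroy(start);
    cuEventDestroy(stop);
    return ms;
}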
+ * + * \param hEvent - Event to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventSynchronize + */ +CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Destroys an event + * + * Destroys the event specified by \p hEvent. + * + * An event may be destroyed before it is complete (i.e., while + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the + * call does not block on completion of the event, and any associated + * resources will automatically be released asynchronously at completion. + * + * \param hEvent - Event to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventElapsedTime, + * ::cudaEventDestroy + */ +CUresult CUDAAPI cuEventDestroy(CUevent hEvent); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** + * \brief Computes the elapsed time between two events + * + * Computes the elapsed time between two events (in milliseconds with a + * resolution of around 0.5 microseconds). + * + * If either event was last recorded in a non-NULL stream, the resulting time + * may be greater than expected (even if both used the same stream handle). This + * happens because the ::cuEventRecord() operation takes place asynchronously + * and there is no guarantee that the measured latency is actually just between + * the two events. Any number of other different stream operations could execute + * in between the two measured events, thus altering the timing in a significant + * way. + * + * If ::cuEventRecord() has not been called on either event then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called + * on both events but one or both of them has not yet been completed (that is, + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the + * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with + * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return + * ::CUDA_ERROR_INVALID_HANDLE. + * + * \param pMilliseconds - Time between \p hStart and \p hEnd in ms + * \param hStart - Starting event + * \param hEnd - Ending event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cudaEventElapsedTime + */ +CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); + +/** @} */ /* END CUDA_EVENT */ + +/** + * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability + * + * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the external resource interoperability functions of the low-level CUDA + * driver application programming interface. 
+ * + * @{ + */ + +#if __CUDA_API_VERSION >= 10000 + + /** + * \brief Imports an external memory object + * + * Imports an externally allocated memory object and returns + * a handle to that in \p extMem_out. + * + * The properties of the handle being imported must be described in + * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure + * is defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { + CUexternalMemoryHandleType type; + union { + int fd; + struct { + void *handle; + const void *name; + } win32; + } handle; + unsigned long long size; + unsigned int flags; + } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type + * of handle being imported. ::CUexternalMemoryHandleType is + * defined as: + * + * \code + typedef enum CUexternalMemoryHandleType_enum { + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5 + } CUexternalMemoryHandleType; + * \endcode + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a memory object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a memory object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a memory object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must + * be non-NULL and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. This handle does not hold a reference to the underlying + * object, and thus will be invalid when all references to the + * memory object are destroyed. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3DDevice::CreateSharedHandle when referring to a + * ID3D12Heap object. This handle holds a reference to the underlying + * object. 
If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Heap object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3DDevice::CreateSharedHandle when referring to a + * ID3D12Resource object. This handle holds a reference to the + * underlying object. If + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Resource object. + * + * The size of the memory object must be specified in + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size. + * + * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the + * resource is a dedicated resource. The definition of what a + * dedicated resource is outside the scope of this extension. + * + * \param extMem_out - Returned handle to an external memory object + * \param memHandleDesc - Memory import handle descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the + * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges + * as well as appropriate Vulkan pipeline barriers to maintain coherence between + * CPU and GPU. For more information on these APIs, please refer to "Synchronization + * and Cache Control" chapter from Vulkan specification. + * + * \sa ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedBuffer, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc); + +/** + * \brief Maps a buffer onto an imported memory object + * + * Maps a buffer onto an imported memory object and returns a device + * pointer in \p devPtr. + * + * The properties of the buffer being mapped must be described in + * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is + * defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { + unsigned long long offset; + unsigned long long size; + unsigned int flags; + } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in + * the memory object where the buffer's base address is. + * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer. + * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero. + * + * The offset and size have to be suitably aligned to match the + * requirements of the external API. Mapping two buffers whose ranges + * overlap may or may not result in the same virtual address being + * returned for the overlapped portion. In such cases, the application + * must ensure that all accesses to that region from the GPU are + * volatile. Otherwise writes made via one address are not guaranteed + * to be visible via the other address, even if they're issued by the + * same thread. 
It is recommended that applications map the combined + * range instead of mapping separate buffers and then apply the + * appropriate offsets to the returned pointer to derive the + * individual buffers. + * + * The returned pointer \p devPtr must be freed using ::cuMemFree. + * + * \param devPtr - Returned device pointer to buffer + * \param extMem - Handle to external memory object + * \param bufferDesc - Buffer descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory + * ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc); + +/** + * \brief Maps a CUDA mipmapped array onto an external memory object + * + * Maps a CUDA mipmapped array onto an external object and returns a + * handle to it in \p mipmap. + * + * The properties of the CUDA mipmapped array being mapped must be + * described in \p mipmapDesc. The structure + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { + unsigned long long offset; + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + unsigned int numLevels; + } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the + * offset in the memory object where the base level of the mipmap + * chain is. + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes + * the format, dimensions and type of the base level of the mipmap + * chain. For further details on these parameters, please refer to the + * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped + * array is bound as a color target in the graphics API, then the flag + * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags. + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies + * the total number of levels in the mipmap chain. + * + * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. + * + * \param mipmap - Returned CUDA mipmapped array + * \param extMem - Handle to external memory object + * \param mipmapDesc - CUDA array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory + * ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedBuffer + */ +CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); + +/** + * \brief Destroys an external memory object. + * + * Destroys the specified external memory object. Any existing buffers + * and CUDA mipmapped arrays mapped onto this object must no longer be + * used and must be explicitly freed using ::cuMemFree and + * ::cuMipmappedArrayDestroy respectively. 
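/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * importing an opaque-FD memory object (e.g. exported by Vulkan) and mapping a
 * buffer out of it, using the descriptors documented above. The file
 * descriptor `fd` and `size` are assumed to come from the exporting API;
 * error handling elided.
 */
#include <cuda.h>
#include <string.h>

static CUdeviceptr import_fd_buffer(int fd, unsigned long long size, CUexternalMemory *extMemOut)
{
    CUDA_EXTERNAL_MEMORY_HANDLE_DESC memDesc;
    memset(&memDesc, 0, sizeof(memDesc));
    memDesc.type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
    memDesc.handle.fd = fd;            /* ownership of fd passes to the driver */
    memDesc.size      = size;

    cuImportExternalMemory(extMemOut, &memDesc);

    CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufDesc;
    memset(&bufDesc, 0, sizeof(bufDesc));
    bufDesc.offset = 0;
    bufDesc.size   = size;

    CUdeviceptr dptr = 0;
    cuExternalMemoryGetMappedBuffer(&dptr, *extMemOut, &bufDesc);
    return dptr;   /* free with cuMemFree before cuDestroyExternalMemory(*extMemOut) */
}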
+ * + * \param extMem - External memory object to be destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory + * ::cuExternalMemoryGetMappedBuffer, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); + +/** + * \brief Imports an external semaphore + * + * Imports an externally allocated synchronization object and returns + * a handle to that in \p extSem_out. + * + * The properties of the handle being imported must be described in + * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is + * defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { + CUexternalSemaphoreHandleType type; + union { + int fd; + struct { + void *handle; + const void *name; + } win32; + } handle; + unsigned int flags; + } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of + * handle being imported. ::CUexternalSemaphoreHandleType is defined + * as: + * + * \code + typedef enum CUexternalSemaphoreHandleType_enum { + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4 + } CUexternalSemaphoreHandleType; + * \endcode + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a synchronization object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be + * NULL. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a synchronization object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must + * be non-NULL and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. This handle does not hold a reference to the underlying + * object, and thus will be invalid when all references to the + * synchronization object are destroyed. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one + * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be + * NULL. 
If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3DDevice::CreateSharedHandle when referring to a + * ID3D12Fence object. This handle holds a reference to the underlying + * object. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object that + * refers to a valid ID3D12Fence object. + * + * \param extSem_out - Returned handle to an external semaphore + * \param semHandleDesc - Semaphore import handle descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc); + +/** + * \brief Signals a set of external semaphore objects + * + * Enqueues a signal operation on a set of externally allocated + * semaphore object in the specified stream. The operations will be + * executed when all prior operations in the stream complete. + * + * The exact semantics of signaling a semaphore depends on the type of + * the object. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then signaling the semaphore will set it to the signaled state. + * + * If the semaphore object is of the type + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then the + * semaphore will be set to the value specified in + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. + * + * \param extSemArray - Set of external semaphores to be signaled + * \param paramsArray - Array of semaphore parameters + * \param numExtSems - Number of semaphores to signal + * \param stream - Stream to enqueue the signal operations in + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + +/** + * \brief Waits on a set of external semaphore objects + * + * Enqueues a wait operation on a set of externally allocated + * semaphore object in the specified stream. The operations will be + * executed when all prior operations in the stream complete. + * + * The exact semantics of waiting on a semaphore depends on the type + * of the object. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then waiting on the semaphore will wait until the semaphore reaches + * the signaled state. The semaphore will then be reset to the + * unsignaled state. Therefore for every signal operation, there can + * only be one wait operation. 
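/*
 * Illustrative sketch (editor's note, not part of the vendored cuda.h hunk):
 * signalling and then waiting on one imported semaphore with the APIs
 * documented above. `extSem` is assumed to come from
 * ::cuImportExternalSemaphore; zero-initialised params are assumed to be
 * sufficient for the opaque (non-D3D12-fence) handle types, where no fence
 * value is needed. Errors are ignored.
 */
#include <cuda.h>
#include <string.h>

static void signal_then_wait(CUexternalSemaphore extSem, CUstream producer, CUstream consumer)
{
    CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sig;
    memset(&sig, 0, sizeof(sig));
    cuSignalExternalSemaphoresAsync(&extSem, &sig, 1, producer);

    CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS wait;
    memset(&wait, 0, sizeof(wait));
    cuWaitExternalSemaphoresAsync(&extSem, &wait, 1, consumer);
}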
+ * + * If the semaphore object is of the type + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then waiting on + * the semaphore will wait until the value of the semaphore is + * greater than or equal to + * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value. + * + * \param extSemArray - External semaphores to be waited on + * \param paramsArray - Array of semaphore parameters + * \param numExtSems - Number of semaphores to wait on + * \param stream - Stream to enqueue the wait operations in + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync + */ +CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + +/** + * \brief Destroys an external semaphore + * + * Destroys an external semaphore object and releases any references + * to the underlying resource. Any outstanding signals or waits must + * have completed before the semaphore is destroyed. + * + * \param extSem - External semaphore to be destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); + +#endif /* __CUDA_API_VERSION >= 10000 */ + +/** @} */ /* END CUDA_EXTRES_INTEROP */ + +/** + * \defgroup CUDA_MEMOP Stream memory operations + * + * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream memory operations of the low-level CUDA + * driver application programming interface. + * + * The whole set of operations is disabled by default. Users are required + * to explicitly enable them, e.g. on Linux by passing the kernel module + * parameter shown below: + * modprobe nvidia NVreg_EnableStreamMemOPs=1 + * There is currently no way to enable these operations on other operating + * systems. + * + * Users can programmatically query whether the device supports these + * operations with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. + * + * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. + * + * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() + * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and + * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and + * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform + * hardware features and can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. + * + * Note that all memory pointers passed as parameters to these operations + * are device pointers. Where necessary a device pointer should be + * obtained, for example with ::cuMemHostGetDevicePointer(). + * + * None of the operations accepts pointers to managed memory buffers + * (::cuMemAllocManaged). 
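+ *
+ * As an illustrative sketch (not part of the original documentation), one
+ * stream can release another through a 32-bit flag, assuming \c dev is a
+ * valid ::CUdevice, \c flagPtr was obtained via ::cuMemHostGetDevicePointer(),
+ * and \c producer / \c consumer are valid streams:
+ * \code
+    int supported = 0;
+    cuDeviceGetAttribute(&supported, CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev);
+    if (supported) {
+        // consumer blocks until the 32-bit value at flagPtr is >= 1
+        cuStreamWaitValue32(consumer, flagPtr, 1, CU_STREAM_WAIT_VALUE_GEQ);
+        // producer sets the flag once all prior work in its stream completes
+        cuStreamWriteValue32(producer, flagPtr, 1, CU_STREAM_WRITE_VALUE_DEFAULT);
+    }
+ * \endcode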
+ * + * @{ + */ + +#if __CUDA_API_VERSION >= 8000 +/** + * \brief Wait on a memory location + * + * Enqueues a synchronization of the stream on the given memory location. Work + * ordered after the operation will block until the given condition on the + * memory is satisfied. By default, the condition is to wait for + * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. + * Other condition types can be specified via \p flags. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot + * be used with managed memory (::cuMemAllocManaged). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. + * + * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. + * + * \param stream The stream to synchronize on the memory location. + * \param addr The memory location to wait on. + * \param value The value to compare with the memory location. + * \param flags See ::CUstreamWaitValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue64, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64 + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + +/** + * \brief Wait on a memory location + * + * Enqueues a synchronization of the stream on the given memory location. Work + * ordered after the operation will block until the given condition on the + * memory is satisfied. By default, the condition is to wait for + * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. + * Other condition types can be specified via \p flags. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * \param stream The stream to synchronize on the memory location. + * \param addr The memory location to wait on. + * \param value The value to compare with the memory location. + * \param flags See ::CUstreamWaitValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue32, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + +/** + * \brief Write a value to memory + * + * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER + * flag is passed, the write is preceded by a system-wide memory fence, + * equivalent to a __threadfence_system() but scoped to the stream + * rather than a CUDA thread. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot + * be used with managed memory (::cuMemAllocManaged). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. + * + * \param stream The stream to do the write in. 
+ * \param addr The device address to write to. + * \param value The value to write. + * \param flags See ::CUstreamWriteValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWriteValue64, + * ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuEventRecord + */ +CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + +/** + * \brief Write a value to memory + * + * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER + * flag is passed, the write is preceded by a system-wide memory fence, + * equivalent to a __threadfence_system() but scoped to the stream + * rather than a CUDA thread. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * \param stream The stream to do the write in. + * \param addr The device address to write to. + * \param value The value to write. + * \param flags See ::CUstreamWriteValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWriteValue32, + * ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuEventRecord + */ +CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + +/** + * \brief Batch operations to synchronize the stream via memory operations + * + * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32(). + * Batching operations may avoid some performance overhead in both the API call + * and the device execution versus adding them to the stream in separate API + * calls. The operations are enqueued in the order they appear in the array. + * + * See ::CUstreamBatchMemOpType for the full set of supported operations, and + * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(), + * and ::cuStreamWriteValue64() for details of specific operations. + * + * Basic support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. See related APIs for details + * on querying support for specific operations. + * + * \param stream The stream to enqueue the operations in. + * \param count The number of operations in the array. Must be less than 256. + * \param paramArray The types and parameters of the individual operations. + * \param flags Reserved for future expansion; must be 0. 
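+ *
+ * An illustrative sketch (not part of the original documentation) of batching
+ * one wait and one write, assuming \c addrA and \c addrB are valid device
+ * pointers to 32-bit locations and \c hStream is a valid stream:
+ * \code
+    CUstreamBatchMemOpParams ops[2];
+    memset(ops, 0, sizeof(ops));
+    ops[0].operation          = CU_STREAM_MEM_OP_WAIT_VALUE_32;
+    ops[0].waitValue.address  = addrA;
+    ops[0].waitValue.value    = 1;
+    ops[0].waitValue.flags    = CU_STREAM_WAIT_VALUE_GEQ;
+    ops[1].operation          = CU_STREAM_MEM_OP_WRITE_VALUE_32;
+    ops[1].writeValue.address = addrB;
+    ops[1].writeValue.value   = 1;
+    ops[1].writeValue.flags   = CU_STREAM_WRITE_VALUE_DEFAULT;
+    cuStreamBatchMemOp(hStream, 2, ops, 0);
+ * \endcode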
+ * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64, + * ::cuMemHostRegister + */ +CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); +#endif /* __CUDA_API_VERSION >= 8000 */ + +/** @} */ /* END CUDA_MEMOP */ + +/** + * \defgroup CUDA_EXEC Execution Control + * + * ___MANBRIEF___ execution control functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the execution control functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns information about a function + * + * Returns in \p *pi the integer value of the attribute \p attrib on the kernel + * given by \p hfunc. The supported attributes are: + * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads + * per block, beyond which a launch of the function would fail. This number + * depends on both the function and the device on which the function is + * currently loaded. + * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of + * statically-allocated shared memory per block required by this function. + * This does not include dynamically-allocated shared memory requested by + * the user at runtime. + * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated + * constant memory required by this function. + * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory + * used by each thread of this function. + * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread + * of this function. + * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for + * which the function was compiled. This value is the major PTX version * 10 + * + the minor PTX version, so a PTX version 1.3 function would return the + * value 13. Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for + * which the function was compiled. This value is the major binary + * version * 10 + the minor binary version, so a binary version 1.3 function + * would return the value 13. Note that this will return a value of 10 for + * legacy cubins that do not have a properly-encoded binary architecture + * version. + * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has + * been compiled with user specified option "-Xptxas --dlcm=ca" set . + * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of + * dynamically-allocated shared memory. + * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 + * cache split ratio in percent of total shared memory. 
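+ *
+ * For example (an illustrative sketch, assuming \c hfunc is a valid
+ * ::CUfunction), the launch bound of a kernel can be queried as:
+ * \code
+    int maxThreads = 0;
+    cuFuncGetAttribute(&maxThreads, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, hfunc);
+    // maxThreads now holds the largest block size this kernel can be launched with
+ * \endcode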
+ * + * \param pi - Returned attribute value + * \param attrib - Attribute requested + * \param hfunc - Function to query attribute of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuLaunchKernel, + * ::cudaFuncGetAttributes + * ::cudaFuncSetAttribute + */ +CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); + +#if __CUDA_API_VERSION >= 9000 + +/** + * \brief Sets information about a function + * + * This call sets the value of a specified attribute \p attrib on the kernel given + * by \p hfunc to an integer value specified by \p val + * This function returns CUDA_SUCCESS if the new value of the attribute could be + * successfully set. If the set fails, this call will return an error. + * Not all attributes can have values set. Attempting to set a value on a read-only + * attribute will result in an error (CUDA_ERROR_INVALID_VALUE) + * + * Supported attributes for the cuFuncSetAttribute call are: + * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This maximum size in bytes of + * dynamically-allocated shared memory. The value should contain the requested + * maximum size of dynamically-allocated shared memory. The sum of this value and + * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the + * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. + * The maximal size of requestable dynamic shared memory may differ by GPU + * architecture. + * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 + * cache and shared memory use the same hardware resources, this sets the shared memory + * carveout preference, in percent of the total shared memory. + * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * + * \param hfunc - Function to query attribute of + * \param attrib - Attribute requested + * \param value - The value to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuLaunchKernel, + * ::cudaFuncGetAttributes + * ::cudaFuncSetAttribute + */ +CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); +#endif // __CUDA_API_VERSION >= 9000 + +/** + * \brief Sets the preferred cache configuration for a device function + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p config the preferred cache configuration for + * the device function \p hfunc. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute \p hfunc. Any context-wide preference + * set via ::cuCtxSetCacheConfig() will be overridden by this per-function + * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In + * that case, the current context-wide setting will be used. 
+ * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param hfunc - Kernel to configure cache for + * \param config - Requested cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchKernel, + * ::cudaFuncSetCacheConfig + */ +CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); + +#if __CUDA_API_VERSION >= 4020 +/** + * \brief Sets the shared memory configuration for a device function. + * + * On devices with configurable shared memory banks, this function will + * force all subsequent launches of the specified device function to have + * the given shared memory bank size configuration. On any given launch of the + * function, the shared memory configuration of the device will be temporarily + * changed if needed to suit the function's preferred configuration. Changes in + * shared memory configuration between subsequent launches of functions, + * may introduce a device side synchronization point. + * + * Any per-function setting of shared memory bank size set via + * ::cuFuncSetSharedMemConfig will override the context wide setting set with + * ::cuCtxSetSharedMemConfig. + * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * The supported bank configurations are: + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory + * configuration when launching this function. + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to + * be natively four bytes when launching this function. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to + * be natively eight bytes when launching this function. 
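+ *
+ * An illustrative sketch (not part of the original documentation), assuming
+ * \c hfunc is a valid ::CUfunction whose kernel mainly accesses 64-bit words
+ * in shared memory:
+ * \code
+    // prefer a larger shared-memory carveout and eight-byte banks for this kernel
+    cuFuncSetCacheConfig(hfunc, CU_FUNC_CACHE_PREFER_SHARED);
+    cuFuncSetSharedMemConfig(hfunc, CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE);
+ * \endcode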
+ * + * \param hfunc - kernel to be given a shared memory config + * \param config - requested shared memory configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxSetSharedMemConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchKernel, + * ::cudaFuncSetSharedMemConfig + */ +CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); +#endif + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Launches a CUDA function + * + * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ + * grid of blocks. Each block contains \p blockDimX x \p blockDimY x + * \p blockDimZ threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * Kernel parameters to \p f can be specified in one of two ways: + * + * 1) Kernel parameters can be specified via \p kernelParams. If \p f + * has N parameters, then \p kernelParams needs to be an array of N + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] + * must point to a region of memory from which the actual kernel + * parameter will be copied. The number of kernel parameters and their + * offsets and sizes do not need to be specified as that information is + * retrieved directly from the kernel's image. + * + * 2) Kernel parameters can also be packaged by the application into + * a single buffer that is passed in via the \p extra parameter. + * This places the burden on the application of knowing each kernel + * parameter's size and alignment/padding within the buffer. Here is + * an example of using the \p extra parameter in this manner: + * \code + size_t argBufferSize; + char argBuffer[256]; + + // populate argBuffer and argBufferSize + + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, + CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, + CU_LAUNCH_PARAM_END + }; + status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); + * \endcode + * + * The \p extra parameter exists to allow ::cuLaunchKernel to take + * additional less commonly used arguments. \p extra specifies a list of + * names of extra settings and their corresponding values. Each extra + * setting name is immediately followed by the corresponding value. The + * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. + * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer containing all + * the kernel parameters for launching kernel \p f; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t containing the + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel + * parameters are specified with both \p kernelParams and \p extra + * (i.e. both \p kernelParams and \p extra are non-NULL). + * + * Calling ::cuLaunchKernel() sets persistent function state that is + * the same as function state set through the following deprecated APIs: + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), + * ::cuParamSetv(). 
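+ *
+ * As an illustrative counterpart to the \p extra example above (not part of
+ * the original documentation), method 1) with \p kernelParams might look as
+ * follows, assuming \c f takes a device pointer and an \c int and that
+ * \c d_data, \c n and \c hStream are valid:
+ * \code
+    void *params[] = { &d_data, &n };
+    // 256-thread blocks over a one-dimensional grid; no dynamic shared memory
+    status = cuLaunchKernel(f, (n + 255) / 256, 1, 1, 256, 1, 1, 0, hStream, params, NULL);
+ * \endcode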
+ * + * When the kernel \p f is launched via ::cuLaunchKernel(), the previous + * block shape, shared size and parameter info associated with \p f + * is overwritten. + * + * Note that to use ::cuLaunchKernel(), the kernel \p f must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchKernel() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param f - Kernel to launch + * \param gridDimX - Width of grid in blocks + * \param gridDimY - Height of grid in blocks + * \param gridDimZ - Depth of grid in blocks + * \param blockDimX - X dimension of each thread block + * \param blockDimY - Y dimension of each thread block + * \param blockDimZ - Z dimension of each thread block + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes + * \param hStream - Stream identifier + * \param kernelParams - Array of pointers to kernel parameters + * \param extra - Extra options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cudaLaunchKernel + */ +CUresult CUDAAPI cuLaunchKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra); +#endif /* __CUDA_API_VERSION >= 4000 */ +#if __CUDA_API_VERSION >= 9000 +/** + * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute + * + * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ + * grid of blocks. Each block contains \p blockDimX x \p blockDimY x + * \p blockDimZ threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * The device on which this kernel is invoked must have a non-zero value for + * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH. + * + * The total number of blocks launched cannot exceed the maximum number of blocks per + * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * Kernel parameters must be specified via \p kernelParams. If \p f + * has N parameters, then \p kernelParams needs to be an array of N + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] + * must point to a region of memory from which the actual kernel + * parameter will be copied. The number of kernel parameters and their + * offsets and sizes do not need to be specified as that information is + * retrieved directly from the kernel's image. 
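+ *
+ * An illustrative sketch (not part of the original documentation) of sizing a
+ * cooperative launch against the limit described above, assuming \c f,
+ * \c dev and \c hStream are valid and \c kernelParams is set up as described:
+ * \code
+    int numSms = 0, blocksPerSm = 0;
+    cuDeviceGetAttribute(&numSms, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+    cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, f, 256, 0);
+    // the grid may not exceed blocksPerSm * numSms blocks of 256 threads each
+    cuLaunchCooperativeKernel(f, blocksPerSm * numSms, 1, 1, 256, 1, 1, 0, hStream, kernelParams);
+ * \endcode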
+ * + * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is + * the same as function state set through ::cuLaunchKernel API + * + * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous + * block shape, shared size and parameter info associated with \p f + * is overwritten. + * + * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param f - Kernel to launch + * \param gridDimX - Width of grid in blocks + * \param gridDimY - Height of grid in blocks + * \param gridDimZ - Depth of grid in blocks + * \param blockDimX - X dimension of each thread block + * \param blockDimY - Y dimension of each thread block + * \param blockDimZ - Z dimension of each thread block + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes + * \param hStream - Stream identifier + * \param kernelParams - Array of pointers to kernel parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchCooperativeKernelMultiDevice, + * ::cudaLaunchCooperativeKernel + */ +CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams); + +/** + * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute + * + * Invokes kernels as specified in the \p launchParamsList array where each element + * of the array specifies all the parameters required to perform a single kernel launch. + * These kernels can cooperate and synchronize as they execute. The size of the array is + * specified by \p numDevices. + * + * No two kernels can be launched on the same device. All the devices targeted by this + * multi-device launch must be identical. All devices must have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH. + * + * All kernels launched must be identical with respect to the compiled code. Note that + * any __device__, __constant__ or __managed__ variables present in the module that owns + * the kernel launched on each device, are independently instantiated on every device. + * It is the application's responsibility to ensure these variables are initialized and + * used appropriately. + * + * The size of the grids as specified in blocks, the size of the blocks themselves + * and the amount of shared memory used by each thread block must also match across + * all launched kernels. 
+ * + * The streams used to launch these kernels must have been created via either ::cuStreamCreate + * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD + * cannot be used. + * + * The total number of blocks launched per kernel cannot exceed the maximum number of blocks + * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the + * total number of blocks launched per device has to match across all devices, the maximum + * number of blocks that can be launched per device will be limited by the device with the + * least number of multiprocessors. + * + * The kernels cannot make use of CUDA dynamic parallelism. + * + * The ::CUDA_LAUNCH_PARAMS structure is defined as: + * \code + typedef struct CUDA_LAUNCH_PARAMS_st + { + CUfunction function; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + void **kernelParams; + } CUDA_LAUNCH_PARAMS; + * \endcode + * where: + * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must + * be identical with respect to the compiled code. + * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimX is the Y dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes. + * This must match across all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot + * be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated + * with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function. + * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If + * ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams + * needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through + * ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual + * kernel parameter will be copied. The number of kernel parameters and their offsets and sizes + * do not need to be specified as that information is retrieved directly from the kernel's image. + * + * By default, the kernel won't begin execution on any GPU until all prior work in all the specified + * streams has completed. This behavior can be overridden by specifying the flag + * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. 
When this flag is specified, each kernel + * will only wait for prior work in the stream corresponding to that GPU to complete before it begins + * execution. + * + * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin + * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying + * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified, + * any subsequent work pushed in any of the specified streams will only wait for the kernel launched + * on the GPU corresponding to that stream to complete before it begins execution. + * + * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is + * the same as function state set through ::cuLaunchKernel API when called individually for each + * element in \p launchParamsList. + * + * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous + * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function + * in \p launchParamsList is overwritten. + * + * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param launchParamsList - List of launch parameters, one per device + * \param numDevices - Size of the \p launchParamsList array + * \param flags - Flags to control launch behavior + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchCooperativeKernel, + * ::cudaLaunchCooperativeKernelMultiDevice + */ +CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); + +#endif /* __CUDA_API_VERSION >= 9000 */ + +#if __CUDA_API_VERSION >= 10000 + +/** + * \brief Enqueues a host function call in a stream + * + * Enqueues a host function to run in a stream. The function will be called + * after currently enqueued work and will block work added after it. + * + * The host function must not make any CUDA API calls. Attempting to use a + * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required. + * The host function must not perform any synchronization that may depend on + * outstanding CUDA work not mandated to run earlier. Host functions without a + * mandated order (such as in independent streams) execute in undefined order + * and may be serialized. + * + * For the purposes of Unified Memory, execution makes a number of guarantees: + *
+ * - The stream is considered idle for the duration of the function's
+ *   execution. Thus, for example, the function may always use memory attached
+ *   to the stream it was enqueued in.
+ * - The start of execution of the function has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the function. It thus synchronizes streams which have been "joined"
+ *   prior to the function.
+ * - Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed. Thus, for example, a function might use global attached
+ *   memory even if work has been added to another stream, if the work has
+ *   been ordered behind the function call with an event.
+ * - Completion of the function does not cause a stream to become active
+ *   except as described above. The stream will remain idle if no device work
+ *   follows the function, and will remain idle across consecutive host
+ *   functions or stream callbacks without device work in between. Thus, for
+ *   example, stream synchronization can be done by signaling from a host
+ *   function at the end of the stream.
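+ *
+ * An illustrative sketch (not part of the original documentation) of using a
+ * host function to signal completion of a stream, assuming \c hStream is a
+ * valid stream:
+ * \code
+    void CUDA_CB markDone(void *userData) { *(volatile int *)userData = 1; }
+
+    volatile int done = 0;
+    // markDone runs on the host after all work previously enqueued in hStream
+    cuLaunchHostFunc(hStream, markDone, (void *)&done);
+ * \endcode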
+ * + * Note that, in contrast to ::cuStreamAddCallback, the function will not be + * called in the event of an error in the CUDA context. + * + * \param hStream - Stream to enqueue function call in + * \param fn - The function to call once preceding stream operations are complete + * \param userData - User-specified data to be passed to the function + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cuStreamAttachMemAsync, + * ::cuStreamAddCallback + */ +CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); + +#endif /* __CUDA_API_VERSION >= 10000 */ + +/** @} */ /* END CUDA_EXEC */ + +/** + * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] + * + * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated execution control functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Sets the block-dimensions for the function + * + * \deprecated + * + * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are + * created when the kernel given by \p hfunc is launched. + * + * \param hfunc - Kernel to specify dimensions of + * \param x - X dimension + * \param y - Y dimension + * \param z - Z dimension + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetSharedSize, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); + +/** + * \brief Sets the dynamic shared-memory size for the function + * + * \deprecated + * + * Sets through \p bytes the amount of dynamic shared memory that will be + * available to each thread block when the kernel given by \p hfunc is launched. + * + * \param hfunc - Kernel to specify dynamic shared-memory size for + * \param bytes - Dynamic shared-memory size per thread in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); + +/** + * \brief Sets the parameter size for the function + * + * \deprecated + * + * Sets through \p numbytes the total size in bytes needed by the function + * parameters of the kernel corresponding to \p hfunc. 
+ * + * \param hfunc - Kernel to set parameter size for + * \param numbytes - Size of parameter list in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); + +/** + * \brief Adds an integer parameter to the function's argument list + * + * \deprecated + * + * Sets an integer parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. + * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); + +/** + * \brief Adds a floating-point parameter to the function's argument list + * + * \deprecated + * + * Sets a floating-point parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. + * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); + +/** + * \brief Adds arbitrary data to the function's argument list + * + * \deprecated + * + * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr + * into the parameter space of the kernel corresponding to \p hfunc. \p offset + * is a byte offset. 
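+ *
+ * An illustrative sketch (not part of the original documentation) of the
+ * deprecated launch sequence these functions belong to, assuming \c hfunc is
+ * a valid ::CUfunction taking a single ::CUdeviceptr argument:
+ * \code
+    CUdeviceptr d_data;   // assumed to have been allocated with cuMemAlloc()
+    cuFuncSetBlockShape(hfunc, 256, 1, 1);
+    cuParamSetSize(hfunc, sizeof(d_data));
+    cuParamSetv(hfunc, 0, &d_data, sizeof(d_data));
+    cuLaunchGrid(hfunc, 64, 1);   // 64 x 1 grid of 256-thread blocks
+ * \endcode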
+ * + * \param hfunc - Kernel to add data to + * \param offset - Offset to add data to argument list + * \param ptr - Pointer to arbitrary data + * \param numbytes - Size of data to copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block + * contains the number of threads specified by a previous call to + * ::cuFuncSetBlockShape(). + * + * \param f - Kernel to launch + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). + * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). 
+ * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * + * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), + * this function may serialize kernel launches. In order to force the CUDA driver to retain + * asynchronous behavior, set the ::CU_CTX_LMEM_RESIZE_TO_MAX flag during context creation (see ::cuCtxCreate). + * + * \note_null_stream + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); + + +/** + * \brief Adds a texture-reference to the function's argument list + * + * \deprecated + * + * Makes the CUDA array or linear memory bound to the texture reference + * \p hTexRef available to a device program as a texture. In this version of + * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and + * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. + * + * \param hfunc - Kernel to add texture-reference to + * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) + * \param hTexRef - Texture-reference to add to argument list + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); +/** @} */ /* END CUDA_EXEC_DEPRECATED */ + +#if __CUDA_API_VERSION >= 10000 +/** + * \defgroup CUDA_GRAPH Graph Management + * + * ___MANBRIEF___ graph management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graph management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates a graph + * + * Creates an empty graph, which is returned via \p phGraph. 
+ * + * \param phGraph - Returns newly created graph + * \param flags - Graph creation flags, must be 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + * ::cuGraphInstantiate, + * ::cuGraphDestroy, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); + +/** + * \brief Creates a kernel execution node and adds it to a graph + * + * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The CUDA_KERNEL_NODE_PARAMS structure is defined as: + * + * \code + * typedef struct CUDA_KERNEL_NODE_PARAMS_st { + * CUfunction func; + * unsigned int gridDimX; + * unsigned int gridDimY; + * unsigned int gridDimZ; + * unsigned int blockDimX; + * unsigned int blockDimY; + * unsigned int blockDimZ; + * unsigned int sharedMemBytes; + * void **kernelParams; + * void **extra; + * } CUDA_KERNEL_NODE_PARAMS; + * \endcode + * + * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x + * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains + * (\p blockDimX x \p blockDimY x \p blockDimZ) threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * Kernel parameters to \p func can be specified in one of two ways: + * + * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N + * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, + * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual + * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need + * to be specified as that information is retrieved directly from the kernel's image. + * + * 2) Kernel parameters can also be packaged by the application into a single buffer that is passed in + * via \p extra. This places the burden on the application of knowing each kernel + * parameter's size and alignment/padding within the buffer. The \p extra parameter exists + * to allow this function to take additional less commonly used arguments. \p extra specifies + * a list of names of extra settings and their corresponding values. Each extra setting name is + * immediately followed by the corresponding value. The list must be terminated with either NULL or + * CU_LAUNCH_PARAM_END. 
+ * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer + * containing all the kernel parameters for launching kernel + * \p func; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t + * containing the size of the buffer specified with + * ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both + * \p kernelParams and \p extra (i.e. both \p kernelParams and + * \p extra are non-NULL). + * + * The \p kernelParams or \p extra array, as well as the argument values it points to, + * are copied during this call. + * + * \note Kernels launched using graphs must not use texture and surface references. Reading or + * writing through any texture or surface reference is undefined behavior. + * This restriction does not apply to texture and surface objects. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the GPU execution node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphKernelNodeGetParams, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a kernel node's parameters + * + * Returns the parameters of kernel node \p hNode in \p nodeParams. + * The \p kernelParams or \p extra array returned in \p nodeParams, + * as well as the argument values it points to, are owned by the node. + * This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphKernelNodeSetParams to update the + * parameters of this node. + * + * The params will contain either \p kernelParams or \p extra, + * according to which of these was most recently set on the node. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeSetParams + */ +CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a kernel node's parameters + * + * Sets the parameters of kernel node \p hNode to \p nodeParams. 
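+ *
+ * For example (an illustrative sketch, assuming \c hNode is a valid kernel
+ * node), the grid size of an existing node can be updated as:
+ * \code
+    CUDA_KERNEL_NODE_PARAMS p;
+    cuGraphKernelNodeGetParams(hNode, &p);
+    p.gridDimX *= 2;                       // double the grid width
+    cuGraphKernelNodeSetParams(hNode, &p); // apply the modified parameters
+ * \endcode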
+ * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeGetParams + */ +CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a memcpy node and adds it to a graph + * + * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the graph is launched, the node will perform the memcpy described by \p copyParams. + * See ::cuMemcpy3D() for a description of the structure and its restrictions. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer + * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed + * for those operand(s). The managed memory will be treated as residing on either the + * host or the device, depending on which memory type is specified. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param copyParams - Parameters for the memory copy + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphMemcpyNodeGetParams, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); + +/** + * \brief Returns a memcpy node's parameters + * + * Returns the parameters of memcpy node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeSetParams + */ +CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams); + +/** + * \brief Sets a memcpy node's parameters + * + * Sets the parameters of memcpy node \p hNode to \p nodeParams. 
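+ *
+ * An illustrative sketch (not part of the original documentation) of updating
+ * a node to perform a linear device-to-device copy, assuming \c hNode, the
+ * device pointers \c dstPtr / \c srcPtr and the size \c nbytes are valid:
+ * \code
+    CUDA_MEMCPY3D p;
+    memset(&p, 0, sizeof(p));
+    p.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    p.srcDevice     = srcPtr;
+    p.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+    p.dstDevice     = dstPtr;
+    p.WidthInBytes  = nbytes;   // describe the 1-D copy as a 3-D copy of height/depth 1
+    p.Height        = 1;
+    p.Depth         = 1;
+    cuGraphMemcpyNodeSetParams(hNode, &p);
+ * \endcode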
+ * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeGetParams + */ +CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams); + +/** + * \brief Creates a memset node and adds it to a graph + * + * Creates a new memset node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The element size must be 1, 2, or 4 bytes. + * When the graph is launched, the node will perform the memset described by \p memsetParams. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param memsetParams - Parameters for the memory set + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_CONTEXT + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphMemsetNodeGetParams, + * ::cuGraphMemsetNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode + */ +CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); + +/** + * \brief Returns a memset node's parameters + * + * Returns the parameters of memset node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeSetParams + */ +CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a memset node's parameters + * + * Sets the parameters of memset node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeGetParams + */ +CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a host execution node and adds it to a graph + * + * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. 
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the graph is launched, the node will invoke the specified CPU function. + * Host nodes are not supported under MPS with pre-Volta GPUs. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the host node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphHostNodeGetParams, + * ::cuGraphHostNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a host node's parameters + * + * Returns the parameters of host node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeSetParams + */ +CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a host node's parameters + * + * Sets the parameters of host node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeGetParams + */ +CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a child graph node and adds it to a graph + * + * Creates a new node which executes an embedded graph, and adds it to \p hGraph with + * \p numDependencies dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The node executes an embedded child graph. The child graph is cloned in this call. 
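+ *
+ * A minimal, illustrative sketch (not part of the upstream header text);
+ * \p hGraph, \p childGraph and \p dep are hypothetical handles owned by the caller:
+ * \code
+ * CUgraphNode childNode;
+ * CUgraphNode deps[] = { dep };                  // run after \p dep completes
+ * cuGraphAddChildGraphNode(&childNode, hGraph, deps, 1, childGraph);
+ * // The node keeps its own clone; later edits to \p childGraph do not affect it.
+ * \endcode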
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param childGraph - The graph to clone into this node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); + +/** + * \brief Gets a handle to the embedded graph of a child graph node + * + * Gets a handle to the embedded graph in a child graph node. This call + * does not clone the graph. Changes to the graph will be reflected in + * the node, and the node retains ownership of the graph. + * + * \param hNode - Node to get the embedded graph for + * \param phGraph - Location to store a handle to the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphNodeFindInClone + */ +CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph); + +/** + * \brief Creates an empty node and adds it to a graph + * + * Creates a new node which performs no operation, and adds it to \p hGraph with + * \p numDependencies dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * An empty node performs no operation during execution, but can be used for + * transitive ordering. For example, a phased execution graph with 2 groups of n + * nodes with a barrier between them can be represented using an empty node and + * 2*n dependency edges, rather than no empty node and n^2 dependency edges. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); + +/** + * \brief Clones a graph + * + * This function creates a copy of \p originalGraph and returns it in \p * phGraphClone. + * All parameters are copied into the cloned graph. The original graph may be modified + * after this call without affecting the clone. + * + * Child graph nodes in the original graph are recursively copied into the clone. 
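+ *
+ * A minimal, illustrative sketch (not part of the upstream header text);
+ * \p hGraph and \p kernelNode are hypothetical handles from previously built graph code:
+ * \code
+ * CUgraph clone;
+ * cuGraphClone(&clone, hGraph);
+ * // Locate the clone's copy of a node that was created in the original graph:
+ * CUgraphNode clonedKernelNode;
+ * cuGraphNodeFindInClone(&clonedKernelNode, kernelNode, clone);
+ * \endcode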
+ * + * \param phGraphClone - Returns newly created cloned graph + * \param originalGraph - Graph to clone + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphNodeFindInClone + */ +CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph); + +/** + * \brief Finds a cloned version of a node + * + * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode + * in the original graph. + * + * \p hClonedGraph must have been cloned from \p hOriginalGraph via ::cuGraphClone. + * \p hOriginalNode must have been in \p hOriginalGraph at the time of the call to + * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have + * been removed. The cloned node is then returned via \p phClonedNode. + * + * \param phNode - Returns handle to the cloned node + * \param hOriginalNode - Handle to the original node + * \param hClonedGraph - Cloned graph to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); + +/** + * \brief Returns a node's type + * + * Returns the node type of \p hNode in \p type. + * + * \param hNode - Node to query + * \param type - Pointer to return the node type + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphKernelNodeGetParams, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphHostNodeGetParams, + * ::cuGraphHostNodeSetParams, + * ::cuGraphMemcpyNodeGetParams, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphMemsetNodeGetParams, + * ::cuGraphMemsetNodeSetParams + */ +CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type); + +/** + * \brief Returns a graph's nodes + * + * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this + * function will return the number of nodes in \p numNodes. Otherwise, + * \p numNodes entries will be filled in. If \p numNodes is higher than the actual + * number of nodes, the remaining entries in \p nodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numNodes. + * + * \param hGraph - Graph to query + * \param nodes - Pointer to return the nodes + * \param numNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetType, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); + +/** + * \brief Returns a graph's root nodes + * + * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this + * function will return the number of root nodes in \p numRootNodes. Otherwise, + * \p numRootNodes entries will be filled in. 
If \p numRootNodes is higher than the actual + * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numRootNodes. + * + * \param hGraph - Graph to query + * \param rootNodes - Pointer to return the root nodes + * \param numRootNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphGetNodes, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetType, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); + +/** + * \brief Returns a graph's dependency edges + * + * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding + * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the + * node in \p from[i]. \p from and \p to may both be NULL, in which + * case this function only returns the number of edges in \p numEdges. Otherwise, + * \p numEdges entries will be filled in. If \p numEdges is higher than the actual + * number of edges, the remaining entries in \p from and \p to will be set to NULL, and + * the number of edges actually returned will be written to \p numEdges. + * + * \param hGraph - Graph to get the edges from + * \param from - Location to return edge endpoints + * \param to - Location to return edge endpoints + * \param numEdges - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); + +/** + * \brief Returns a node's dependencies + * + * Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this + * function will return the number of dependencies in \p numDependencies. Otherwise, + * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual + * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numDependencies. + * + * \param hNode - Node to query + * \param dependencies - Pointer to return the dependencies + * \param numDependencies - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetDependentNodes, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies + */ +CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); + +/** + * \brief Returns a node's dependent nodes + * + * Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which + * case this function will return the number of dependent nodes in \p numDependentNodes. + * Otherwise, \p numDependentNodes entries will be filled in. 
If \p numDependentNodes is + * higher than the actual number of dependent nodes, the remaining entries in + * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will + * be returned in \p numDependentNodes. + * + * \param hNode - Node to query + * \param dependentNodes - Pointer to return the dependent nodes + * \param numDependentNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetDependencies, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies + */ +CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); + +/** + * \brief Adds dependency edges to a graph + * + * The number of dependencies to be added is defined by \p numDependencies + * Elements in \p from and \p to at corresponding indices define a dependency. + * Each node in \p from and \p to must belong to \p hGraph. + * + * If \p numDependencies is 0, elements in \p from and \p to will be ignored. + * Specifying an existing dependency will return an error. + * + * \param hGraph - Graph to which dependencies are added + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be added + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphRemoveDependencies, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); + +/** + * \brief Removes dependency edges from a graph + * + * The number of \p dependencies to be removed is defined by \p numDependencies. + * Elements in \p from and \p to at corresponding indices define a dependency. + * Each node in \p from and \p to must belong to \p hGraph. + * + * If \p numDependencies is 0, elements in \p from and \p to will be ignored. + * Specifying a non-existing dependency will return an error. + * + * \param hGraph - Graph from which to remove dependencies + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be removed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddDependencies, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); + +/** + * \brief Remove a node from the graph + * + * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes + * on \p hNode and vice versa. 
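+ *
+ * A minimal, illustrative sketch (not part of the upstream header text);
+ * \p hGraph, \p a and \p b are hypothetical handles for an existing graph and
+ * two of its nodes:
+ * \code
+ * cuGraphAddDependencies(hGraph, &a, &b, 1);   // b now depends on a
+ * cuGraphDestroyNode(b);                       // removes b and severs the a -> b edge
+ * \endcode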
+ * + * \param hNode - Node to remove + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); + +/** + * \brief Creates an executable graph from a graph + * + * Instantiates \p hGraph as an executable graph. The graph is validated for any + * structural constraints or intra-node constraints which were not previously + * validated. If instantiation is successful, a handle to the instantiated graph + * is returned in \p graphExec. + * + * If there are any errors, diagnostic information may be returned in \p errorNode and + * \p logBuffer. This is the primary way to inspect instantiation errors. The output + * will be null terminated unless the diagnostics overflow + * the buffer. In this case, they will be truncated, and the last byte can be + * inspected to determine if truncation occurred. + * + * \param phGraphExec - Returns instantiated graph + * \param hGraph - Graph to instantiate + * \param phErrorNode - In case of an instantiation error, this may be modified to + * indicate a node contributing to the error + * \param logBuffer - A character buffer to store diagnostic messages + * \param bufferSize - Size of the log buffer in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphLaunch, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); + + +#if __CUDA_API_VERSION >= 10010 +/** + * \brief Sets the parameters for a kernel node in the given graphExec + * + * Sets the parameters of a kernel node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. The \p func field + * of \p nodeParams cannot be modified and must match the original value. + * All other values can be modified. + * + * The modifications take effect at the next launch of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - kernel node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphInstantiate + */ + CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +#endif /* __CUDA_API_VERSION >= 10010 */ + +/** + * \brief Launches an executable graph in a stream + * + * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing + * at a time. Each launch is ordered behind both any previous work in \p hStream + * and any previous launches of \p hGraphExec. 
To execute a graph concurrently, it must be + * instantiated multiple times into multiple executable graphs. + * + * \param hGraphExec - Executable graph to launch + * \param hStream - Stream in which to launch the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); + +/** + * \brief Destroys an executable graph + * + * Destroys the executable graph specified by \p hGraphExec, as well + * as all of its executable nodes. If the executable graph is + * in-flight, it will not be terminated, but rather freed + * asynchronously on completion. + * + * \param hGraphExec - Executable graph to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphLaunch + */ +CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec); + +/** + * \brief Destroys a graph + * + * Destroys the graph specified by \p hGraph, as well as all of its nodes. + * + * \param hGraph - Graph to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate + */ +CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); +/** @} */ /* END CUDA_GRAPH */ +#endif /* __CUDA_API_VERSION >= 10000 */ + +#if __CUDA_API_VERSION >= 6050 +/** + * \defgroup CUDA_OCCUPANCY Occupancy + * + * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the occupancy calculation functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns occupancy of a function + * + * Returns in \p *numBlocks the number of the maximum active blocks per + * streaming multiprocessor. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + */ +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); + +/** + * \brief Returns occupancy of a function + * + * Returns in \p *numBlocks the number of the maximum active blocks per + * streaming multiprocessor. + * + * The \p Flags parameter controls how special cases are handled. The + * valid flags are: + * + * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as + * ::cuOccupancyMaxActiveBlocksPerMultiprocessor; + * + * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the + * default behavior on platform where global caching affects + * occupancy. 
On such platforms, if caching is enabled, but + * per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching + * is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes + * the occupancy calculator return 0 in such cases. More information + * can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + */ +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); + +/** + * \brief Suggest a launch configuration with reasonable occupancy + * + * Returns in \p *blockSize a reasonable block size that can achieve + * the maximum occupancy (or, the maximum number of active warps with + * the fewest blocks per multiprocessor), and in \p *minGridSize the + * minimum grid size to achieve the maximum occupancy. + * + * If \p blockSizeLimit is 0, the configurator will use the maximum + * block size permitted by the device / function instead. + * + * If per-block dynamic shared memory allocation is not needed, the + * user should leave both \p blockSizeToDynamicSMemSize and \p + * dynamicSMemSize as 0. + * + * If per-block dynamic shared memory allocation is needed, then if + * the dynamic shared memory size is constant regardless of block + * size, the size should be passed through \p dynamicSMemSize, and \p + * blockSizeToDynamicSMemSize should be NULL. + * + * Otherwise, if the per-block dynamic shared memory size varies with + * different block sizes, the user needs to provide a unary function + * through \p blockSizeToDynamicSMemSize that computes the dynamic + * shared memory needed by \p func for any given block size. \p + * dynamicSMemSize is ignored. 
An example signature is: + * + * \code + * // Take block size, returns dynamic shared memory needed + * size_t blockToSmem(int blockSize); + * \endcode + * + * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy + * \param blockSize - Returned maximum block size that can achieve the maximum occupancy + * \param func - Kernel for which launch configuration is calculated + * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size + * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxPotentialBlockSize + */ +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit); + +/** + * \brief Suggest a launch configuration with reasonable occupancy + * + * An extended version of ::cuOccupancyMaxPotentialBlockSize. In + * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize, + * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags + * parameter. + * + * The \p Flags parameter controls how special cases are handled. The + * valid flags are: + * + * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as + * ::cuOccupancyMaxPotentialBlockSize; + * + * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the + * default behavior on platforms where global caching affects + * occupancy. On such platforms, the launch configurations that + * produce maximal occupancy might not support global + * caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE + * guarantees that the produced launch configuration is global + * caching compatible at a potential cost of occupancy. More information + * can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. 
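+ *
+ * A minimal, illustrative sketch (not part of the upstream header text) of
+ * sizing a 1D launch; \p func (a loaded CUfunction) and \p n (a problem size)
+ * are hypothetical:
+ * \code
+ * int minGridSize = 0, blockSize = 0;
+ * cuOccupancyMaxPotentialBlockSizeWithFlags(&minGridSize, &blockSize, func,
+ *                                           NULL, 0, 0, CU_OCCUPANCY_DEFAULT);
+ * // Round the grid up so every element of the hypothetical problem is covered.
+ * int gridSize = (int)((n + blockSize - 1) / blockSize);
+ * \endcode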
+ * + * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy + * \param blockSize - Returned maximum block size that can achieve the maximum occupancy + * \param func - Kernel for which launch configuration is calculated + * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size + * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to handle + * \param flags - Options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxPotentialBlockSizeWithFlags + */ +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); + +/** @} */ /* END CUDA_OCCUPANCY */ +#endif /* __CUDA_API_VERSION >= 6050 */ + +/** + * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated texture reference management functions of the + * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated texture reference management + * functions of the low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Binds an array as a texture reference + * + * \deprecated + * + * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. \p Flags must be set to + * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is + * unbound. + * + * \param hTexRef - Texture reference to bind + * \param hArray - Array to bind + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToArray + */ +CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); + +/** + * \brief Binds a mipmapped array to a texture reference + * + * \deprecated + * + * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. + * Any previous address or CUDA array state associated with the texture reference + * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. + * Any CUDA array previously bound to \p hTexRef is unbound. 
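+ *
+ * A minimal, illustrative sketch (not part of the upstream header text);
+ * \p texRef and \p mipArray are hypothetical handles (e.g. from
+ * ::cuModuleGetTexRef and ::cuMipmappedArrayCreate):
+ * \code
+ * cuTexRefSetMipmappedArray(texRef, mipArray, CU_TRSA_OVERRIDE_FORMAT);
+ * // Mipmap sampling behaviour is then configured with the cuTexRefSetMipmap* calls below.
+ * \endcode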
+ * + * \param hTexRef - Texture reference to bind + * \param hMipmappedArray - Mipmapped array to bind + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Binds an address as a texture reference + * + * \deprecated + * + * Binds a linear address range to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. Any memory previously bound to \p hTexRef + * is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, ::cuTexRefSetAddress() passes back a byte offset in + * \p *ByteOffset that must be applied to texture fetches in order to read from + * the desired memory. This offset must be divided by the texel size and + * passed to kernels that read from the texture so they can be applied to the + * ::tex1Dfetch() function. + * + * If the device memory pointer was returned from ::cuMemAlloc(), the offset + * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. + * + * The total number of elements (or texels) in the linear address range + * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. + * The number of elements is computed as (\p bytes / bytesPerElement), + * where bytesPerElement is determined from the data format and number of + * components set using ::cuTexRefSetFormat(). + * + * \param ByteOffset - Returned byte offset + * \param hTexRef - Texture reference to bind + * \param dptr - Device pointer to bind + * \param bytes - Size of memory to bind in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture + */ +CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); + +/** + * \brief Binds an address as a 2D texture reference + * + * \deprecated + * + * Binds a linear address range to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. Any memory previously bound to \p hTexRef + * is unbound. + * + * Using a ::tex2D() function inside a kernel requires a call to either + * ::cuTexRefSetArray() to bind the corresponding texture reference to an + * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear + * memory. 
+ * + * Function calls to ::cuTexRefSetFormat() cannot follow calls to + * ::cuTexRefSetAddress2D() for the same texture reference. + * + * It is required that \p dptr be aligned to the appropriate hardware-specific + * texture alignment. You can query this value using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. + * This value can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Width and Height, which are specified in elements (or texels), cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * \p Pitch, which is specified in bytes, cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. + * + * \param hTexRef - Texture reference to bind + * \param desc - Descriptor of CUDA array + * \param dptr - Device pointer to bind + * \param Pitch - Line pitch in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture2D + */ +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Sets the format for a texture reference + * + * \deprecated + * + * Specifies the format of the data to be read by the texture reference + * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the + * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: + * They specify the format of each component and the number of components per + * array element. + * + * \param hTexRef - Texture reference + * \param fmt - Format to set + * \param NumPackedComponents - Number of components per array element + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaCreateChannelDesc, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); + +/** + * \brief Sets the addressing mode for a texture reference + * + * \deprecated + * + * Specifies the addressing mode \p am for the given dimension \p dim of the + * texture reference \p hTexRef. If \p dim is zero, the addressing mode is + * applied to the first parameter of the functions used to fetch from the + * texture; if \p dim is 1, the second, and so on. 
::CUaddress_mode is defined + * as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only + * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * \param hTexRef - Texture reference + * \param dim - Dimension + * \param am - Addressing mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); + +/** + * \brief Sets the filtering mode for a texture reference + * + * \deprecated + * + * Specifies the filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToArray + */ +CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap filtering mode for a texture reference + * + * \deprecated + * + * Specifies the mipmap filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. 
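+ *
+ * A minimal, illustrative sketch (not part of the upstream header text) of a
+ * typical configuration combining this call with the related setters documented
+ * above; \p texRef is a hypothetical reference already bound to a mipmapped
+ * array, and CU_AD_FORMAT_FLOAT is assumed as a representative ::CUarray_format value:
+ * \code
+ * cuTexRefSetAddressMode(texRef, 0, CU_TR_ADDRESS_MODE_CLAMP);   // x dimension
+ * cuTexRefSetFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);       // within a level
+ * cuTexRefSetMipmapFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR); // between levels
+ * cuTexRefSetFormat(texRef, CU_AD_FORMAT_FLOAT, 4);              // float4 texels
+ * \endcode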
+ * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap level bias for a texture reference + * + * \deprecated + * + * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when + * reading memory through the texture reference \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param bias - Mipmap level bias + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); + +/** + * \brief Sets the mipmap min/max mipmap level clamps for a texture reference + * + * \deprecated + * + * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp + * respectively, to be used when reading memory through the texture reference + * \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param minMipmapLevelClamp - Mipmap min level clamp + * \param maxMipmapLevelClamp - Mipmap max level clamp + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); + +/** + * \brief Sets the maximum anisotropy for a texture reference + * + * \deprecated + * + * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through + * the texture reference \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. 
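+ *
+ * A minimal, illustrative sketch (not part of the upstream header text)
+ * combining the mipmap sampling controls above; \p texRef is a hypothetical
+ * texture reference already bound to a mipmapped array:
+ * \code
+ * cuTexRefSetMipmapLevelBias(texRef, 0.5f);          // favour slightly coarser levels
+ * cuTexRefSetMipmapLevelClamp(texRef, 0.0f, 8.0f);   // restrict level selection
+ * cuTexRefSetMaxAnisotropy(texRef, 8);
+ * \endcode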
+ * + * \param hTexRef - Texture reference + * \param maxAniso - Maximum anisotropy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); + +/** + * \brief Sets the border color for a texture reference + * + * \deprecated + * + * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference + * \p hTexRef. The color value supports only float type and holds color components in + * the following sequence: + * pBorderColor[0] holds 'R' component + * pBorderColor[1] holds 'G' component + * pBorderColor[2] holds 'B' component + * pBorderColor[3] holds 'A' component + * + * Note that the color values can be set only when the Address mode is set to + * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. + * Applications using integer border color values have to "reinterpret_cast" their values to float. + * + * \param hTexRef - Texture reference + * \param pBorderColor - RGBA color + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddressMode, + * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); + +/** + * \brief Sets the flags for a texture reference + * + * \deprecated + * + * Specifies optional flags via \p Flags to specify the behavior of data + * returned through the texture reference \p hTexRef. The valid flags are: + * + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of + * having the texture promote integer data to floating point data in the + * range [0, 1]. Note that texture with 32-bit integer format + * would not be promoted, regardless of whether or not this + * flag is specified; + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the + * default behavior of having the texture coordinates range + * from [0, Dim) where Dim is the width or height of the CUDA + * array. 
Instead, the texture coordinates [0, 1.0) reference + * the entire breadth of the array dimension; + * + * \param hTexRef - Texture reference + * \param Flags - Optional flags to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Gets the address associated with a texture reference + * + * \deprecated + * + * Returns in \p *pdptr the base address bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any device memory range. + * + * \param pdptr - Returned device address + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Gets the array bound to a texture reference + * + * \deprecated + * + * Returns in \p *phArray the CUDA array bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA array. + * + * \param phArray - Returned array + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); + +/** + * \brief Gets the mipmapped array bound to a texture reference + * + * \deprecated + * + * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture + * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA mipmapped array. 
+ * + * \param phMipmappedArray - Returned mipmapped array + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef); + +/** + * \brief Gets the addressing mode used by a texture reference + * + * \deprecated + * + * Returns in \p *pam the addressing mode corresponding to the + * dimension \p dim of the texture reference \p hTexRef. Currently, the only + * valid value for \p dim are 0 and 1. + * + * \param pam - Returned addressing mode + * \param hTexRef - Texture reference + * \param dim - Dimension + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim); + +/** + * \brief Gets the filter-mode used by a texture reference + * + * \deprecated + * + * Returns in \p *pfm the filtering mode of the texture reference + * \p hTexRef. + * + * \param pfm - Returned filtering mode + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); + +/** + * \brief Gets the format used by a texture reference + * + * \deprecated + * + * Returns in \p *pFormat and \p *pNumChannels the format and number + * of components of the CUDA array bound to the texture reference \p hTexRef. + * If \p pFormat or \p pNumChannels is NULL, it will be ignored. 
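+ *
+ * A minimal, illustrative sketch (not part of the upstream header text);
+ * \p texRef is a hypothetical, already-configured texture reference:
+ * \code
+ * CUarray_format fmt;
+ * int channels;
+ * cuTexRefGetFormat(&fmt, &channels, texRef);   // e.g. CU_AD_FORMAT_FLOAT, 4
+ * \endcode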
+ * + * \param pFormat - Returned format + * \param pNumChannels - Returned number of components + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags + */ +CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); + +/** + * \brief Gets the mipmap filtering mode for a texture reference + * + * \deprecated + * + * Returns the mipmap filtering mode in \p pfm that's used when reading memory through + * the texture reference \p hTexRef. + * + * \param pfm - Returned mipmap filtering mode + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); + +/** + * \brief Gets the mipmap level bias for a texture reference + * + * \deprecated + * + * Returns the mipmap level bias in \p pBias that's added to the specified mipmap + * level when reading memory through the texture reference \p hTexRef. + * + * \param pbias - Returned mipmap level bias + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); + +/** + * \brief Gets the min/max mipmap level clamps for a texture reference + * + * \deprecated + * + * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp + * that's used when reading memory through the texture reference \p hTexRef. 
+ * + * \param pminMipmapLevelClamp - Returned mipmap min level clamp + * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); + +/** + * \brief Gets the maximum anisotropy for a texture reference + * + * \deprecated + * + * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through + * the texture reference \p hTexRef. + * + * \param pmaxAniso - Returned maximum anisotropy + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); + +/** + * \brief Gets the border color used by a texture reference + * + * \deprecated + * + * Returns in \p pBorderColor, values of the RGBA color used by + * the texture reference \p hTexRef. + * The color value is of type float and holds color components in + * the following sequence: + * pBorderColor[0] holds 'R' component + * pBorderColor[1] holds 'G' component + * pBorderColor[2] holds 'B' component + * pBorderColor[3] holds 'A' component + * + * \param hTexRef - Texture reference + * \param pBorderColor - Returned Type and Value of RGBA color + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddressMode, + * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor + */ +CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); + +/** + * \brief Gets the flags used by a texture reference + * + * \deprecated + * + * Returns in \p *pFlags the flags of the texture reference \p hTexRef. + * + * \param pFlags - Returned flags + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); + +/** + * \brief Creates a texture reference + * + * \deprecated + * + * Creates a texture reference and returns its handle in \p *pTexRef. 
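/*
 * Illustrative sketch of the deprecated texture-reference getters documented
 * above; not a complete program. The module handle `mod` and the texref name
 * "tex_in" are hypothetical, and error handling is reduced to a single check.
 */
#include <cuda.h>
#include <stdio.h>

static void print_texref_state(CUmodule mod)
{
    CUtexref ref;
    CUarray_format fmt;
    CUfilter_mode filter;
    CUaddress_mode addr;
    int channels = 0;
    unsigned int flags = 0;

    /* "tex_in" is assumed to be a module-scope texture reference. */
    if (cuModuleGetTexRef(&ref, mod, "tex_in") != CUDA_SUCCESS)
        return;

    cuTexRefGetFormat(&fmt, &channels, ref);
    cuTexRefGetFilterMode(&filter, ref);
    cuTexRefGetAddressMode(&addr, ref, 0);   /* dim may only be 0 or 1 */
    cuTexRefGetFlags(&flags, ref);

    printf("format=%d channels=%d filter=%d addr=%d flags=0x%x\n",
           (int)fmt, channels, (int)filter, (int)addr, flags);
}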
Once + * created, the application must call ::cuTexRefSetArray() or + * ::cuTexRefSetAddress() to associate the reference with allocated memory. + * Other texture reference functions are used to specify the format and + * interpretation (addressing, filtering, etc.) to be used when the memory is + * read through this texture reference. + * + * \param pTexRef - Returned texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefDestroy + */ +CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); + +/** + * \brief Destroys a texture reference + * + * \deprecated + * + * Destroys the texture reference specified by \p hTexRef. + * + * \param hTexRef - Texture reference to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefCreate + */ +CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); + +/** @} */ /* END CUDA_TEXREF_DEPRECATED */ + + +/** + * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED] + * + * ___MANBRIEF___ surface reference management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the surface reference management functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Sets the CUDA array for a surface reference. + * + * \deprecated + * + * Sets the CUDA array \p hArray to be read and written by the surface reference + * \p hSurfRef. Any previous CUDA array state associated with the surface + * reference is superseded by this function. \p Flags must be set to 0. + * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. + * Any CUDA array previously bound to \p hSurfRef is unbound. + + * \param hSurfRef - Surface reference handle + * \param hArray - CUDA array handle + * \param Flags - set to 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuModuleGetSurfRef, + * ::cuSurfRefGetArray, + * ::cudaBindSurfaceToArray + */ +CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); + +/** + * \brief Passes back the CUDA array bound to a surface reference. + * + * \deprecated + * + * Returns in \p *phArray the CUDA array bound to the surface reference + * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference + * is not bound to any CUDA array. + + * \param phArray - Surface reference handle + * \param hSurfRef - Surface reference handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray + */ +CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); + +/** @} */ /* END CUDA_SURFREF_DEPRECATED */ + +#if __CUDA_API_VERSION >= 5000 +/** + * \defgroup CUDA_TEXOBJECT Texture Object Management + * + * ___MANBRIEF___ texture object management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the texture object management functions of the + * low-level CUDA driver application programming interface. 
The texture + * object API is only supported on devices of compute capability 3.0 or higher. + * + * @{ + */ + +/** + * \brief Creates a texture object + * + * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes + * the data to texture from. \p pTexDesc describes how the data should be sampled. + * \p pResViewDesc is an optional argument that specifies an alternate format for + * the data described by \p pResDesc, and also describes the subresource region + * to restrict access to when texturing. \p pResViewDesc can only be specified if + * the type of resource is a CUDA array or a CUDA mipmapped array. + * + * Texture objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a texture object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * The ::CUDA_RESOURCE_DESC structure is defined as: + * \code + typedef struct CUDA_RESOURCE_DESC_st + { + CUresourcetype resType; + + union { + struct { + CUarray hArray; + } array; + struct { + CUmipmappedArray hMipmappedArray; + } mipmap; + struct { + CUdeviceptr devPtr; + CUarray_format format; + unsigned int numChannels; + size_t sizeInBytes; + } linear; + struct { + CUdeviceptr devPtr; + CUarray_format format; + unsigned int numChannels; + size_t width; + size_t height; + size_t pitchInBytes; + } pitch2D; + } res; + + unsigned int flags; + } CUDA_RESOURCE_DESC; + + * \endcode + * where: + * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. + * CUresourceType is defined as: + * \code + typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, + CU_RESOURCE_TYPE_LINEAR = 0x02, + CU_RESOURCE_TYPE_PITCH2D = 0x03 + } CUresourcetype; + * \endcode + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray + * must be set to a valid CUDA array handle. + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray + * must be set to a valid CUDA mipmapped array handle. + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels + * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes + * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels + * describe the format of each component and the number of components per array element. 
::CUDA_RESOURCE_DESC::res::pitch2D::width + * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. + * + * - ::flags must be set to zero. + * + * + * The ::CUDA_TEXTURE_DESC struct is defined as + * \code + typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; + CUfilter_mode filterMode; + unsigned int flags; + unsigned int maxAnisotropy; + CUfilter_mode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; + } CUDA_TEXTURE_DESC; + * \endcode + * where + * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES + * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as: + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. + * + * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of having the texture promote integer data to floating point data in the + * range [0, 1]. Note that texture with 32-bit integer format would not be promoted, regardless of whether or not this flag is specified. + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior of having the texture coordinates range from [0, Dim) where Dim is + * the width or height of the CUDA array. Instead, the texture coordinates [0, 1.0) reference the entire breadth of the array dimension; Note + * that for CUDA mipmapped arrays, this flag has to be set. + * + * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be + * clamped to the range [1,16]. + * + * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. + * + * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. + * + * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. + * + * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. 
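/*
 * Illustrative sketch of ::cuTexObjectCreate using the descriptors explained
 * above: a texture object over a linear device buffer. `d_buf` and `num_elems`
 * are assumed to be a valid, texture-aligned CUdeviceptr and its element
 * count; no resource view descriptor is used.
 */
#include <cuda.h>
#include <string.h>

static CUresult make_1d_float_texobject(CUdeviceptr d_buf, size_t num_elems,
                                        CUtexObject *out)
{
    CUDA_RESOURCE_DESC res_desc;
    CUDA_TEXTURE_DESC tex_desc;

    memset(&res_desc, 0, sizeof(res_desc));
    res_desc.resType = CU_RESOURCE_TYPE_LINEAR;
    res_desc.res.linear.devPtr = d_buf;               /* must be texture-aligned */
    res_desc.res.linear.format = CU_AD_FORMAT_FLOAT;
    res_desc.res.linear.numChannels = 1;
    res_desc.res.linear.sizeInBytes = num_elems * sizeof(float);
    res_desc.flags = 0;                               /* must be zero */

    memset(&tex_desc, 0, sizeof(tex_desc));
    tex_desc.filterMode = CU_TR_FILTER_MODE_POINT;    /* addressMode is ignored for linear */

    return cuTexObjectCreate(out, &res_desc, &tex_desc, NULL);
}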
+ * + * + * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as + * \code + typedef struct CUDA_RESOURCE_VIEW_DESC_st + { + CUresourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int lastLayer; + } CUDA_RESOURCE_VIEW_DESC; + * \endcode + * where: + * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should + * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block + * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32. + * with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have + * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base + * format but with 4 channels. + * + * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the + * original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. + * For non-mipmapped resources, this value has to be zero.::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp + * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, + * then the actual minimum mipmap level clamp will be 3.2. + * + * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value + * has to be zero. + * + * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. + * For non-layered resources, this value has to be zero. + * + * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, + * this value has to be zero. + * + * + * \param pTexObject - Texture object to create + * \param pResDesc - Resource descriptor + * \param pTexDesc - Texture descriptor + * \param pResViewDesc - Resource view descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectDestroy, + * ::cudaCreateTextureObject + */ +CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); + +/** + * \brief Destroys a texture object + * + * Destroys the texture object specified by \p texObject. 
+ * + * \param texObject - Texture object to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaDestroyTextureObject + */ +CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject); + +/** + * \brief Returns a texture object's resource descriptor + * + * Returns the resource descriptor for the texture object specified by \p texObject. + * + * \param pResDesc - Resource descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectResourceDesc, + */ +CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject); + +/** + * \brief Returns a texture object's texture descriptor + * + * Returns the texture descriptor for the texture object specified by \p texObject. + * + * \param pTexDesc - Texture descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectTextureDesc + */ +CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject); + +/** + * \brief Returns a texture object's resource view descriptor + * + * Returns the resource view descriptor for the texture object specified by \p texObject. + * If no resource view was set for \p texObject, the ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param pResViewDesc - Resource view descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectResourceViewDesc + */ +CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject); + +/** @} */ /* END CUDA_TEXOBJECT */ + +/** + * \defgroup CUDA_SURFOBJECT Surface Object Management + * + * ___MANBRIEF___ surface object management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the surface object management functions of the + * low-level CUDA driver application programming interface. The surface + * object API is only supported on devices of compute capability 3.0 or higher. + * + * @{ + */ + +/** + * \brief Creates a surface object + * + * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes + * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be + * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray + * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. + * + * Surface objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a surface object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. 
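/*
 * Illustrative sketch of ::cuSurfObjectCreate: a 2D CUDA array created with
 * the ::CUDA_ARRAY3D_SURFACE_LDST flag and wrapped in a surface object. The
 * width, height and RGBA8 format are arbitrary example choices.
 */
#include <cuda.h>
#include <string.h>

static CUresult make_surface_object(size_t width, size_t height,
                                    CUarray *arr_out, CUsurfObject *surf_out)
{
    CUDA_ARRAY3D_DESCRIPTOR ad;
    CUDA_RESOURCE_DESC rd;
    CUresult err;

    memset(&ad, 0, sizeof(ad));
    ad.Width = width;
    ad.Height = height;
    ad.Depth = 0;                           /* depth 0 -> 2D array */
    ad.Format = CU_AD_FORMAT_UNSIGNED_INT8;
    ad.NumChannels = 4;
    ad.Flags = CUDA_ARRAY3D_SURFACE_LDST;   /* required for surface load/store */

    err = cuArray3DCreate(arr_out, &ad);
    if (err != CUDA_SUCCESS)
        return err;

    memset(&rd, 0, sizeof(rd));
    rd.resType = CU_RESOURCE_TYPE_ARRAY;
    rd.res.array.hArray = *arr_out;
    rd.flags = 0;                           /* must be zero */

    return cuSurfObjectCreate(surf_out, &rd);
}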
+ * + * \param pSurfObject - Surface object to create + * \param pResDesc - Resource descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectDestroy, + * ::cudaCreateSurfaceObject + */ +CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc); + +/** + * \brief Destroys a surface object + * + * Destroys the surface object specified by \p surfObject. + * + * \param surfObject - Surface object to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectCreate, + * ::cudaDestroySurfaceObject + */ +CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); + +/** + * \brief Returns a surface object's resource descriptor + * + * Returns the resource descriptor for the surface object specified by \p surfObject. + * + * \param pResDesc - Resource descriptor + * \param surfObject - Surface object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectCreate, + * ::cudaGetSurfaceObjectResourceDesc + */ +CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); + +/** @} */ /* END CUDA_SURFOBJECT */ +#endif /* __CUDA_API_VERSION >= 5000 */ + +/** + * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access + * + * ___MANBRIEF___ direct peer context memory access functions of the low-level + * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the direct peer context memory access functions + * of the low-level CUDA driver application programming interface. + * + * @{ + */ + +#if __CUDA_API_VERSION >= 4000 + +/** + * \brief Queries if a device may directly access a peer device's memory. + * + * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of + * directly accessing memory from contexts on \p peerDev and 0 otherwise. + * If direct access of \p peerDev from \p dev is possible, then access may be + * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). + * + * \param canAccessPeer - Returned access capability + * \param dev - Device from which allocations on \p peerDev are to + * be directly accessed. + * \param peerDev - Device on which the allocations to be directly accessed + * by \p dev reside. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuCtxEnablePeerAccess, + * ::cuCtxDisablePeerAccess, + * ::cudaDeviceCanAccessPeer + */ +CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); + +/** + * \brief Enables direct access to memory allocations in a peer context. + * + * If both the current context and \p peerContext are on devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same + * major compute capability, then on success all allocations from \p peerContext will + * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional + * details. 
+ * + * Note that access granted by this call is unidirectional and that in order to access + * memory from the current context in \p peerContext, a separate symmetric call + * to ::cuCtxEnablePeerAccess() is required. + * + * There is a system-wide maximum of eight peer connections per device. + * + * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates + * that the ::CUdevice of the current context cannot directly access memory + * from the ::CUdevice of \p peerContext. + * + * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of + * \p peerContext from the current context has already been enabled. + * + * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible + * because hardware resources required for peer access have been exhausted. + * + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext + * is not a valid context, or if the current context is \p peerContext. + * + * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. + * + * \param peerContext - Peer context to enable direct access to from the current context + * \param Flags - Reserved for future use and must be set to 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, + * ::CUDA_ERROR_TOO_MANY_PEERS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuDeviceCanAccessPeer, + * ::cuCtxDisablePeerAccess, + * ::cudaDeviceEnablePeerAccess + */ +CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); + +/** + * \brief Disables direct access to memory allocations in a peer context and + * unregisters any registered allocations. + * + Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has + * not yet been enabled from \p peerContext to the current context. + * + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if + * \p peerContext is not a valid context. + * + * \param peerContext - Peer context to disable direct access to + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * \notefnerr + * + * \sa + * ::cuDeviceCanAccessPeer, + * ::cuCtxEnablePeerAccess, + * ::cudaDeviceDisablePeerAccess + */ +CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); + +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 8000 + +/** + * \brief Queries attributes of the link between two devices. + * + * Returns in \p *value the value of the requested attribute \p attrib of the + * link between \p srcDevice and \p dstDevice. The supported attributes are: + * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the + * performance of the link between two devices. + * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED P2P: 1 if P2P Access is enable. + * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over + * the link are supported. + * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can + * be accessed over the link. + * + * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid + * or if they represent the same device. + * + * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is + * a null pointer. 
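/*
 * Illustrative sketch of the peer-access calls documented above, enabling
 * one-directional access from device 0's primary context to device 1's.
 * Assumes cuInit(0) has already been called and at least two devices are
 * present; the primary contexts should eventually be released again with
 * cuDevicePrimaryCtxRelease.
 */
#include <cuda.h>
#include <stdio.h>

static void enable_p2p_from_dev0_to_dev1(void)
{
    CUdevice dev0, dev1;
    CUcontext ctx0, ctx1;
    int can_access = 0, perf_rank = 0;

    cuDeviceGet(&dev0, 0);
    cuDeviceGet(&dev1, 1);

    cuDeviceCanAccessPeer(&can_access, dev0, dev1);
    if (!can_access)
        return;

    cuDevicePrimaryCtxRetain(&ctx0, dev0);
    cuDevicePrimaryCtxRetain(&ctx1, dev1);

    cuCtxSetCurrent(ctx0);
    cuCtxEnablePeerAccess(ctx1, 0);   /* Flags must be 0; grants ctx0 -> ctx1 only */

    cuDeviceGetP2PAttribute(&perf_rank, CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK,
                            dev0, dev1);
    printf("peer link performance rank: %d\n", perf_rank);
}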
+ * + * \param value - Returned value of the requested attribute + * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. + * \param srcDevice - The source device of the target link. + * \param dstDevice - The destination device of the target link. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuCtxEnablePeerAccess, + * ::cuCtxDisablePeerAccess, + * ::cuDeviceCanAccessPeer, + * ::cudaDeviceGetP2PAttribute + */ +CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); + +#endif /* __CUDA_API_VERSION >= 8000 */ + +/** @} */ /* END CUDA_PEER_ACCESS */ + +/** + * \defgroup CUDA_GRAPHICS Graphics Interoperability + * + * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graphics interoperability functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Unregisters a graphics resource for access by CUDA + * + * Unregisters the graphics resource \p resource so it is not accessible by + * CUDA unless registered again. + * + * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is + * returned. + * + * \param resource - Resource to unregister + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuGraphicsD3D9RegisterResource, + * ::cuGraphicsD3D10RegisterResource, + * ::cuGraphicsD3D11RegisterResource, + * ::cuGraphicsGLRegisterBuffer, + * ::cuGraphicsGLRegisterImage, + * ::cudaGraphicsUnregisterResource + */ +CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource); + +/** + * \brief Get an array through which to access a subresource of a mapped graphics resource. + * + * Returns in \p *pArray an array through which the subresource of the mapped + * graphics resource \p resource which corresponds to array index \p arrayIndex + * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may + * change every time that \p resource is mapped. + * + * If \p resource is not a texture then it cannot be accessed via an array and + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. + * If \p arrayIndex is not a valid array index for \p resource then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * If \p mipLevel is not a valid mipmap level for \p resource then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. 
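/*
 * Illustrative sketch of ::cuGraphicsSubResourceGetMappedArray. `res` is
 * assumed to be a texture-backed CUgraphicsResource that was registered
 * earlier (for example with ::cuGraphicsGLRegisterImage).
 */
#include <cuda.h>

static CUresult with_mapped_level0(CUgraphicsResource res, CUstream stream)
{
    CUarray level0;
    CUresult err = cuGraphicsMapResources(1, &res, stream);
    if (err != CUDA_SUCCESS)
        return err;

    /* Array index 0, mipmap level 0; the returned handle may change on every map. */
    err = cuGraphicsSubResourceGetMappedArray(&level0, res, 0, 0);
    if (err == CUDA_SUCCESS) {
        /* While mapped, level0 can be used with cuMemcpy2D/cuMemcpy3D or
         * wrapped in a texture/surface object. */
    }

    return cuGraphicsUnmapResources(1, &res, stream);
}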
+ * + * \param pArray - Returned array through which a subresource of \p resource may be accessed + * \param resource - Mapped resource to access + * \param arrayIndex - Array index for array textures or cubemap face + * index as defined by ::CUarray_cubemap_face for + * cubemap textures for the subresource to access + * \param mipLevel - Mipmap level for the subresource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsSubResourceGetMappedArray + */ +CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); + +#if __CUDA_API_VERSION >= 5000 + +/** + * \brief Get a mipmapped array through which to access a mapped graphics resource. + * + * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics + * resource \p resource may be accessed. The value set in \p *pMipmappedArray may change every time + * that \p resource is mapped. + * + * If \p resource is not a texture then it cannot be accessed via a mipmapped array and + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed + * \param resource - Mapped resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsResourceGetMappedMipmappedArray + */ +CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); + +#endif /* __CUDA_API_VERSION >= 5000 */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Get a device pointer through which to access a mapped graphics resource. + * + * Returns in \p *pDevPtr a pointer through which the mapped graphics resource + * \p resource may be accessed. + * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer. + * The value set in \p pPointer may change every time that \p resource is mapped. + * + * If \p resource is not a buffer then it cannot be accessed via a pointer and + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ * * + * \param pDevPtr - Returned pointer through which \p resource may be accessed + * \param pSize - Returned size of the buffer accessible starting at \p *pPointer + * \param resource - Mapped resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer + */ +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Set usage flags for mapping a graphics resource + * + * Set \p flags for mapping the graphics resource \p resource. + * + * Changes to \p flags will take effect the next time \p resource is mapped. + * The \p flags argument may be any of the following: + + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this + * resource will be used. It is therefore assumed that this resource will be + * read from and written to by CUDA kernels. This is the default value. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which + * access this resource will not write to this resource. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels + * which access this resource will not read from this resource and will + * write over the entire contents of the resource, so none of the data + * previously stored in the resource will be preserved. + * + * If \p resource is presently mapped for access by CUDA then + * ::CUDA_ERROR_ALREADY_MAPPED is returned. + * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param resource - Registered resource to set flags for + * \param flags - Parameters for resource mapping + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cudaGraphicsResourceSetMapFlags + */ +CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); + +/** + * \brief Map graphics resources for access by CUDA + * + * Maps the \p count graphics resources in \p resources for access by CUDA. + * + * The resources in \p resources may be accessed by CUDA until they + * are unmapped. The graphics API from which \p resources were registered + * should not access any resources while they are mapped by CUDA. If an + * application does so, the results are undefined. + * + * This function provides the synchronization guarantee that any graphics calls + * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA + * work issued in \p stream begins. + * + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. + * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. 
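/*
 * Illustrative sketch of the typical map / use / unmap flow for a buffer-backed
 * resource. `res` is assumed to have been registered earlier (for example with
 * ::cuGraphicsGLRegisterBuffer); the mapped pointer is simply cleared here.
 */
#include <cuda.h>

static CUresult zero_mapped_buffer(CUgraphicsResource res, CUstream stream)
{
    CUdeviceptr dptr;
    size_t nbytes;
    CUresult err;

    err = cuGraphicsMapResources(1, &res, stream);
    if (err != CUDA_SUCCESS)
        return err;

    err = cuGraphicsResourceGetMappedPointer(&dptr, &nbytes, res);
    if (err == CUDA_SUCCESS)
        err = cuMemsetD8Async(dptr, 0, nbytes, stream);   /* use while mapped */

    /* Graphics work issued after the unmap may safely consume the buffer. */
    cuGraphicsUnmapResources(1, &res, stream);
    return err;
}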
+ * + * \param count - Number of resources to map + * \param resources - Resources to map for CUDA usage + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cuGraphicsUnmapResources, + * ::cudaGraphicsMapResources + */ +CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** + * \brief Unmap graphics resources. + * + * Unmaps the \p count graphics resources in \p resources. + * + * Once unmapped, the resources in \p resources may not be accessed by CUDA + * until they are mapped again. + * + * This function provides the synchronization guarantee that any CUDA work issued + * in \p stream before ::cuGraphicsUnmapResources() will complete before any + * subsequently issued graphics work begins. + * + * + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. + * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param count - Number of resources to unmap + * \param resources - Resources to unmap + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cudaGraphicsUnmapResources + */ +CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** @} */ /* END CUDA_GRAPHICS */ + +CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); + + +/** + * CUDA API versioning support + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cuMemHostRegister + #undef cuGraphicsResourceSetMapFlags + #undef cuLinkCreate + #undef cuLinkAddData + #undef cuLinkAddFile + #undef cuDeviceTotalMem + #undef cuCtxCreate + #undef cuModuleGetGlobal + #undef cuMemGetInfo + #undef cuMemAlloc + #undef cuMemAllocPitch + #undef cuMemFree + #undef cuMemGetAddressRange + #undef cuMemAllocHost + #undef cuMemHostGetDevicePointer + #undef cuMemcpyHtoD + #undef cuMemcpyDtoH + #undef cuMemcpyDtoD + #undef cuMemcpyDtoA + #undef cuMemcpyAtoD + #undef cuMemcpyHtoA + #undef cuMemcpyAtoH + #undef cuMemcpyAtoA + #undef cuMemcpyHtoAAsync + #undef cuMemcpyAtoHAsync + #undef cuMemcpy2D + #undef cuMemcpy2DUnaligned + #undef cuMemcpy3D + #undef cuMemcpyHtoDAsync + #undef cuMemcpyDtoHAsync + #undef cuMemcpyDtoDAsync + #undef cuMemcpy2DAsync + #undef cuMemcpy3DAsync + #undef cuMemsetD8 + #undef cuMemsetD16 + #undef cuMemsetD32 + #undef cuMemsetD2D8 + #undef cuMemsetD2D16 + #undef cuMemsetD2D32 + #undef cuArrayCreate + #undef cuArrayGetDescriptor + #undef cuArray3DCreate + #undef cuArray3DGetDescriptor + #undef cuTexRefSetAddress + #undef cuTexRefSetAddress2D + #undef cuTexRefGetAddress + #undef cuGraphicsResourceGetMappedPointer + #undef cuCtxDestroy + #undef cuCtxPopCurrent + #undef cuCtxPushCurrent + #undef cuStreamDestroy + #undef cuEventDestroy + #undef cuMemcpy + #undef cuMemcpyAsync + #undef cuMemcpyPeer + #undef 
cuMemcpyPeerAsync + #undef cuMemcpy3DPeer + #undef cuMemcpy3DPeerAsync + #undef cuMemsetD8Async + #undef cuMemsetD16Async + #undef cuMemsetD32Async + #undef cuMemsetD2D8Async + #undef cuMemsetD2D16Async + #undef cuMemsetD2D32Async + #undef cuStreamGetPriority + #undef cuStreamGetFlags + #undef cuStreamGetCtx + #undef cuStreamWaitEvent + #undef cuStreamAddCallback + #undef cuStreamAttachMemAsync + #undef cuStreamQuery + #undef cuStreamSynchronize + #undef cuEventRecord + #undef cuLaunchKernel + #undef cuLaunchHostFunc + #undef cuGraphicsMapResources + #undef cuGraphicsUnmapResources + #undef cuStreamWriteValue32 + #undef cuStreamWaitValue32 + #undef cuStreamWriteValue64 + #undef cuStreamWaitValue64 + #undef cuStreamBatchMemOp + #undef cuMemPrefetchAsync + #undef cuLaunchCooperativeKernel + #undef cuSignalExternalSemaphoresAsync + #undef cuWaitExternalSemaphoresAsync + #undef cuStreamBeginCapture + #undef cuStreamEndCapture + #undef cuStreamIsCapturing + #undef cuStreamGetCaptureInfo + #undef cuGraphLaunch +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) +CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); +#endif /* defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050 +CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); +#endif /* defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050 */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050) +CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); +CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues); +CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option *options, void **optionValues); +#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050) */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) +CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); +#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) */ + +/** + * CUDA API made obselete at API version 3020 + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #define CUdeviceptr CUdeviceptr_v1 + #define CUDA_MEMCPY2D_st CUDA_MEMCPY2D_v1_st + #define CUDA_MEMCPY2D CUDA_MEMCPY2D_v1 + #define CUDA_MEMCPY3D_st CUDA_MEMCPY3D_v1_st + #define CUDA_MEMCPY3D CUDA_MEMCPY3D_v1 + #define CUDA_ARRAY_DESCRIPTOR_st CUDA_ARRAY_DESCRIPTOR_v1_st + #define CUDA_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR_v1 + #define CUDA_ARRAY3D_DESCRIPTOR_st CUDA_ARRAY3D_DESCRIPTOR_v1_st + #define CUDA_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR_v1 +#endif /* CUDA_FORCE_LEGACY32_INTERNAL */ + +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 + +typedef unsigned int CUdeviceptr; + +typedef struct CUDA_MEMCPY2D_st +{ + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ 
+ const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + + unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ + unsigned int Height; /**< Height of 2D memory copy */ +} CUDA_MEMCPY2D; + +typedef struct CUDA_MEMCPY3D_st +{ + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + unsigned int srcZ; /**< Source Z */ + unsigned int srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + unsigned int dstZ; /**< Destination Z */ + unsigned int dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ + unsigned int Height; /**< Height of 3D memory copy */ + unsigned int Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D; + +typedef struct CUDA_ARRAY_DESCRIPTOR_st +{ + unsigned int Width; /**< Width of array */ + unsigned int Height; /**< Height of array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ +} CUDA_ARRAY_DESCRIPTOR; + +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st +{ + unsigned int Width; /**< Width of 3D array */ + unsigned int Height; /**< Height of 3D array */ + unsigned int Depth; /**< Depth of 3D array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ +} CUDA_ARRAY3D_DESCRIPTOR; + +CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name); +CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize); +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); +CUresult 
CUDAAPI cuMemFree(CUdeviceptr dptr); +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr); +CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N); +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N); +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N); +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes); +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch); +CUresult CUDAAPI 
cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 3020 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 4000 +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); +CUresult CUDAAPI cuStreamDestroy(CUstream hStream); +CUresult CUDAAPI cuEventDestroy(CUevent hEvent); +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 4000 */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef CUdeviceptr + #undef CUDA_MEMCPY2D_st + #undef CUDA_MEMCPY2D + #undef CUDA_MEMCPY3D_st + #undef CUDA_MEMCPY3D + #undef CUDA_ARRAY_DESCRIPTOR_st + #undef CUDA_ARRAY_DESCRIPTOR + #undef CUDA_ARRAY3D_DESCRIPTOR_st + #undef CUDA_ARRAY3D_DESCRIPTOR +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#if defined(__CUDA_API_VERSION_INTERNAL) + CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); + CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); + CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); + CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); + CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, 
CUdeviceptr src, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); + CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); + CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); + + CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); + + CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); + CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + CUresult CUDAAPI cuStreamQuery(CUstream hStream); + CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); + CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); + CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); + CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int 
blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); + CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); + CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream); + CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode); + CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); + CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); + CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus, cuuint64_t *id); + CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); +#endif + +#ifdef __cplusplus +} +#endif + +#undef __CUDA_API_VERSION +#undef __CUDA_DEPRECATED + +#endif /* __cuda_cuda_h__ */ diff --git a/icicle/curves/bn254/cuda_runtime.h b/icicle/curves/bn254/cuda_runtime.h new file mode 100644 index 000000000..b909b4dee --- /dev/null +++ b/icicle/curves/bn254/cuda_runtime.h @@ -0,0 +1,2039 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 
227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_RUNTIME_H__) +#define __CUDA_RUNTIME_H__ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__ +#endif + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) +#pragma GCC diagnostic push +#endif +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))) +#pragma GCC diagnostic ignored "-Wunused-function" +#endif +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4820) +#endif +#endif + +#ifdef __QNX__ +#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 7) +typedef unsigned size_t; +#endif +#endif +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "crt/host_config.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "builtin_types.h" +#include "library_types.h" +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC +#include "channel_descriptor.h" +#include "cuda_runtime_api.h" +#include "driver_functions.h" +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ +#include "crt/host_defines.h" +#include "vector_functions.h" + +#if defined(__CUDACC__) + +#if defined(__CUDACC_RTC__) +#include "nvrtc_device_runtime.h" +#include "crt/device_functions.h" +#include "crt/common_functions.h" +#include "cuda_surface_types.h" +#include "cuda_texture_types.h" +#include "device_launch_parameters.h" + +#else /* !__CUDACC_RTC__ */ +#define EXCLUDE_FROM_RTC +#include "crt/common_functions.h" +#include "cuda_surface_types.h" +#include "cuda_texture_types.h" +#include "crt/device_functions.h" +#include "device_launch_parameters.h" + +#if defined(__CUDACC_EXTENDED_LAMBDA__) +#include +#include +struct __device_builtin__ __nv_lambda_preheader_injection { }; +#endif /* defined(__CUDACC_EXTENDED_LAMBDA__) */ + +#undef EXCLUDE_FROM_RTC +#endif /* __CUDACC_RTC__ */ + +#endif /* __CUDACC__ */ + +/** \cond impl_private */ +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif +/** \endcond impl_private */ + +#if defined(__cplusplus) && !defined(__CUDACC_RTC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * \addtogroup CUDART_HIGHLEVEL + * @{ + */ + +/** + *\brief Launches a device function + * + * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. 
Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * If the kernel has N parameters the \p args should point to array of N pointers. + * Each pointer, from args[0] to args[N - 1], point to the region + * of memory from which the actual parameter will be copied. + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies a stream the invocation is associated to. + * + * \param func - Device function symbol + * \param gridDim - Grid dimensions + * \param blockDim - Block dimensions + * \param args - Arguments + * \param sharedMem - Shared memory (defaults to 0) + * \param stream - Stream identifier (defaults to NULL) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed, + * ::cudaErrorInvalidPtx, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaLaunchKernel( + const T *func, + dim3 gridDim, + dim3 blockDim, + void **args, + size_t sharedMem = 0, + cudaStream_t stream = 0 +) +{ + return ::cudaLaunchKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); +} + +/** + *\brief Launches a device function + * + * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * The device on which this kernel is invoked must have a non-zero value for + * the device attribute ::cudaDevAttrCooperativeLaunch. + * + * The total number of blocks launched cannot exceed the maximum number of blocks per + * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * If the kernel has N parameters the \p args should point to array of N pointers. + * Each pointer, from args[0] to args[N - 1], point to the region + * of memory from which the actual parameter will be copied. + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies a stream the invocation is associated to. 
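+ *
+ * As a rough usage sketch only (the kernel \p coopKernel, its argument and the
+ * launch dimensions are placeholders, not part of this header), a launch through
+ * this C++ wrapper could look like:
+ * \code
+ * __global__ void coopKernel(int *data);
+ *
+ * int *d_data;                        // assumed to be allocated with cudaMalloc elsewhere
+ * void *args[] = { &d_data };
+ * dim3 grid(4), block(256);
+ * cudaError_t err = cudaLaunchCooperativeKernel(coopKernel, grid, block, args);
+ * \endcode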
+ * + * \param func - Device function symbol + * \param gridDim - Grid dimensions + * \param blockDim - Block dimensions + * \param args - Arguments + * \param sharedMem - Shared memory (defaults to 0) + * \param stream - Stream identifier (defaults to NULL) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaLaunchCooperativeKernel( + const T *func, + dim3 gridDim, + dim3 blockDim, + void **args, + size_t sharedMem = 0, + cudaStream_t stream = 0 +) +{ + return ::cudaLaunchCooperativeKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); +} + +/** + * \brief \hl Creates an event object with the specified flags + * + * Creates an event object with the specified flags. Valid flags include: + * - ::cudaEventDefault: Default event creation flag. + * - ::cudaEventBlockingSync: Specifies that event should use blocking + * synchronization. A host thread that uses ::cudaEventSynchronize() to wait + * on an event created with this flag will block until the event actually + * completes. + * - ::cudaEventDisableTiming: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::cudaEventBlockingSync flag not specified will provide the best + * performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery(). + * + * \param event - Newly created event + * \param flags - Flags for new event + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorLaunchFailure, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cudaStreamWaitEvent + */ +static __inline__ __host__ cudaError_t cudaEventCreate( + cudaEvent_t *event, + unsigned int flags +) +{ + return ::cudaEventCreateWithFlags(event, flags); +} + +/** + * \brief \hl Allocates page-locked memory on the host + * + * Allocates \p size bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it + * can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to allocate staging areas for data exchange between host + * and device. + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaHostAllocDefault: This flag's value is defined to be 0. 
+ * - ::cudaHostAllocPortable: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space. + * The device pointer to the memory may be obtained by calling + * ::cudaHostGetDevicePointer(). + * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC). + * WC memory can be transferred across the PCI Express bus more quickly on some + * system configurations, but cannot be read efficiently by most CPUs. WC + * memory is a good option for buffers that will be written by the CPU and read + * by the device via mapped pinned memory or host->device transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost + * flag in order for the ::cudaHostAllocMapped flag to have any effect. + * + * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices + * that do not support mapped pinned memory. The failure is deferred to + * ::cudaHostGetDevicePointer() because the memory may be mapped into other + * CUDA contexts via the ::cudaHostAllocPortable flag. + * + * Memory allocated by this function must be freed with ::cudaFreeHost(). + * + * \param ptr - Device pointer to allocated memory + * \param size - Requested allocation size in bytes + * \param flags - Requested properties of allocated memory + * + * \return + * ::cudaSuccess, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaSetDeviceFlags, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc + */ +static __inline__ __host__ cudaError_t cudaMallocHost( + void **ptr, + size_t size, + unsigned int flags +) +{ + return ::cudaHostAlloc(ptr, size, flags); +} + +template +static __inline__ __host__ cudaError_t cudaHostAlloc( + T **ptr, + size_t size, + unsigned int flags +) +{ + return ::cudaHostAlloc((void**)(void*)ptr, size, flags); +} + +template +static __inline__ __host__ cudaError_t cudaHostGetDevicePointer( + T **pDevice, + void *pHost, + unsigned int flags +) +{ + return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags); +} + +/** + * \brief Allocates memory that will be automatically managed by the Unified Memory system + * + * Allocates \p size bytes of managed memory on the device and returns in + * \p *devPtr a pointer to the allocated memory. If the device doesn't support + * allocating managed memory, ::cudaErrorNotSupported is returned. Support + * for managed memory can be queried using the device attribute + * ::cudaDevAttrManagedMemory. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p size + * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer + * is valid on the CPU and on all GPUs in the system that support managed memory. + * All accesses to this pointer must obey the Unified Memory programming model. + * + * \p flags specifies the default stream association for this allocation. + * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The + * default value for \p flags is ::cudaMemAttachGlobal. + * If ::cudaMemAttachGlobal is specified, then this memory is accessible from + * any stream on any device. 
If ::cudaMemAttachHost is specified, then the + * allocation should not be accessed from devices that have a zero value for the + * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to + * ::cudaStreamAttachMemAsync will be required to enable access on such devices. + * + * If the association is later changed via ::cudaStreamAttachMemAsync to + * a single stream, the default association, as specified during ::cudaMallocManaged, + * is restored when that stream is destroyed. For __managed__ variables, the + * default association is always ::cudaMemAttachGlobal. Note that destroying a + * stream is an asynchronous operation, and as a result, the change to default + * association won't happen until all work in the stream has completed. + * + * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree. + * + * Device memory oversubscription is possible for GPUs that have a non-zero value for the + * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on + * such GPUs may be evicted from device memory to host memory at any time by the Unified + * Memory driver in order to make room for other allocations. + * + * In a multi-GPU system where all GPUs have a non-zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this + * API returns and instead may be populated on access. In such systems, managed memory can + * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to + * maintain data locality and prevent excessive page faults to the extent possible. The application + * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application + * can also explicitly migrate memory to a desired processor's memory via + * ::cudaMemPrefetchAsync. + * + * In a multi-GPU system where all of the GPUs have a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support + * with each other, the physical storage for managed memory is created on the GPU which is active + * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced + * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate + * memory among such GPUs. + * + * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and + * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess + * is zero for at least one of those GPUs, the location chosen for physical storage of managed + * memory is system-dependent. + * - On Linux, the location chosen will be device memory as long as the current set of active + * contexts are on devices that either have peer-to-peer support with each other or have a + * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * If there is an active context on a GPU that does not have a non-zero value for that device + * attribute and it does not have peer-to-peer support with the other devices that have active + * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. + * Note that this means that managed memory that is located in device memory is migrated to + * host memory if a new context is created on a GPU that doesn't have a non-zero value for + * the device attribute and does not support peer-to-peer with at least one of the other devices + * that has an active context. 
This in turn implies that context creation may fail if there is + * insufficient host memory to migrate all managed allocations. + * - On Windows, the physical storage is always created in 'zero-copy' or host memory. + * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these + * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to + * restrict CUDA to only use those GPUs that have peer-to-peer support. + * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero + * value to force the driver to always use device memory for physical storage. + * When this environment variable is set to a non-zero value, all devices used in + * that process that support managed memory have to be peer-to-peer compatible + * with each other. The error ::cudaErrorInvalidDevice will be returned if a device + * that supports managed memory is used and it is not peer-to-peer compatible with + * any of the other managed memory supporting devices that were previously used in + * that process, even if ::cudaDeviceReset has been called on those devices. These + * environment variables are described in the CUDA programming guide under the + * "CUDA environment variables" section. + * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. + * + * \param devPtr - Pointer to allocated device memory + * \param size - Requested allocation size in bytes + * \param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal) + * + * \return + * ::cudaSuccess, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, + * ::cudaMalloc3D, ::cudaMalloc3DArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync + */ +template +static __inline__ __host__ cudaError_t cudaMallocManaged( + T **devPtr, + size_t size, + unsigned int flags = cudaMemAttachGlobal +) +{ + return ::cudaMallocManaged((void**)(void*)devPtr, size, flags); +} + +/** + * \brief Attach memory to a stream asynchronously + * + * Enqueues an operation in \p stream to specify stream association of + * \p length bytes of memory starting from \p devPtr. This function is a + * stream-ordered operation, meaning that it is dependent on, and will + * only take effect when, previous work in stream has completed. Any + * previous association is automatically replaced. + * + * \p devPtr must point to an one of the following types of memories: + * - managed memory declared using the __managed__ keyword or allocated with + * ::cudaMallocManaged. + * - a valid host-accessible region of system-allocated pageable memory. This + * type of memory may only be specified if the device associated with the + * stream reports a non-zero value for the device attribute + * ::cudaDevAttrPageableMemoryAccess. + * + * For managed allocations, \p length must be either zero or the entire + * allocation's size. Both indicate that the entire allocation's stream + * association is being changed. Currently, it is not possible to change stream + * association for a portion of a managed allocation. + * + * For pageable allocations, \p length must be non-zero. + * + * The stream association is specified using \p flags which must be + * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle. 
+ * The default value for \p flags is ::cudaMemAttachSingle + * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed + * by any stream on any device. + * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee + * that it won't access the memory on the device from any stream on a device that + * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with + * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess, + * the program makes a guarantee that it will only access the memory on the device + * from \p stream. It is illegal to attach singly to the NULL stream, because the + * NULL stream is a virtual global stream and not a specific stream. An error will + * be returned in this case. + * + * When memory is associated with a single stream, the Unified Memory system will + * allow CPU access to this memory region so long as all operations in \p stream + * have completed, regardless of whether other streams are active. In effect, + * this constrains exclusive ownership of the managed memory region by + * an active GPU to per-stream activity instead of whole-GPU activity. + * + * Accessing memory on the device from streams that are not associated with + * it will produce undefined results. No error checking is performed by the + * Unified Memory system to ensure that kernels launched into other streams + * do not access this region. + * + * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync + * via events, synchronization or other means to ensure legal access to memory + * at all times. Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p stream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cudaMallocManaged. For __managed__ variables, the default + * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. 
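+ *
+ * A minimal sketch (the allocation size and the stream are illustrative
+ * placeholders), attaching a managed allocation to a single stream:
+ * \code
+ * float *data;
+ * cudaStream_t stream;
+ * cudaStreamCreate(&stream);
+ * cudaMallocManaged(&data, 1024 * sizeof(float));
+ * cudaStreamAttachMemAsync(stream, data);   // length 0, flags default to cudaMemAttachSingle
+ * cudaStreamSynchronize(stream);            // after this, the CPU may access data
+ * \endcode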
+ * + * \param stream - Stream in which to enqueue the attach operation + * \param devPtr - Pointer to memory (must be a pointer to managed memory or + * to a valid host-accessible region of system-allocated + * memory) + * \param length - Length of memory (defaults to zero) + * \param flags - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle) + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotReady, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged + */ +template +static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync( + cudaStream_t stream, + T *devPtr, + size_t length = 0, + unsigned int flags = cudaMemAttachSingle +) +{ + return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags); +} + +template +static __inline__ __host__ cudaError_t cudaMalloc( + T **devPtr, + size_t size +) +{ + return ::cudaMalloc((void**)(void*)devPtr, size); +} + +template +static __inline__ __host__ cudaError_t cudaMallocHost( + T **ptr, + size_t size, + unsigned int flags = 0 +) +{ + return cudaMallocHost((void**)(void*)ptr, size, flags); +} + +template +static __inline__ __host__ cudaError_t cudaMallocPitch( + T **devPtr, + size_t *pitch, + size_t width, + size_t height +) +{ + return ::cudaMallocPitch((void**)(void*)devPtr, pitch, width, height); +} + +#if defined(__CUDACC__) + +/** + * \brief \hl Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. + * + * \param symbol - Device symbol reference + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyToSymbol( + const T &symbol, + const void *src, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyHostToDevice +) +{ + return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind); +} + +/** + * \brief \hl Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. 
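+ *
+ * A minimal sketch (the __constant__ symbol \p coeffs and the stream are
+ * illustrative placeholders):
+ * \code
+ * __constant__ float coeffs[16];
+ *
+ * float h_coeffs[16] = { 0 };
+ * cudaStream_t stream;
+ * cudaStreamCreate(&stream);
+ * cudaMemcpyToSymbolAsync(coeffs, h_coeffs, sizeof(h_coeffs), 0,
+ *                         cudaMemcpyHostToDevice, stream);
+ * \endcode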
+ * + * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy + * may overlap with operations in other streams. + * + * \param symbol - Device symbol reference + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyFromSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync( + const T &symbol, + const void *src, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyHostToDevice, + cudaStream_t stream = 0 +) +{ + return ::cudaMemcpyToSymbolAsync((const void*)&symbol, src, count, offset, kind, stream); +} + +/** + * \brief \hl Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. + * + * \param dst - Destination memory address + * \param symbol - Device symbol reference + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyFromSymbol( + void *dst, + const T &symbol, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost +) +{ + return ::cudaMemcpyFromSymbol(dst, (const void*)&symbol, count, offset, kind); +} + +/** + * \brief \hl Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that resides in + * global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. 
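+ *
+ * A minimal sketch in the opposite direction (the __device__ symbol \p results
+ * and the stream are illustrative placeholders):
+ * \code
+ * __device__ float results[16];
+ *
+ * float h_results[16];
+ * cudaStream_t stream;
+ * cudaStreamCreate(&stream);
+ * cudaMemcpyFromSymbolAsync(h_results, results, sizeof(h_results), 0,
+ *                           cudaMemcpyDeviceToHost, stream);
+ * cudaStreamSynchronize(stream);
+ * \endcode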
+ * + * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally be + * associated to a stream by passing a non-zero \p stream argument. If \p kind + * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap + * with operations in other streams. + * + * \param dst - Destination memory address + * \param symbol - Device symbol reference + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync( + void *dst, + const T &symbol, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost, + cudaStream_t stream = 0 +) +{ + return ::cudaMemcpyFromSymbolAsync(dst, (const void*)&symbol, count, offset, kind, stream); +} + +/** + * \brief \hl Finds the address associated with a CUDA symbol + * + * Returns in \p *devPtr the address of symbol \p symbol on the device. + * \p symbol can either be a variable that resides in global or constant memory space. + * If \p symbol cannot be found, or if \p symbol is not declared + * in the global or constant memory space, \p *devPtr is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. + * + * \param devPtr - Return device pointer associated with symbol + * \param symbol - Device symbol reference + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)", + * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaGetSymbolAddress( + void **devPtr, + const T &symbol +) +{ + return ::cudaGetSymbolAddress(devPtr, (const void*)&symbol); +} + +/** + * \brief \hl Finds the size of the object associated with a CUDA symbol + * + * Returns in \p *size the size of symbol \p symbol. \p symbol must be a + * variable that resides in global or constant memory space. + * If \p symbol cannot be found, or if \p symbol is not declared + * in global or constant memory space, \p *size is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. 
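+ *
+ * A minimal sketch (the __device__ symbol \p counter is an illustrative
+ * placeholder) combining this call with ::cudaGetSymbolAddress:
+ * \code
+ * __device__ int counter;
+ *
+ * void  *d_ptr = NULL;
+ * size_t sz    = 0;
+ * cudaGetSymbolAddress(&d_ptr, counter);
+ * cudaGetSymbolSize(&sz, counter);   // sz becomes sizeof(int)
+ * \endcode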
+ * + * \param size - Size of object associated with symbol + * \param symbol - Device symbol reference + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)", + * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaGetSymbolSize( + size_t *size, + const T &symbol +) +{ + return ::cudaGetSymbolSize(size, (const void*)&symbol); +} + +/** + * \brief \hl Binds a memory area to a texture + * + * Binds \p size bytes of the memory area pointed to by \p devPtr to texture + * reference \p tex. \p desc describes how the memory is interpreted when + * fetching values from the texture. The \p offset parameter is an optional + * byte offset as with the low-level + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()" + * function. Any memory previously bound to \p tex is unbound. + * + * \param offset - Offset in bytes + * \param tex - Texture to bind + * \param devPtr - Memory area on device + * \param desc - Channel format + * \param size - Size of the memory area pointed to by devPtr + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture( + size_t *offset, + const struct texture &tex, + const void *devPtr, + const struct cudaChannelFormatDesc &desc, + size_t size = UINT_MAX +) +{ + return ::cudaBindTexture(offset, &tex, devPtr, &desc, size); +} + +/** + * \brief \hl Binds a memory area to a texture + * + * Binds \p size bytes of the memory area pointed to by \p devPtr to texture + * reference \p tex. The channel descriptor is inherited from the texture + * reference type. The \p offset parameter is an optional byte offset as with + * the low-level + * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) + * function. Any memory previously bound to \p tex is unbound. 
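+ *
+ * A minimal sketch (the texture reference \p texRef and the buffer size are
+ * illustrative placeholders; texture references are a legacy API):
+ * \code
+ * texture<float, 1, cudaReadModeElementType> texRef;
+ *
+ * float *d_buf;
+ * size_t offset = 0;
+ * cudaMalloc(&d_buf, 256 * sizeof(float));
+ * cudaBindTexture(&offset, texRef, d_buf, 256 * sizeof(float));
+ * \endcode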
+ * + * \param offset - Offset in bytes + * \param tex - Texture to bind + * \param devPtr - Memory area on device + * \param size - Size of the memory area pointed to by devPtr + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture( + size_t *offset, + const struct texture &tex, + const void *devPtr, + size_t size = UINT_MAX +) +{ + return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size); +} + +/** + * \brief \hl Binds a 2D memory area to a texture + * + * Binds the 2D memory area pointed to by \p devPtr to the + * texture reference \p tex. The size of the area is constrained by + * \p width in texel units, \p height in texel units, and \p pitch in byte + * units. \p desc describes how the memory is interpreted when fetching values + * from the texture. Any memory previously bound to \p tex is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D()" + * returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex2D() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
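+ *
+ * A minimal sketch (the texture reference \p texRef2D and the image dimensions
+ * are illustrative placeholders):
+ * \code
+ * texture<float, 2, cudaReadModeElementType> texRef2D;
+ *
+ * size_t width = 64, height = 64, pitch = 0, offset = 0;
+ * float *d_img;
+ * cudaMallocPitch(&d_img, &pitch, width * sizeof(float), height);
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ * cudaBindTexture2D(&offset, texRef2D, d_img, desc, width, height, pitch);
+ * \endcode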
+ * + * \param offset - Offset in bytes + * \param tex - Texture reference to bind + * \param devPtr - 2D memory area on device + * \param desc - Channel format + * \param width - Width in texel units + * \param height - Height in texel units + * \param pitch - Pitch in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture2D( + size_t *offset, + const struct texture &tex, + const void *devPtr, + const struct cudaChannelFormatDesc &desc, + size_t width, + size_t height, + size_t pitch +) +{ + return ::cudaBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch); +} + +/** + * \brief \hl Binds a 2D memory area to a texture + * + * Binds the 2D memory area pointed to by \p devPtr to the + * texture reference \p tex. The size of the area is constrained by + * \p width in texel units, \p height in texel units, and \p pitch in byte + * units. The channel descriptor is inherited from the texture reference + * type. Any memory previously bound to \p tex is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D()" + * returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex2D() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
+ * + * \param offset - Offset in bytes + * \param tex - Texture reference to bind + * \param devPtr - 2D memory area on device + * \param width - Width in texel units + * \param height - Height in texel units + * \param pitch - Pitch in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture2D( + size_t *offset, + const struct texture &tex, + const void *devPtr, + size_t width, + size_t height, + size_t pitch +) +{ + return ::cudaBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch); +} + +/** + * \brief \hl Binds an array to a texture + * + * Binds the CUDA array \p array to the texture reference \p tex. + * \p desc describes how the memory is interpreted when fetching values from + * the texture. Any CUDA array previously bound to \p tex is unbound. 
+ * + * \param tex - Texture to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToArray( + const struct texture &tex, + cudaArray_const_t array, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindTextureToArray(&tex, array, &desc); +} + +/** + * \brief \hl Binds an array to a texture + * + * Binds the CUDA array \p array to the texture reference \p tex. + * The channel descriptor is inherited from the CUDA array. Any CUDA array + * previously bound to \p tex is unbound. 
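+ *
+ * A minimal sketch (the texture reference \p texRef and the array dimensions
+ * are illustrative placeholders):
+ * \code
+ * texture<float, 2, cudaReadModeElementType> texRef;
+ *
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ * cudaArray_t arr;
+ * cudaMallocArray(&arr, &desc, 64, 64);
+ * cudaBindTextureToArray(texRef, arr);   // channel descriptor taken from arr
+ * \endcode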
+ * + * \param tex - Texture to bind + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToArray( + const struct texture &tex, + cudaArray_const_t array +) +{ + struct cudaChannelFormatDesc desc; + cudaError_t err = ::cudaGetChannelDesc(&desc, array); + + return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : err; +} + +/** + * \brief \hl Binds a mipmapped array to a texture + * + * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. + * \p desc describes how the memory is interpreted when fetching values from + * the texture. Any CUDA mipmapped array previously bound to \p tex is unbound. 
+ * + * \param tex - Texture to bind + * \param mipmappedArray - Memory mipmapped array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( + const struct texture &tex, + cudaMipmappedArray_const_t mipmappedArray, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindTextureToMipmappedArray(&tex, mipmappedArray, &desc); +} + +/** + * \brief \hl Binds a mipmapped array to a texture + * + * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. + * The channel descriptor is inherited from the CUDA array. Any CUDA mipmapped array + * previously bound to \p tex is unbound. 
+ * + * \param tex - Texture to bind + * \param mipmappedArray - Memory mipmapped array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( + const struct texture &tex, + cudaMipmappedArray_const_t mipmappedArray +) +{ + struct cudaChannelFormatDesc desc; + cudaArray_t levelArray; + cudaError_t err = ::cudaGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0); + + if (err != cudaSuccess) { + return err; + } + err = ::cudaGetChannelDesc(&desc, levelArray); + + return err == cudaSuccess ? cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc) : err; +} + +/** + * \brief \hl Unbinds a texture + * + * Unbinds the texture bound to \p tex. If \p texref is not currently bound, no operation is performed. 
+ * + * \param tex - Texture to unbind + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaUnbindTexture( + const struct texture &tex +) +{ + return ::cudaUnbindTexture(&tex); +} + +/** + * \brief \hl Get the alignment offset of a texture + * + * Returns in \p *offset the offset that was returned when texture reference + * \p tex was bound. + * + * \param offset - Offset of texture reference in bytes + * \param tex - Texture to get offset of + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture, + * ::cudaErrorInvalidTextureBinding + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset( + size_t *offset, + const struct texture &tex +) +{ + return ::cudaGetTextureAlignmentOffset(offset, &tex); +} + +/** + * \brief \hl Sets the preferred cache configuration for a device function + * + * On devices where the L1 cache and shared memory use the 
same hardware + * resources, this sets through \p cacheConfig the preferred cache configuration + * for the function specified via \p func. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute \p func. + * + * \p func must be a pointer to a function that executes on the device. + * The parameter specified by \p func must be declared as a \p __global__ + * function. If the specified function does not exist, + * then ::cudaErrorInvalidDeviceFunction is returned. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * + * \param func - device function pointer + * \param cacheConfig - Requested cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * ::cudaThreadGetCacheConfig, + * ::cudaThreadSetCacheConfig + */ +template +static __inline__ __host__ cudaError_t cudaFuncSetCacheConfig( + T *func, + enum cudaFuncCache cacheConfig +) +{ + return ::cudaFuncSetCacheConfig((const void*)func, cacheConfig); +} + +template +static __inline__ __host__ cudaError_t cudaFuncSetSharedMemConfig( + T *func, + enum cudaSharedMemConfig config +) +{ + return ::cudaFuncSetSharedMemConfig((const void*)func, config); +} + +/** + * \brief Returns occupancy for a device function + * + * Returns in \p *numBlocks the maximum number of active blocks per + * streaming multiprocessor for the device function. 
+ * + * \param numBlocks - Returned occupancy + * \param func - Kernel function for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + */ +template +static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor( + int *numBlocks, + T func, + int blockSize, + size_t dynamicSMemSize) +{ + return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault); +} + +/** + * \brief Returns occupancy for a device function with the specified flags + * + * Returns in \p *numBlocks the maximum number of active blocks per + * streaming multiprocessor for the device function. + * + * The \p flags parameter controls how special cases are handled. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * + * - ::cudaOccupancyDisableCachingOverride: suppresses the default behavior + * on platform where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. + * Setting this flag makes the occupancy calculator to return 0 in such cases. + * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. 
+ * + * \param numBlocks - Returned occupancy + * \param func - Kernel function for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + */ +template +static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int *numBlocks, + T func, + int blockSize, + size_t dynamicSMemSize, + unsigned int flags) +{ + return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, flags); +} + +/** + * Helper functor for cudaOccupancyMaxPotentialBlockSize + */ +class __cudaOccupancyB2DHelper { + size_t n; +public: + inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {} + inline __host__ CUDART_DEVICE size_t operator()(int) + { + return n; + } +}; + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * The \p flags parameter controls how special cases are handled. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * + * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior + * on platform where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. + * Setting this flag makes the occupancy calculator to return 0 in such cases. + * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. 
+ * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + */ + +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags( + int *minGridSize, + int *blockSize, + T func, + UnaryFunction blockSizeToDynamicSMemSize, + int blockSizeLimit = 0, + unsigned int flags = 0) +{ + cudaError_t status; + + // Device and function properties + int device; + struct cudaFuncAttributes attr; + + // Limits + int maxThreadsPerMultiProcessor; + int warpSize; + int devMaxThreadsPerBlock; + int multiProcessorCount; + int funcMaxThreadsPerBlock; + int occupancyLimit; + int granularity; + + // Recorded maximum + int maxBlockSize = 0; + int numBlocks = 0; + int maxOccupancy = 0; + + // Temporary + int blockSizeToTryAligned; + int blockSizeToTry; + int blockSizeLimitAligned; + int occupancyInBlocks; + int occupancyInThreads; + size_t dynamicSMemSize; + + /////////////////////////// + // Check user input + /////////////////////////// + + if (!minGridSize || !blockSize || !func) { + return cudaErrorInvalidValue; + } + + ////////////////////////////////////////////// + // Obtain device and function properties + ////////////////////////////////////////////// + + status = ::cudaGetDevice(&device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &maxThreadsPerMultiProcessor, + cudaDevAttrMaxThreadsPerMultiProcessor, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &warpSize, + cudaDevAttrWarpSize, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &devMaxThreadsPerBlock, + cudaDevAttrMaxThreadsPerBlock, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &multiProcessorCount, + cudaDevAttrMultiProcessorCount, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaFuncGetAttributes(&attr, func); + if (status != cudaSuccess) { + return status; + } + + funcMaxThreadsPerBlock = attr.maxThreadsPerBlock; + + ///////////////////////////////////////////////////////////////////////////////// + // Try each block size, and pick the block size with maximum occupancy + ///////////////////////////////////////////////////////////////////////////////// + + occupancyLimit = maxThreadsPerMultiProcessor; + granularity = warpSize; + + if (blockSizeLimit == 0) { + blockSizeLimit = devMaxThreadsPerBlock; + } + + if (devMaxThreadsPerBlock < blockSizeLimit) { + blockSizeLimit = devMaxThreadsPerBlock; + } + + if (funcMaxThreadsPerBlock < blockSizeLimit) { + blockSizeLimit = funcMaxThreadsPerBlock; + } + + blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity; + + for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { + // This is needed for the first iteration, because + // blockSizeLimitAligned could be greater than blockSizeLimit + // + if (blockSizeLimit < 
blockSizeToTryAligned) { + blockSizeToTry = blockSizeLimit; + } else { + blockSizeToTry = blockSizeToTryAligned; + } + + dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry); + + status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &occupancyInBlocks, + func, + blockSizeToTry, + dynamicSMemSize, + flags); + + if (status != cudaSuccess) { + return status; + } + + occupancyInThreads = blockSizeToTry * occupancyInBlocks; + + if (occupancyInThreads > maxOccupancy) { + maxBlockSize = blockSizeToTry; + numBlocks = occupancyInBlocks; + maxOccupancy = occupancyInThreads; + } + + // Early out if we have reached the maximum + // + if (occupancyLimit == maxOccupancy) { + break; + } + } + + /////////////////////////// + // Return best available + /////////////////////////// + + // Suggested min grid size to achieve a full machine launch + // + *minGridSize = numBlocks * multiProcessorCount; + *blockSize = maxBlockSize; + + return status; +} + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + */ + +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem( + int *minGridSize, + int *blockSize, + T func, + UnaryFunction blockSizeToDynamicSMemSize, + int blockSizeLimit = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, blockSizeLimit, cudaOccupancyDefault); +} + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the + * amount of per-block dynamic shared memory changes with different + * block sizes. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. 
+ * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + */ +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize( + int *minGridSize, + int *blockSize, + T func, + size_t dynamicSMemSize = 0, + int blockSizeLimit = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault); +} + +/** + * \brief Returns grid and block size that achieved maximum potential occupancy for a device function with the specified flags + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * The \p flags parameter controls how special cases are handle. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxPotentialBlockSize + * + * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior + * on platform where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. + * Setting this flag makes the occupancy calculator to return 0 in such cases. + * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the + * amount of per-block dynamic shared memory changes with different + * block sizes. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. 
+ * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + */ +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeWithFlags( + int *minGridSize, + int *blockSize, + T func, + size_t dynamicSMemSize = 0, + int blockSizeLimit = 0, + unsigned int flags = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, flags); +} + +/** + * \brief \hl Find out attributes for a given function + * + * This function obtains the attributes of a function specified via \p entry. + * The parameter \p entry must be a pointer to a function that executes + * on the device. The parameter specified by \p entry must be declared as a \p __global__ + * function. The fetched attributes are placed in \p attr. If the specified + * function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * + * Note that some function attributes such as + * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock" + * may vary based on the device that is currently being used. + * + * \param attr - Return pointer to function's attributes + * \param entry - Function to get attributes of + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost + */ +template +static __inline__ __host__ cudaError_t cudaFuncGetAttributes( + struct cudaFuncAttributes *attr, + T *entry +) +{ + return ::cudaFuncGetAttributes(attr, (const void*)entry); +} + +/** + * \brief \hl Set attributes for a given function + * + * This function sets the attributes of a function specified via \p entry. + * The parameter \p entry must be a pointer to a function that executes + * on the device. The parameter specified by \p entry must be declared as a \p __global__ + * function. The enumeration defined by \p attr is set to the value defined by \p value. + * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * If the specified attribute cannot be written, or if the value is incorrect, + * then ::cudaErrorInvalidValue is returned. + * + * Valid values for \p attr are: + * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes + * cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture. 
+ * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * + * \param entry - Function to get attributes of + * \param attr - Attribute to set + * \param value - Value to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost + */ +template +static __inline__ __host__ cudaError_t cudaFuncSetAttribute( + T *entry, + enum cudaFuncAttribute attr, + int value +) +{ + return ::cudaFuncSetAttribute((const void*)entry, attr, value); +} + +/** + * \brief \hl Binds an array to a surface + * + * Binds the CUDA array \p array to the surface reference \p surf. + * \p desc describes how the memory is interpreted when dealing with + * the surface. Any CUDA array previously bound to \p surf is unbound. + * + * \param surf - Surface to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSurface + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", + * \ref ::cudaBindSurfaceToArray(const struct surface&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray( + const struct surface &surf, + cudaArray_const_t array, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindSurfaceToArray(&surf, array, &desc); +} + +/** + * \brief \hl Binds an array to a surface + * + * Binds the CUDA array \p array to the surface reference \p surf. + * The channel descriptor is inherited from the CUDA array. Any CUDA array + * previously bound to \p surf is unbound. + * + * \param surf - Surface to bind + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSurface + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", + * \ref ::cudaBindSurfaceToArray(const struct surface&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)" + */ +template +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray( + const struct surface &surf, + cudaArray_const_t array +) +{ + struct cudaChannelFormatDesc desc; + cudaError_t err = ::cudaGetChannelDesc(&desc, array); + + return err == cudaSuccess ? 
cudaBindSurfaceToArray(surf, array, desc) : err; +} + +#endif /* __CUDACC__ */ + +/** @} */ /* END CUDART_HIGHLEVEL */ + +#endif /* __cplusplus && !__CUDACC_RTC__ */ + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) +#pragma GCC diagnostic pop +#endif +#elif defined(_MSC_VER) +#pragma warning(pop) +#endif +#endif + +#undef __CUDA_DEPRECATED + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__ +#endif + +#endif /* !__CUDA_RUNTIME_H__ */ diff --git a/icicle/curves/bn254/curve_config.cuh b/icicle/curves/bn254/curve_config.cuh index f1e26172a..adc8729f4 100644 --- a/icicle/curves/bn254/curve_config.cuh +++ b/icicle/curves/bn254/curve_config.cuh @@ -2,6 +2,9 @@ #include "../../primitives/field.cuh" #include "../../primitives/projective.cuh" +#if defined(G2_DEFINED) +#include "../../primitives/extension_field.cuh" +#endif #include "params.cuh" diff --git a/icicle/curves/bn254/lde.cu b/icicle/curves/bn254/lde.cu index 302be7139..da76e69b6 100644 --- a/icicle/curves/bn254/lde.cu +++ b/icicle/curves/bn254/lde.cu @@ -5,6 +5,9 @@ #include "../../appUtils/ntt/ntt.cuh" #include "../../appUtils/vector_manipulation/ve_mod_mult.cuh" #include "curve_config.cuh" +#include "../../utils/mont.cuh" + + extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) { @@ -24,7 +27,7 @@ extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32 } } -extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -39,7 +42,7 @@ extern "C" int ntt_cuda_bn254(BN254::scalar_t *arr, uint32_t n, bool inverse, si } } -extern "C" int ecntt_cuda_bn254(BN254::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ecntt_cuda_bn254(BN254::projective_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { @@ -85,7 +88,8 @@ extern "C" int interpolate_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::sca { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BN254::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -99,8 +103,37 @@ extern "C" int interpolate_scalars_batch_cuda_bn254(BN254::scalar_t* d_out, BN25 { try { + BN254::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int interpolate_scalars_on_coset_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t *d_evaluations, BN254::scalar_t *d_domain, unsigned n, BN254::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0) +{ + try + { + return interpolate(d_out, d_evaluations, d_domain, n, true, coset_powers, stream); + } + catch (const std::runtime_error &ex) + 
{ + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int interpolate_scalars_batch_on_coset_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_evaluations, BN254::scalar_t* d_domain, unsigned n, + unsigned batch_size, BN254::scalar_t* coset_powers, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -113,7 +146,8 @@ extern "C" int interpolate_points_cuda_bn254(BN254::projective_t* d_out, BN254:: { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + BN254::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -127,8 +161,9 @@ extern "C" int interpolate_points_batch_cuda_bn254(BN254::projective_t* d_out, B { try { + BN254::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -266,8 +301,10 @@ extern "C" int ntt_inplace_batch_cuda_bn254(BN254::scalar_t* d_inout, BN254::sca { try { + cudaStreamCreate(&stream); - ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, stream, true); + BN254::scalar_t* _null = nullptr; + ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, false, _null, stream, true); return CUDA_SUCCESS; //TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/ } catch (const std::runtime_error &ex) @@ -277,6 +314,192 @@ extern "C" int ntt_inplace_batch_cuda_bn254(BN254::scalar_t* d_inout, BN254::sca } } +extern "C" int ntt_inplace_coset_batch_cuda_bn254(BN254::scalar_t* d_inout, BN254::scalar_t* d_twiddles, + unsigned n, unsigned batch_size, bool inverse, bool is_coset, BN254::scalar_t* coset, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + ntt_inplace_batch_template(d_inout, d_twiddles, n, batch_size, inverse, is_coset, coset, stream, true); + return CUDA_SUCCESS; //TODO: we should implement this https://leimao.github.io/blog/Proper-CUDA-Error-Checking/ + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int sub_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_in1, BN254::scalar_t* d_in2, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return sub_polys(d_out, d_in1, d_in2, n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int add_scalars_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_in1, BN254::scalar_t* d_in2, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return add_polys(d_out, d_in1, d_in2, n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int to_montgomery_scalars_cuda_bn254(BN254::scalar_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return to_montgomery(d_inout, n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int from_montgomery_scalars_cuda_bn254(BN254::scalar_t* d_inout, unsigned n, cudaStream_t stream = 
0) +{ + try + { + cudaStreamCreate(&stream); + return from_montgomery(d_inout, n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int to_montgomery_proj_points_cuda_bn254(BN254::projective_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return to_montgomery((BN254::point_field_t*)d_inout, 3 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int from_montgomery_proj_points_cuda_bn254(BN254::projective_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return from_montgomery((BN254::point_field_t*)d_inout, 3 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int to_montgomery_aff_points_cuda_bn254(BN254::affine_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return to_montgomery((BN254::point_field_t*)d_inout, 2 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int from_montgomery_aff_points_cuda_bn254(BN254::affine_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return from_montgomery((BN254::point_field_t*)d_inout, 2 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +#if defined(G2_DEFINED) +extern "C" int to_montgomery_proj_points_g2_cuda_bn254(BN254::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return to_montgomery((BN254::point_field_t*)d_inout, 6 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int from_montgomery_proj_points_g2_cuda_bn254(BN254::g2_projective_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return from_montgomery((BN254::point_field_t*)d_inout, 6 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int to_montgomery_aff_points_g2_cuda_bn254(BN254::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return to_montgomery((BN254::point_field_t*)d_inout, 4 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int from_montgomery_aff_points_g2_cuda_bn254(BN254::g2_affine_t* d_inout, unsigned n, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return from_montgomery((BN254::point_field_t*)d_inout, 4 * n, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} +#endif + extern "C" int reverse_order_scalars_cuda_bn254(BN254::scalar_t* arr, int n, size_t device_id = 0, cudaStream_t stream = 0) { try @@ -284,6 +507,7 @@ extern "C" int reverse_order_scalars_cuda_bn254(BN254::scalar_t* arr, int n, siz uint32_t logn = uint32_t(log(n) / log(2)); cudaStreamCreate(&stream); reverse_order(arr, n, logn, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/bn254/msm.cu b/icicle/curves/bn254/msm.cu index 8de1c4bb3..3133b7b7d 100644 --- a/icicle/curves/bn254/msm.cu +++ b/icicle/curves/bn254/msm.cu @@ -11,8 +11,10 @@ int 
msm_cuda_bn254(BN254::projective_t *out, BN254::affine_t points[], BN254::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0) { try - { - large_msm(scalars, points, count, out, false, stream); + { + cudaStreamCreate(&stream); + large_msm(scalars, points, count, out, false, false, stream); + cudaStreamSynchronize(stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -25,18 +27,18 @@ int msm_cuda_bn254(BN254::projective_t *out, BN254::affine_t points[], extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t points[], BN254::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0) { - try - { - cudaStreamCreate(&stream); - batched_large_msm(scalars, points, batch_size, msm_size, out, false, stream); - cudaStreamSynchronize(stream); - return CUDA_SUCCESS; - } - catch (const std::runtime_error &ex) - { - printf("error %s", ex.what()); - return -1; - } + try + { + cudaStreamCreate(&stream); + batched_large_msm(scalars, points, batch_size, msm_size, out, false, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } } /** @@ -47,46 +49,138 @@ extern "C" int msm_batch_cuda_bn254(BN254::projective_t* out, BN254::affine_t po * @param d_points Points for the MSM. Must be on device. * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length). */ - extern "C" - int commit_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0) - { - try - { - large_msm(d_scalars, d_points, count, d_out, true, stream); - cudaStreamSynchronize(stream); - return 0; - } - catch (const std::runtime_error &ex) - { - printf("error %s", ex.what()); - return -1; - } - } +extern "C" +int commit_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + large_msm(d_scalars, d_points, count, d_out, true, false, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +/** + * Commit to a batch of polynomials using the MSM. + * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points. + * @param d_out Ouptut point to write the results to. + * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device. + * @param d_points Points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM. + * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`. + * @param batch_size Size of the batch. 
+ */ +extern "C" +int commit_batch_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +#if defined(G2_DEFINED) +extern "C" +int msm_g2_cuda_bn254(BN254::g2_projective_t *out, BN254::g2_affine_t points[], + BN254::scalar_t scalars[], size_t count, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + large_msm(scalars, points, count, out, false, false, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int msm_batch_g2_cuda_bn254(BN254::g2_projective_t* out, BN254::g2_affine_t points[], + BN254::scalar_t scalars[], size_t batch_size, size_t msm_size, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + batched_large_msm(scalars, points, batch_size, msm_size, out, false, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +/** + * Commit to a polynomial using the MSM in G2 group. + * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points. + * @param d_out Ouptut G2 point to write the result to. + * @param d_scalars Scalars for the MSM. Must be on device. + * @param d_points G2 affine points for the MSM. Must be on device. + * @param count Length of `d_scalars` and `d_points` arrays (they should have equal length). + */ +extern "C" +int commit_g2_cuda_bn254(BN254::g2_projective_t* d_out, BN254::scalar_t* d_scalars, BN254::g2_affine_t* d_points, size_t count, size_t device_id = 0, cudaStream_t stream = 0) +{ + // TODO: use device_id when working with multiple devices + (void)device_id; + try + { + cudaStreamCreate(&stream); + large_msm(d_scalars, d_points, count, d_out, true, false, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} /** * Commit to a batch of polynomials using the MSM. * Note: this function just calls the MSM, it doesn't convert between evaluation and coefficient form of scalars or points. - * @param d_out Ouptut point to write the results to. + * @param d_out Ouptut G2 point to write the results to. * @param d_scalars Scalars for the MSMs of all polynomials. Must be on device. - * @param d_points Points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM. + * @param d_points G2 affine points for the MSMs. Must be on device. It is assumed that this set of bases is used for each MSM. * @param count Length of `d_points` array, `d_scalar` has length `count` * `batch_size`. * @param batch_size Size of the batch. 
*/ - extern "C" - int commit_batch_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars, BN254::affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0) - { - try - { +extern "C" +int commit_batch_g2_cuda_bn254(BN254::g2_projective_t* d_out, BN254::scalar_t* d_scalars, BN254::g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id = 0, cudaStream_t stream = 0) +{ + // TODO: use device_id when working with multiple devices + (void)device_id; + try + { cudaStreamCreate(&stream); - batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream); - cudaStreamSynchronize(stream); - return 0; - } - catch (const std::runtime_error &ex) - { - printf("error %s", ex.what()); - return -1; - } - } - - #endif + batched_large_msm(d_scalars, d_points, batch_size, count, d_out, true, stream); + cudaStreamSynchronize(stream); + return CUDA_SUCCESS; + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} +#endif +#endif diff --git a/icicle/curves/bn254/msm.h b/icicle/curves/bn254/msm.h new file mode 100644 index 000000000..a525ca583 --- /dev/null +++ b/icicle/curves/bn254/msm.h @@ -0,0 +1,62 @@ + +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
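A minimal host-side sketch (not part of this patch) of how the commit entry point added in msm.cu above, and declared in this header below, might be driven from a .cu caller. It assumes d_scalars and d_points already hold the MSM inputs in device memory, and it redeclares the extern "C" symbol without its default arguments; the helper name commit_example is illustrative only.

#include <cuda_runtime.h>
#include "curve_config.cuh"

extern "C" int commit_cuda_bn254(BN254::projective_t* d_out, BN254::scalar_t* d_scalars,
                                 BN254::affine_t* d_points, size_t count,
                                 size_t device_id, cudaStream_t stream);

// Commit to one polynomial of `count` coefficients and copy the resulting point back to the host.
int commit_example(BN254::scalar_t* d_scalars, BN254::affine_t* d_points,
                   size_t count, BN254::projective_t* h_result)
{
  BN254::projective_t* d_out;
  cudaMalloc(&d_out, sizeof(BN254::projective_t));
  int ret = commit_cuda_bn254(d_out, d_scalars, d_points, count, /*device_id=*/0, /*stream=*/0);
  cudaMemcpy(h_result, d_out, sizeof(BN254::projective_t), cudaMemcpyDeviceToHost);
  cudaFree(d_out);
  return ret;
}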
+ +// Code generated by Ingonyama DO NOT EDIT + + +#include +#include +#include +// msm.h + +#ifndef _BN254_MSM_H +#define _BN254_MSM_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BN254 projective and affine structs +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_g2_projective_t BN254_g2_projective_t; +typedef struct BN254_affine_t BN254_affine_t; +typedef struct BN254_g2_affine_t BN254_g2_affine_t; +typedef struct BN254_scalar_t BN254_scalar_t; +typedef cudaStream_t CudaStream_t; + +int msm_cuda_bn254(BN254_projective_t* out, BN254_affine_t* points, + BN254_scalar_t* scalars, size_t count, size_t device_id); + +int msm_batch_cuda_bn254(BN254_projective_t* out, BN254_affine_t* points, + BN254_scalar_t* scalars, size_t batch_size, + size_t msm_size, size_t device_id); + +int commit_cuda_bn254(BN254_projective_t* d_out, BN254_scalar_t* d_scalars, + BN254_affine_t* d_points, size_t count, size_t device_id); + +int commit_batch_cuda_bn254(BN254_projective_t* d_out, BN254_scalar_t* d_scalars, + BN254_affine_t* d_points, size_t count, + size_t batch_size, size_t device_id); + +int msm_g2_cuda_bn254(BN254_g2_projective_t *out, BN254_g2_affine_t* points, BN254_scalar_t* scalars, size_t count, size_t device_id); +int msm_batch_g2_cuda_bn254(BN254_g2_projective_t* out, BN254_g2_affine_t* points, BN254_scalar_t* scalars, size_t batch_size, size_t msm_size, size_t device_id); +int commit_g2_cuda_bn254(BN254_g2_projective_t* d_out, BN254_scalar_t* d_scalars, BN254_g2_affine_t* d_points, size_t count, size_t device_id); +int commit_batch_g2_cuda_bn254(BN254_g2_projective_t* d_out, BN254_scalar_t* d_scalars, BN254_g2_affine_t* d_points, size_t count, size_t batch_size, size_t device_id, cudaStream_t stream); + +#ifdef __cplusplus +} +#endif + +#endif /* _BN254_MSM_H */ diff --git a/icicle/curves/bn254/ntt.h b/icicle/curves/bn254/ntt.h new file mode 100644 index 000000000..1841fb814 --- /dev/null +++ b/icicle/curves/bn254/ntt.h @@ -0,0 +1,68 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
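A short sketch (not part of this patch) of the coset-interpolation path that lde.cu adds above and this header declares below. It assumes d_evals and d_coset_powers are device buffers of length n, and that the domain is built with inverse twiddles, as in the existing non-coset interpolate wrappers; the helper name interpolate_on_coset_example is illustrative only.

#include <cuda_runtime.h>
#include <math.h>
#include "curve_config.cuh"

extern "C" BN254::scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn,
                                                    bool inverse, size_t device_id, cudaStream_t stream);
extern "C" int interpolate_scalars_on_coset_cuda_bn254(BN254::scalar_t* d_out, BN254::scalar_t* d_evaluations,
                                                       BN254::scalar_t* d_domain, unsigned n,
                                                       BN254::scalar_t* coset_powers,
                                                       unsigned device_id, cudaStream_t stream);

// Recover coefficients from `n` evaluations taken over a coset and write them to d_coeffs.
int interpolate_on_coset_example(BN254::scalar_t* d_evals, BN254::scalar_t* d_coset_powers,
                                 unsigned n, BN254::scalar_t* d_coeffs, cudaStream_t stream)
{
  uint32_t logn = (uint32_t)log2((double)n);
  // Inverse twiddle factors for the interpolation direction.
  BN254::scalar_t* d_domain = build_domain_cuda_bn254(n, logn, /*inverse=*/true, /*device_id=*/0, stream);
  return interpolate_scalars_on_coset_cuda_bn254(d_coeffs, d_evals, d_domain, n,
                                                 d_coset_powers, /*device_id=*/0, stream);
}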
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ntt.h + +#ifndef _BN254_NTT_H +#define _BN254_NTT_H + +#ifdef __cplusplus +extern "C" { +#endif + +// Incomplete declaration of BN254 projective and affine structs +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_affine_t BN254_affine_t; +typedef struct BN254_scalar_t BN254_scalar_t; + +int ntt_cuda_bn254(BN254_scalar_t *arr, uint32_t n, bool inverse, size_t decimation, size_t device_id); +int ntt_batch_cuda_bn254(BN254_scalar_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + +int ecntt_cuda_bn254(BN254_projective_t *arr, uint32_t n, bool inverse, size_t device_id); +int ecntt_batch_cuda_bn254(BN254_projective_t *arr, uint32_t arr_size, uint32_t batch_size, bool inverse, size_t device_id); + + +BN254_scalar_t* build_domain_cuda_bn254(uint32_t domain_size, uint32_t logn, bool inverse, size_t device_id, size_t stream); +int interpolate_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t *d_evaluations, BN254_scalar_t *d_domain, unsigned n, unsigned device_id, size_t stream); +int interpolate_scalars_batch_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_evaluations, BN254_scalar_t* d_domain, unsigned n, unsigned batch_size, size_t device_id, size_t stream); +int interpolate_points_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t *d_evaluations, BN254_scalar_t *d_domain, unsigned n, size_t device_id, size_t stream); +int interpolate_points_batch_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t* d_evaluations, BN254_scalar_t* d_domain,unsigned n, unsigned batch_size, size_t device_id, size_t stream); +int interpolate_scalars_on_coset_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_evaluations, BN254_scalar_t* d_domain, unsigned n, BN254_scalar_t* coset_powers, size_t device_id, size_t stream); +int interpolate_scalars_batch_on_coset_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_evaluations, BN254_scalar_t* d_domain, unsigned n, unsigned batch_size, BN254_scalar_t* coset_powers, size_t device_id, size_t stream); +int evaluate_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size, unsigned n, unsigned device_id, size_t stream); +int evaluate_scalars_batch_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream); +int evaluate_points_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size, unsigned n, size_t device_id, size_t stream); +int evaluate_points_batch_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size,unsigned n, unsigned batch_size, size_t device_id, size_t stream); +int evaluate_scalars_on_coset_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size,unsigned n, BN254_scalar_t *coset_powers, unsigned device_id, size_t stream); +int evaluate_scalars_on_coset_batch_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, BN254_scalar_t *coset_powers, size_t device_id, size_t stream); +int evaluate_points_on_coset_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t *d_coefficients, BN254_scalar_t *d_domain, unsigned domain_size,unsigned n, BN254_scalar_t 
*coset_powers, size_t device_id, size_t stream); +int evaluate_points_on_coset_batch_cuda_bn254(BN254_projective_t* d_out, BN254_projective_t* d_coefficients, BN254_scalar_t* d_domain, unsigned domain_size, unsigned n, unsigned batch_size, BN254_scalar_t *coset_powers, size_t device_id, size_t stream); +int reverse_order_scalars_cuda_bn254(BN254_scalar_t* arr, int n, size_t device_id, size_t stream); +int reverse_order_scalars_batch_cuda_bn254(BN254_scalar_t* arr, int n, int batch_size, size_t device_id, size_t stream); +int reverse_order_points_cuda_bn254(BN254_projective_t* arr, int n, size_t device_id, size_t stream); +int reverse_order_points_batch_cuda_bn254(BN254_projective_t* arr, int n, int batch_size, size_t device_id, size_t stream); +int add_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream); +int sub_scalars_cuda_bn254(BN254_scalar_t* d_out, BN254_scalar_t* d_in1, BN254_scalar_t* d_in2, unsigned n, size_t stream); +int to_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream); +int from_montgomery_scalars_cuda_bn254(BN254_scalar_t* d_inout, unsigned n, size_t stream); + +#ifdef __cplusplus +} +#endif + +#endif /* _BN254_NTT_H */ diff --git a/icicle/curves/bn254/params.cuh b/icicle/curves/bn254/params.cuh index d4e4fda8b..5a8d184d2 100644 --- a/icicle/curves/bn254/params.cuh +++ b/icicle/curves/bn254/params.cuh @@ -18,68 +18,70 @@ namespace PARAMS_BN254 { static constexpr storage m = {0xbe1de925, 0x620703a6, 0x09e880ae, 0x71448520, 0x68073014, 0xab074a58, 0x623a04a7, 0x54a47462}; static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage montgomery_r = {0x4ffffffb, 0xac96341c, 0x9f60cd29, 0x36fc7695, 0x7879462e, 0x666ea36f, 0x9a07df2f, 0xe0a77c1}; + static constexpr storage montgomery_r_inv = {0x6db1194e, 0xdc5ba005, 0xe111ec87, 0x90ef5a9, 0xaeb85d5d, 0xc8260de4, 0x82c5551c, 0x15ebf951}; static constexpr storage_array omega = { { - {0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}, - {0x608fc9cb, 0x20cff123, 0x7c4604a5, 0xcb49c351, 0x41a91758, 0xb3c4d79d, 0x00000000, 0x00000000}, - {0x07b95a9b, 0x8b11d9ab, 0x41671f56, 0x20710ead, 0x30f81dee, 0xfb3acaee, 0x9778465c, 0x130b1711}, - {0x373428de, 0xb85a71e6, 0xaeb0337e, 0x74954d30, 0x303402b7, 0x2bfc85eb, 0x409556c0, 0x02e40daf}, - {0xf210979d, 0x8c99980c, 0x34905b4d, 0xef8f3113, 0xdf25d8e7, 0x0aeaf3e7, 0x03bfbd79, 0x27247136}, - {0x763d698f, 0x78ce6a0b, 0x1d3213ee, 0xd80396ec, 0x67a8a676, 0x035cdc75, 0xb2a13d3a, 0x26177cf2}, - {0x29bbd82a, 0x66025d34, 0xd51adad3, 0x7de451fd, 0x2391cd58, 0x75d44157, 0x67c7e8f7, 0x1a228e1f}, - {0xe58d2045, 0x9e8224f9, 0x9db56d8b, 0x8763970f, 0x6924235d, 0x002c22ca, 0x9a5b1fe5, 0x23f7a8c4}, - {0x57e32226, 0xecb0115c, 0x7986b170, 0x9de32043, 0x9ba3478e, 0xbda33f36, 0x42663c00, 0x2a98c60f}, - {0x893de19c, 0x1e7cf96f, 0x41b8ab52, 0xeee3e28d, 0xd0b69f2c, 0x7d9ef422, 0x3fb50a52, 0x213a41b9}, - {0x0984c448, 0xf08c8b53, 0xc402c42c, 0xb129e235, 0x9cd953ee, 0x06981b97, 0x54c83f3a, 0x14c28c45}, - {0x7f2bce0e, 0x637162dd, 0x60632cfd, 0x3986de3a, 0x322a13d5, 0x1d597f9b, 0x443a15cd, 0x2288f608}, - {0x4feaa40d, 0x6e4249aa, 0x55bea19d, 0xe320bcd2, 0x8a080b27, 0x46ecf54e, 0x669b23a8, 0x0be6f2f3}, - {0x5faf820e, 0x2e0df3c8, 0xf57ba925, 0x94012fad, 0xec7e04b6, 
0xd4a4c3f8, 0xdada7616, 0x09b10f9e}, - {0x5ccf87c6, 0xfe7b2472, 0xbca1f36d, 0x28a9c54c, 0xa2fcbf44, 0x69b51fda, 0xaf3bccd6, 0x1e85c3d0}, - {0xe06e6104, 0x6f7b3d2c, 0x0ca7fa8b, 0xa2dae3f7, 0x7f55cccb, 0xa8ed59c6, 0x9393d41a, 0x0136f0c1}, - {0xe8be0cf9, 0x46e4b3fc, 0x26a4ec96, 0x95cac63c, 0x72c6fabd, 0xb5383490, 0x7a77e6f4, 0x0bf03fb7}, - {0xbe7fae83, 0xf1533e2d, 0x2bf2f819, 0x07fa9bc3, 0x0ae79bd3, 0x639e807e, 0xd918b4d6, 0x048a18f9}, - {0xfd994358, 0x81f47ff5, 0xa4046266, 0x82d21187, 0x4f8b37af, 0xb853f627, 0x83c8d939, 0x1d28a336}, - {0x54fd384a, 0xa10aa9d9, 0x115fb459, 0x55c89a80, 0xf2fefc7c, 0x8124e414, 0x4dcb6e29, 0x240671d5}, - {0x0198b787, 0xdec6153e, 0xe4ced161, 0xca96510d, 0x7a5aa862, 0x5be2fd37, 0xf296b11c, 0x2da73caa}, - {0x05c55d1c, 0x4dce2389, 0xfa7c4637, 0xf9a0b409, 0x536fb2aa, 0x93cb1b47, 0xf192403b, 0x119bd737}, - {0xa6e170a7, 0x052227f3, 0x497e76fa, 0x7b6d8e56, 0x2167875a, 0xaba6b5f1, 0xdf18f989, 0x0aeda119}, - {0x5bebb03f, 0x22c5804b, 0x67f59436, 0xbe1e0138, 0x3485fed1, 0x67cf2e16, 0xc78bb32e, 0x2149424c}, - {0x122289c9, 0x8c4c6154, 0xe4a315a6, 0x6b6af77a, 0x9b660726, 0xb5f15d86, 0x3d681050, 0x035c63f6}, - {0x26251593, 0x1e5382ec, 0x4d18be62, 0x06b499fe, 0xc269da43, 0x42d636d0, 0x9bc0794a, 0x19bb352a}, - {0x8c321a28, 0xcd6f38f4, 0x2c9f1792, 0x95cceb99, 0x0d152ffa, 0x0630d09e, 0x8b277331, 0x151d457a}, - {0x88590882, 0xb8dde849, 0x0e1a5d5d, 0x67cf5acd, 0x723d5c5f, 0xe1ed0cc8, 0xc8953178, 0x188c51b4} - } }; - - - static constexpr storage_array omega_inv = { { {0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}, {0x8f703636, 0x23120470, 0xfd736bec, 0x5cea24f6, 0x3fd84104, 0x048b6e19, 0xe131a029, 0x30644e72}, {0xc1bd5e80, 0x948dad4a, 0xf8170a0a, 0x52627366, 0x96afef36, 0xec9b9e2f, 0xc8c14f22, 0x2b337de1}, {0xe306460b, 0xb11509c6, 0x174efb98, 0x996dfbe1, 0x94dd508c, 0x1c6e4f45, 0x16cbbf4e, 0x21082ca2}, {0x3bb512d0, 0x3eed4c53, 0x838eeb1d, 0x9c18d51b, 0x47c0b2a9, 0x9678200d, 0x306b93d2, 0x09c532c6}, {0x118f023a, 0xdb94fb05, 0x26e324be, 0x46a6cb24, 0x49bdadf2, 0xc24cdb76, 0x5b080fca, 0x1418144d}, - {0x3562e7f0, 0xa6d3ae87, 0xc2c72417, 0x0a6892e3, 0x9928147d, 0xd8f36419, 0x34009697, 0x197d1075}, - {0x33dfc1c9, 0x2a6289b1, 0x5501716e, 0xbce46410, 0x46fc5cda, 0xf889ee59, 0x17eaabaf, 0x02322181}, - {0xcc1f5f41, 0x228296e4, 0xb64ec189, 0x33dfcfa6, 0x761eb33b, 0x540e6644, 0x92785b8f, 0x280fea34}, - {0x7da16033, 0x5c3b9077, 0x521453d1, 0x872b404c, 0x9b054370, 0xfa5f6841, 0xada992f1, 0x0d0daaff}, - {0x717d26a6, 0xecb21619, 0xd2347943, 0xa7292758, 0xdb78b96e, 0xeb5c7bb1, 0xbe49270d, 0x26b0514e}, - {0xeff004d4, 0x61f1b8d2, 0x5394943e, 0xcaec7c92, 0x3adcbc53, 0xb3054a41, 0xd4d8eb05, 0x2fd82a77}, - {0x8518ecca, 0x1cccc58c, 0x1b0344ef, 0xd468e695, 0x4501b89c, 0x60888009, 0xa35f81da, 0x0b65bab2}, - {0xef3775e2, 0xaac68b20, 0x1a8af1f5, 0xade7937c, 0x48e14944, 0x4e613e72, 0xf67b5e99, 0x07ee2fe0}, - {0x01f73bfa, 0x7a9178dd, 0x4ad38023, 0x4e6a9df1, 0xc1cf1a77, 0x186f8ba1, 0x0113aedc, 0x1c75f370}, - {0x6f0924c9, 0x825269f8, 0x321acd8d, 0x85dc2b62, 0x0cff4400, 0xc63cb6e9, 0x0d87b733, 0x0bf840f4}, - {0x9cd2abc9, 0xb9064db6, 0x35033aba, 0x21800b41, 0x284fabbf, 0x2e7cd8b7, 0x50fd23e3, 0x14fdf780}, - {0x3c5b53bd, 0x31a0e6e5, 0xad2ade0a, 0x000e1067, 0xf7740140, 0x7507f5ca, 0x4d4c1f98, 0x1faf0653}, - {0x04b80dc8, 0x21ab655a, 0x7c0bd3dc, 0xf30ae094, 0x94ded480, 0x90f19302, 0x0ee779cb, 0x13a0614f}, - {0xc5d0d45f, 0x325dbdfc, 0xdb23c86e, 0x531a0e2c, 0x79c537a7, 0xa2a71200, 0x2b0445a8, 0x2e103cac}, - {0x0eb3de4a, 0x995227ff, 0xb0f25c6e, 0x735dd808, 0x36941528, 
0x990dabf7, 0xf1fe47c5, 0x19ffeb1c}, - {0x562cb6d5, 0xd61871ee, 0x8dc2c90d, 0xacd56e5a, 0x8d0d8980, 0xda46bba0, 0x92ec6935, 0x2d46308e}, - {0xa70a7c13, 0x1703a78e, 0xdd4ce698, 0xc6bc1d64, 0x5693e78e, 0xbd63b0af, 0x568a26b0, 0x1d527113}, - {0x02648ff7, 0x30b77d88, 0x5d7e4386, 0xf1a86cdd, 0x66dd8016, 0x69f57e82, 0x3aa86583, 0x11aeccf6}, - {0x0e4cebf9, 0x8c389a89, 0x1086a5f0, 0x04596644, 0x79d41b0e, 0xeb3dabcc, 0x4e649ca0, 0x2977e823}, - {0xe1ce2126, 0x3fb533e8, 0xba920fa8, 0xc4f9f250, 0xd91fa66c, 0x3b40e70b, 0x44d8f309, 0x295e48a4}, - {0xdfc40a8b, 0x52bb0a4c, 0x46112483, 0x4fb64a4b, 0x460eac6d, 0x70ffb433, 0xe671b22c, 0x193903e1}, - {0xe25ab83b, 0x44c8eb25, 0x9d2ac154, 0xc66b9e1b, 0xb17a4c68, 0xc023ff24, 0xb5e12a84, 0x18f27f93} + {0xba9d1811, 0x9d0e470c, 0xb6f24c79, 0x1dcb5564, 0xe85943e0, 0xdf5ce19c, 0xad310991, 0x16e73dfd}, + {0x74a57a76, 0xc8936191, 0x6750f230, 0x61794254, 0x9f36ffb0, 0xf086204a, 0xa6148404, 0x07b0c561}, + {0x470157ce, 0x893a7fa1, 0xfc782d75, 0xe8302a41, 0xdd9b0675, 0xffc02c0e, 0xf6e72f5b, 0x0f1ded1e}, + {0xbc2e5912, 0x11f995e1, 0xa8d2d7ab, 0x39ba79c0, 0xb08771e3, 0xebbebc2b, 0x7017a420, 0x06fd19c1}, + {0x769a2ee2, 0xd00a58f9, 0x7494f0ca, 0xb8c12c17, 0xa5355d71, 0xb4027fd7, 0x99c5042b, 0x027a3584}, + {0x0042d43a, 0x1c477572, 0x6f039bb9, 0x76f169c7, 0xfd5a90a9, 0x01ddd073, 0xde2fd10f, 0x0931d596}, + {0x9bbdd310, 0x4aa49b8d, 0x8e3a2d76, 0xd31bf3e2, 0x78b2667b, 0x001deac8, 0xb869ae62, 0x006fab49}, + {0x617c6e85, 0xadaa01c2, 0x7420aae6, 0xb4a93ee1, 0x0ddca8a8, 0x1f4e51b8, 0xcdd9e481, 0x2d965651}, + {0x4e26ecfb, 0xa93458fd, 0x4115a009, 0x022a2a2d, 0x69ec2bd0, 0x017171fa, 0x5941dc91, 0x2d1ba66f}, + {0xdaac43b7, 0xd1628ba2, 0xe4347e7d, 0x16c8601d, 0xe081dcff, 0x649abebd, 0x5981ed45, 0x00eeb2cb}, + {0xce8f58e5, 0x276e5858, 0x5655210e, 0x0512eca9, 0xe70e61f3, 0xc3708cc6, 0xa7d74902, 0x1bf82deb}, + {0x7dcdc0e0, 0x84c6bfa5, 0x13f4d1bd, 0xc57088ff, 0xb5b95e4d, 0x5c0176fb, 0x3a8d46c1, 0x19ddbcaf}, + {0x613f6cbd, 0x5c1d597f, 0x8357473a, 0x30525841, 0x968e4915, 0x51829353, 0x844bca52, 0x2260e724}, + {0x53337857, 0x53422da9, 0xdbed349f, 0xac616632, 0x06d1e303, 0x27508aba, 0x0a0ed063, 0x26125da1}, + {0xfcd0b523, 0xb2c87885, 0xca5a5ce3, 0x58f50577, 0x8598fc8c, 0x4222150e, 0xae2bdd1a, 0x1ded8980}, + {0xa219447e, 0xa76dde56, 0x359eebbb, 0xec1a1f05, 0x8be08215, 0xcda0ceb6, 0xb1f8d9a7, 0x1ad92f46}, + {0xab80c59d, 0xb54d4506, 0x22dd991f, 0x5680c640, 0xbc23a139, 0x6b7bcf70, 0x5ab4c74d, 0x0210fe63}, + {0xe32b045b, 0x1c25f1e3, 0x2e832696, 0x145e0db8, 0x71c6441f, 0x852e2a03, 0x845d50d2, 0x0c9fabc7}, + {0xb878331a, 0xeccd4f3e, 0x8dc6d26e, 0x7b26b748, 0xd9130cd4, 0xa19b0361, 0x326341ef, 0x2a734ebb}, + {0x2f4e9212, 0x1c79bd57, 0x3d68f9ae, 0x605b52b6, 0xb8d89d4a, 0x0113eff9, 0xf1ff73b2, 0x1067569a}, + {0x80928c44, 0x034afc45, 0xf6437da2, 0xb4823532, 0x6dc6e364, 0x5f256a9f, 0xb363ebe8, 0x049ae702}, + {0x725b19f0, 0x9bd61b6e, 0x41112ed4, 0x402d111e, 0x8ef62abc, 0x00e0a7eb, 0xa58a7e85, 0x2a3c09f0} + } }; + + + static constexpr storage_array omega_inv = { { + {0xf0000000, 0x43e1f593, 0x79b97091, 0x2833e848, 0x8181585d, 0xb85045b6, 0xe131a029, 0x30644e72}, + {0x608fc9cb, 0x20cff123, 0x7c4604a5, 0xcb49c351, 0x41a91758, 0xb3c4d79d, 0x00000000, 0x00000000}, + {0x07b95a9b, 0x8b11d9ab, 0x41671f56, 0x20710ead, 0x30f81dee, 0xfb3acaee, 0x9778465c, 0x130b1711}, + {0x373428de, 0xb85a71e6, 0xaeb0337e, 0x74954d30, 0x303402b7, 0x2bfc85eb, 0x409556c0, 0x02e40daf}, + {0xf210979d, 0x8c99980c, 0x34905b4d, 0xef8f3113, 0xdf25d8e7, 0x0aeaf3e7, 0x03bfbd79, 0x27247136}, + {0x763d698f, 0x78ce6a0b, 0x1d3213ee, 0xd80396ec, 
0x67a8a676, 0x035cdc75, 0xb2a13d3a, 0x26177cf2}, + {0xc64427d7, 0xdddf985f, 0xa49e95bd, 0xaa4f964a, 0x5def8b04, 0x427c045f, 0x7969b732, 0x1641c053}, + {0x0329f5d6, 0x692c553d, 0x8712848a, 0xa54cf8c6, 0x38e2b5e6, 0x64751ad9, 0x7422fad3, 0x204bd327}, + {0xaf6b3e4e, 0x52f26c0f, 0xf0bcc0c8, 0x4c277a07, 0xe4fcfcab, 0x546875d5, 0xaa9995b3, 0x09d8f821}, + {0xb2e5cc71, 0xcaa2e1e9, 0x6e43404e, 0xed42b68e, 0x7a2c7f0a, 0x6ed80915, 0xde3c86d6, 0x1c4042c7}, + {0x579d71ae, 0x20a3a65d, 0x0adc4420, 0xfd7efed8, 0xfddabf54, 0x3bb6dcd7, 0xbc73d07b, 0x0fa9bb21}, + {0xc79e0e57, 0xb6f70f8d, 0xa04e05ac, 0x269d3fde, 0x2ba088d9, 0xcf2e371c, 0x11b88d9c, 0x1af864d2}, + {0xabd95dc9, 0x3b0b205a, 0x978188ca, 0xc8df74fa, 0x6a1cb6c8, 0x08e124db, 0xbfac6104, 0x1670ed58}, + {0x641c8410, 0xf8eee934, 0x677771c0, 0xf40976b0, 0x558e6e8c, 0x11680d42, 0x06e7e9e9, 0x281c036f}, + {0xb2dbc0b4, 0xc92a742f, 0x4d384e68, 0xc3f02842, 0x2fa43d0d, 0x22701b6f, 0xe4590b37, 0x05d33766}, + {0x02d842d4, 0x922d5ac8, 0xc830e4c6, 0x91126414, 0x082f37e0, 0xe92338c0, 0x7fe704e8, 0x0b5d56b7}, + {0xd96f0d22, 0x20e75251, 0x6bd4e8c9, 0xc01c7f08, 0xf9dd50c4, 0x37d8b00b, 0xc43ca872, 0x244cf010}, + {0x66c5174c, 0x7a823174, 0x22d5ad70, 0x7dbe118c, 0x111119c5, 0xf8d7c71d, 0x83780e87, 0x036853f0}, + {0xca535321, 0xd98f9924, 0xe66e6c81, 0x22dbc0ef, 0x664ae1b7, 0xa15cf806, 0xa314fb67, 0x06e402c0}, + {0xe26c91f3, 0x0852a8fd, 0x3baca626, 0x521f45cb, 0x2c51bfca, 0xab6473bc, 0x2100895f, 0x100c332d}, + {0xa376d0f0, 0xf5fac783, 0x940797d3, 0x50fd246e, 0x145f5278, 0xab14ecc1, 0x41091b14, 0x19c6dfb8}, + {0x7faa1396, 0x43dc52e2, 0x4beced23, 0xd437be9d, 0x6d3c38c3, 0xecc11e9c, 0x0c74a876, 0x2eb58439}, + {0xd69ca83b, 0x811b03e7, 0xa1a6eadf, 0x126a786b, 0x4e2b8e61, 0x1dd75c9f, 0xbda6792b, 0x2165a1a5}, + {0x110b737b, 0x02e1d4d1, 0xb323a164, 0x7be1488d, 0x9cd06163, 0xa334d317, 0xdb50e9cd, 0x2710c370}, + {0x9550fe47, 0x45d2f3cb, 0xf6a8efc4, 0x5f43327b, 0xe993ee18, 0x5bcd0d50, 0xb21de952, 0x27f035bd}, + {0x232e3983, 0x1d63cbae, 0xaa1b58e2, 0xac815161, 0x6aeb019e, 0x531f42a5, 0x03ca2ef5, 0x2dcd51d9}, + {0x980db869, 0xa8b64ba8, 0xc9718f6c, 0x4c787f72, 0x15d27ced, 0x7746a25a, 0x435a46e9, 0x110bf78f}, + {0x9d18157e, 0x72394277, 0xfd399d5d, 0xec9d51f8, 0x49d5387f, 0x6117635d, 0x9c229cd5, 0x01b77519} } }; @@ -128,6 +130,9 @@ namespace PARAMS_BN254 { static constexpr storage m = {0x19bf90e5, 0x6f3aed8a, 0x67cd4c08, 0xae965e17, 0x68073013, 0xab074a58, 0x623a04a7, 0x54a47462}; static constexpr storage one = {0x00000001, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; static constexpr storage zero = {0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; + static constexpr storage montgomery_r = {0xc58f0d9d, 0xd35d438d, 0xf5c70b3d, 0xa78eb28, 0x7879462c, 0x666ea36f, 0x9a07df2f, 0xe0a77c1}; + static constexpr storage montgomery_r_inv = {0x14afa37, 0xed84884a, 0x278edf8, 0xeb202285, 0xb74492d9, 0xcf63e9cf, 0x59e5c639, 0x2e671571}; + // i^2, the square of the imaginary unit for the extension field static constexpr uint32_t i_squared = 1; // true if i^2 is negative diff --git a/icicle/curves/bn254/projective.cu b/icicle/curves/bn254/projective.cu index c95f48938..e4f56e5e6 100644 --- a/icicle/curves/bn254/projective.cu +++ b/icicle/curves/bn254/projective.cu @@ -16,4 +16,4 @@ extern "C" bool eq_g2_bn254(BN254::g2_projective_t *point1, BN254::g2_projective !((point1->x == BN254::g2_point_field_t::zero()) && (point1->y == BN254::g2_point_field_t::zero()) && (point1->z == BN254::g2_point_field_t::zero())) && 
!((point2->x == BN254::g2_point_field_t::zero()) && (point2->y == BN254::g2_point_field_t::zero()) && (point2->z == BN254::g2_point_field_t::zero())); } -#endif \ No newline at end of file +#endif diff --git a/icicle/curves/bn254/ve_mod_mult.cu b/icicle/curves/bn254/ve_mod_mult.cu index 6acef1fab..c467bda94 100644 --- a/icicle/curves/bn254/ve_mod_mult.cu +++ b/icicle/curves/bn254/ve_mod_mult.cu @@ -51,6 +51,21 @@ extern "C" int32_t vec_mod_mult_scalar_bn254(BN254::scalar_t *inout, } } +extern "C" int32_t vec_mod_mult_device_scalar_bn254( + BN254::scalar_t *inout, + BN254::scalar_t *scalar_vec, + size_t n_elements, + size_t device_id +) { + try { + vector_mod_mult_device(scalar_vec, inout, inout, n_elements); + return CUDA_SUCCESS; + } catch (const std::runtime_error &ex) { + printf("error %s", ex.what()); // TODO: error code and message + return -1; + } +} + extern "C" int32_t matrix_vec_mod_mult_bn254(BN254::scalar_t *matrix_flattened, BN254::scalar_t *input, BN254::scalar_t *output, diff --git a/icicle/curves/bn254/ve_mod_mult.h b/icicle/curves/bn254/ve_mod_mult.h new file mode 100644 index 000000000..6b974118e --- /dev/null +++ b/icicle/curves/bn254/ve_mod_mult.h @@ -0,0 +1,41 @@ +// Copyright 2023 Ingonyama +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
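// A minimal usage sketch for the C bindings declared in this header (illustrative only,
// not part of the generated file): the wrapper name, the device-resident buffers and the
// element count are assumptions; the exported signature and the 0 / -1 return convention
// come from ve_mod_mult.cu above.
//
//   int scale_scalars_in_place(BN254_scalar_t *d_inout, BN254_scalar_t *d_scalars, size_t n) {
//     // the *_device_* variant appears to expect both pointers to already live on the GPU
//     return vec_mod_mult_device_scalar_bn254(d_inout, d_scalars, n, /*device_id=*/0);
//   }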
+ +// Code generated by Ingonyama DO NOT EDIT + +#include +#include +// ve_mod_mult.h + +#ifndef _BN254_VEC_MULT_H +#define _BN254_VEC_MULT_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct BN254_projective_t BN254_projective_t; +typedef struct BN254_scalar_t BN254_scalar_t; + +int32_t vec_mod_mult_point_bn254(BN254_projective_t *inout, BN254_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_scalar_bn254(BN254_scalar_t *inout, BN254_scalar_t *scalar_vec, size_t n_elments, size_t device_id); +int32_t vec_mod_mult_device_scalar_bn254(BN254_scalar_t *inout, BN254_scalar_t *scalar_vec, size_t n_elements, size_t device_id); +int32_t matrix_vec_mod_mult_bn254(BN254_scalar_t *matrix_flattened, BN254_scalar_t *input, BN254_scalar_t *output, size_t n_elments, size_t device_id); + + +#ifdef __cplusplus +} +#endif + +#endif /* _BN254_VEC_MULT_H */ diff --git a/icicle/curves/curve_template/lde.cu b/icicle/curves/curve_template/lde.cu index 82240aeca..ef9d892c6 100644 --- a/icicle/curves/curve_template/lde.cu +++ b/icicle/curves/curve_template/lde.cu @@ -24,12 +24,12 @@ extern "C" ${CURVE_NAME_U}::scalar_t* build_domain_cuda_${CURVE_NAME_L}(uint32_t } } -extern "C" int ntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { cudaStreamCreate(&stream); - return ntt_end2end_template<${CURVE_NAME_U}::scalar_t,${CURVE_NAME_U}::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id + return ntt_end2end_template<${CURVE_NAME_U}::scalar_t,${CURVE_NAME_U}::scalar_t>(arr, n, inverse, decimation, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -39,12 +39,12 @@ extern "C" int ntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t *arr, uint32_t } } -extern "C" int ecntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *arr, uint32_t n, bool inverse, size_t device_id = 0, cudaStream_t stream = 0) +extern "C" int ecntt_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *arr, uint32_t n, bool inverse, Decimation decimation, size_t device_id = 0, cudaStream_t stream = 0) { try { cudaStreamCreate(&stream); - return ntt_end2end_template<${CURVE_NAME_U}::projective_t,${CURVE_NAME_U}::scalar_t>(arr, n, inverse, stream); // TODO: pass device_id + return ntt_end2end_template<${CURVE_NAME_U}::projective_t,${CURVE_NAME_U}::scalar_t>(arr, n, inverse, decimation, stream); // TODO: pass device_id } catch (const std::runtime_error &ex) { @@ -85,7 +85,8 @@ extern "C" int interpolate_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_ { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + ${CURVE_NAME_U}::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -99,8 +100,37 @@ extern "C" int interpolate_scalars_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::s { try { + ${CURVE_NAME_U}::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int 
interpolate_scalars_on_coset_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t *d_evaluations, ${CURVE_NAME_U}::scalar_t *d_domain, unsigned n, ${CURVE_NAME_U}::scalar_t *coset_powers, unsigned device_id = 0, cudaStream_t stream = 0) +{ + try + { + return interpolate(d_out, d_evaluations, d_domain, n, true, coset_powers, stream); + } + catch (const std::runtime_error &ex) + { + printf("error %s", ex.what()); + return -1; + } +} + +extern "C" int interpolate_scalars_batch_on_coset_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scalar_t* d_out, ${CURVE_NAME_U}::scalar_t* d_evaluations, ${CURVE_NAME_U}::scalar_t* d_domain, unsigned n, + unsigned batch_size, ${CURVE_NAME_U}::scalar_t* coset_powers, size_t device_id = 0, cudaStream_t stream = 0) +{ + try + { + cudaStreamCreate(&stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, true, coset_powers, stream); } catch (const std::runtime_error &ex) { @@ -113,7 +143,8 @@ extern "C" int interpolate_points_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projecti { try { - return interpolate(d_out, d_evaluations, d_domain, n, stream); + ${CURVE_NAME_U}::scalar_t* _null = nullptr; + return interpolate(d_out, d_evaluations, d_domain, n, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -127,8 +158,9 @@ extern "C" int interpolate_points_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::pr { try { + ${CURVE_NAME_U}::scalar_t* _null = nullptr; cudaStreamCreate(&stream); - return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, stream); + return interpolate_batch(d_out, d_evaluations, d_domain, n, batch_size, false, _null, stream); } catch (const std::runtime_error &ex) { @@ -268,6 +300,7 @@ extern "C" int reverse_order_scalars_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::scala uint32_t logn = uint32_t(log(n) / log(2)); cudaStreamCreate(&stream); reverse_order(arr, n, logn, stream); + cudaStreamSynchronize(stream); return 0; } catch (const std::runtime_error &ex) diff --git a/icicle/curves/curve_template/msm.cu b/icicle/curves/curve_template/msm.cu index bbfe0a368..9a8ce6f95 100644 --- a/icicle/curves/curve_template/msm.cu +++ b/icicle/curves/curve_template/msm.cu @@ -11,7 +11,7 @@ int msm_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t *out, ${CURVE_NAME_U} { try { - large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(scalars, points, count, out, false, stream); + large_msm<${CURVE_NAME_U}::scalar_t, ${CURVE_NAME_U}::projective_t, ${CURVE_NAME_U}::affine_t>(scalars, points, count, out, false, false, stream); return CUDA_SUCCESS; } catch (const std::runtime_error &ex) @@ -52,7 +52,7 @@ extern "C" int msm_batch_cuda_${CURVE_NAME_L}(${CURVE_NAME_U}::projective_t* out { try { - large_msm(d_scalars, d_points, count, d_out, true, stream); + large_msm(d_scalars, d_points, count, d_out, true, false, stream); cudaStreamSynchronize(stream); return 0; } diff --git a/icicle/curves/curve_template/projective.cu b/icicle/curves/curve_template/projective.cu index 32ba4e247..23190f046 100644 --- a/icicle/curves/curve_template/projective.cu +++ b/icicle/curves/curve_template/projective.cu @@ -16,4 +16,4 @@ extern "C" bool eq_g2_${CURVE_NAME_L}(${CURVE_NAME_U}::g2_projective_t *point1, !((point1->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point1->y == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point1->z == ${CURVE_NAME_U}::g2_point_field_t::zero())) && !((point2->x == ${CURVE_NAME_U}::g2_point_field_t::zero()) && (point2->y == 
${CURVE_NAME_U}::g2_point_field_t::zero()) && (point2->z == ${CURVE_NAME_U}::g2_point_field_t::zero())); } -#endif \ No newline at end of file +#endif diff --git a/icicle/primitives/extension_field.cuh b/icicle/primitives/extension_field.cuh index acdbc3543..28682b2b9 100644 --- a/icicle/primitives/extension_field.cuh +++ b/icicle/primitives/extension_field.cuh @@ -14,24 +14,15 @@ template class ExtensionField { FWide real; FWide imaginary; - ExtensionField HOST_DEVICE_INLINE get_lower() { - return ExtensionField { real.get_lower(), imaginary.get_lower() }; + friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys) { + return ExtensionWide { xs.real + ys.real, xs.imaginary + ys.imaginary }; } - - ExtensionField HOST_DEVICE_INLINE get_higher_with_slack() { - return ExtensionField { real.get_higher_with_slack(), imaginary.get_higher_with_slack() }; + + friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys) { + return ExtensionWide { xs.real - ys.real, xs.imaginary - ys.imaginary }; } }; - friend HOST_DEVICE_INLINE ExtensionWide operator+(ExtensionWide xs, const ExtensionWide& ys) { - return ExtensionField { xs.real + ys.real, xs.imaginary + ys.imaginary }; - } - - // an incomplete impl that assumes that xs > ys - friend HOST_DEVICE_INLINE ExtensionWide operator-(ExtensionWide xs, const ExtensionWide& ys) { - return ExtensionField { xs.real - ys.real, xs.imaginary - ys.imaginary }; - } - public: typedef Field FF; static constexpr unsigned TLC = 2 * CONFIG::limbs_count; @@ -55,13 +46,12 @@ template class ExtensionField { return ExtensionField { FF { CONFIG::g2_gen_y_re }, FF { CONFIG::g2_gen_y_im } }; } - static HOST_INLINE ExtensionField rand_host() { return ExtensionField { FF::rand_host(), FF::rand_host() }; } - template static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionField &xs) { - return ExtensionField { FF::reduce(&xs.real), FF::reduce(&xs.imaginary) }; + template static constexpr HOST_DEVICE_INLINE ExtensionField sub_modulus(const ExtensionField &xs) { + return ExtensionField { FF::sub_modulus(&xs.real), FF::sub_modulus(&xs.imaginary) }; } friend std::ostream& operator<<(std::ostream& os, const ExtensionField& xs) { @@ -79,21 +69,22 @@ template class ExtensionField { template static constexpr HOST_DEVICE_INLINE ExtensionWide mul_wide(const ExtensionField& xs, const ExtensionField& ys) { - FWide real_prod = FF::mul_wide(xs.real * ys.real); - FWide imaginary_prod = FF::mul_wide(xs.imaginary * ys.imaginary); + FWide real_prod = FF::mul_wide(xs.real, ys.real); + FWide imaginary_prod = FF::mul_wide(xs.imaginary, ys.imaginary); FWide prod_of_sums = FF::mul_wide(xs.real + xs.imaginary, ys.real + ys.imaginary); - FWide i_sq_times_im = FF::mul_unsigned(imaginary_prod); - i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im; - return ExtensionField { real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod }; + FWide i_sq_times_im = FF::template mul_unsigned(imaginary_prod); + i_sq_times_im = CONFIG::i_squared_is_negative ? 
FWide::neg(i_sq_times_im) : i_sq_times_im; + return ExtensionWide { real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod }; + } + + template + static constexpr HOST_DEVICE_INLINE ExtensionField reduce(const ExtensionWide& xs) { + return ExtensionField { FF::template reduce(xs.real), FF::template reduce(xs.imaginary) }; } friend HOST_DEVICE_INLINE ExtensionField operator*(const ExtensionField& xs, const ExtensionField& ys) { - FF real_prod = xs.real * ys.real; - FF imaginary_prod = xs.imaginary * ys.imaginary; - FF prod_of_sums = (xs.real + xs.imaginary) * (ys.real + ys.imaginary); - FF i_sq_times_im = FF::template mul_unsigned(imaginary_prod); - i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im; - return ExtensionField { real_prod + i_sq_times_im, prod_of_sums - real_prod - imaginary_prod }; + ExtensionWide xy = mul_wide(xs, ys); + return reduce(xy); } friend HOST_DEVICE_INLINE bool operator==(const ExtensionField& xs, const ExtensionField& ys) { @@ -104,14 +95,16 @@ template class ExtensionField { return !(xs == ys); } - template - static constexpr HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField &xs) { - constexpr uint32_t mul_real = mutliplier.real.limbs_storage.limbs[0]; - constexpr uint32_t mul_imaginary = mutliplier.imaginary.limbs_storage.limbs[0]; - FF real_prod = FF::template mul_unsigned(xs.real); - FF imaginary_prod = FF::template mul_unsigned(xs.imaginary); - FF re_im = FF::template mul_unsigned(xs.imaginary); - FF im_re = FF::template mul_unsigned(xs.real); + template + static HOST_DEVICE_INLINE ExtensionField mul_const(const ExtensionField &xs) { + static constexpr FF mul_real = multiplier.real; + static constexpr FF mul_imaginary = multiplier.imaginary; + const FF xs_real = xs.real; + const FF xs_imaginary = xs.imaginary; + FF real_prod = FF::template mul_const(xs_real); + FF imaginary_prod = FF::template mul_const(xs_imaginary); + FF re_im = FF::template mul_const(xs_imaginary); + FF im_re = FF::template mul_const(xs_real); FF i_sq_times_im = FF::template mul_unsigned(imaginary_prod); i_sq_times_im = CONFIG::i_squared_is_negative ? FF::neg(i_sq_times_im) : i_sq_times_im; return ExtensionField { real_prod + i_sq_times_im, re_im + im_re }; @@ -142,8 +135,10 @@ template class ExtensionField { // inverse assumes that xs is nonzero static constexpr HOST_DEVICE_INLINE ExtensionField inverse(const ExtensionField& xs) { ExtensionField xs_conjugate = { xs.real, FF::neg(xs.imaginary) }; + FF i_sq_times_im = FF::template mul_unsigned(FF::sqr(xs.imaginary)); + i_sq_times_im = CONFIG::i_squared_is_negative ? 
FF::neg(i_sq_times_im) : i_sq_times_im; // TODO: wide here - FF xs_norm_squared = FF::sqr(xs.real) + FF::sqr(xs.imaginary); + FF xs_norm_squared = FF::sqr(xs.real) - i_sq_times_im; return xs_conjugate * ExtensionField { FF::inverse(xs_norm_squared), FF::zero() }; } }; diff --git a/icicle/primitives/field.cuh b/icicle/primitives/field.cuh index 4bff9cebd..11186e896 100644 --- a/icicle/primitives/field.cuh +++ b/icicle/primitives/field.cuh @@ -44,48 +44,55 @@ template class Field { } static HOST_INLINE Field omega(uint32_t logn) { - if (logn == 0) { - return Field { CONFIG::one }; - } + if (logn == 0) { + return Field { CONFIG::one }; + } - if (logn > CONFIG::omegas_count) { - throw std::invalid_argument( "Field: Invalid omega index" ); - } + if (logn > CONFIG::omegas_count) { + throw std::invalid_argument( "Field: Invalid omega index" ); + } - storage_array const omega = CONFIG::omega; - return Field { omega.storages[logn-1] }; + storage_array const omega = CONFIG::omega; + return Field { omega.storages[logn-1] }; } static HOST_INLINE Field omega_inv(uint32_t logn) { - if (logn == 0) { - return Field { CONFIG::one }; - } + if (logn == 0) { + return Field { CONFIG::one }; + } - if (logn > CONFIG::omegas_count) { - throw std::invalid_argument( "Field: Invalid omega_inv index" ); - } + if (logn > CONFIG::omegas_count) { + throw std::invalid_argument( "Field: Invalid omega_inv index" ); + } - storage_array const omega_inv = CONFIG::omega_inv; - return Field { omega_inv.storages[logn-1] }; + storage_array const omega_inv = CONFIG::omega_inv; + return Field { omega_inv.storages[logn-1] }; } - + static HOST_INLINE Field inv_log_size(uint32_t logn) { - if (logn == 0) { - return Field { CONFIG::one }; - } - - if (logn > CONFIG::omegas_count) { - throw std::invalid_argument( "Field: Invalid inv index" ); - } + if (logn == 0) { + return Field { CONFIG::one }; + } - storage_array const inv = CONFIG::inv; - return Field { inv.storages[logn-1] }; + if (logn > CONFIG::omegas_count) { + throw std::invalid_argument( "Field: Invalid inv index" ); + } + storage_array const inv = CONFIG::inv; + return Field { inv.storages[logn-1] }; } static constexpr HOST_DEVICE_INLINE Field modulus() { return Field { CONFIG::modulus }; } + static constexpr HOST_DEVICE_INLINE Field montgomery_r() { + return Field { CONFIG::montgomery_r }; + } + + static constexpr HOST_DEVICE_INLINE Field montgomery_r_inv() { + return Field { CONFIG::montgomery_r_inv }; + } + // private: typedef storage ff_storage; typedef storage<2*TLC> ff_wide_storage; @@ -95,44 +102,63 @@ template class Field { struct Wide { ff_wide_storage limbs_storage; - Field HOST_DEVICE_INLINE get_lower() { + static constexpr Field HOST_DEVICE_INLINE get_lower(const Wide &xs) { Field out{}; #ifdef __CUDA_ARCH__ #pragma unroll #endif for (unsigned i = 0; i < TLC; i++) - out.limbs_storage.limbs[i] = limbs_storage.limbs[i]; + out.limbs_storage.limbs[i] = xs.limbs_storage.limbs[i]; return out; } - Field HOST_DEVICE_INLINE get_higher_with_slack() { + static constexpr Field HOST_DEVICE_INLINE get_higher_with_slack(const Wide &xs) { Field out{}; #ifdef __CUDA_ARCH__ #pragma unroll #endif for (unsigned i = 0; i < TLC; i++) { #ifdef __CUDA_ARCH__ - out.limbs_storage.limbs[i] = __funnelshift_lc(limbs_storage.limbs[i + TLC - 1], limbs_storage.limbs[i + TLC], slack_bits); + out.limbs_storage.limbs[i] = __funnelshift_lc(xs.limbs_storage.limbs[i + TLC - 1], xs.limbs_storage.limbs[i + TLC], slack_bits); #else - out.limbs_storage.limbs[i] = (limbs_storage.limbs[i + TLC] << slack_bits) 
+ (limbs_storage.limbs[i + TLC - 1] >> (32 - slack_bits)); + out.limbs_storage.limbs[i] = (xs.limbs_storage.limbs[i + TLC] << slack_bits) + (xs.limbs_storage.limbs[i + TLC - 1] >> (32 - slack_bits)); #endif } return out; } - }; - friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys) { - Wide rs = {}; - add_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); - return rs; - } + template static constexpr HOST_DEVICE_INLINE Wide sub_modulus_squared(const Wide &xs) { + if (REDUCTION_SIZE == 0) + return xs; + const ff_wide_storage modulus = get_modulus_squared(); + Wide rs = {}; + return sub_limbs(xs.limbs_storage, modulus, rs.limbs_storage) ? xs : rs; + } - // an incomplete impl that assumes that xs > ys - friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys) { - Wide rs = {}; - sub_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); - return rs; - } + template + static constexpr HOST_DEVICE_INLINE Wide neg(const Wide& xs) { + const ff_wide_storage modulus = get_modulus_squared(); + Wide rs = {}; + sub_limbs(modulus, xs.limbs_storage, rs.limbs_storage); + return rs; + } + + friend HOST_DEVICE_INLINE Wide operator+(Wide xs, const Wide& ys) { + Wide rs = {}; + add_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); + return sub_modulus_squared<1>(rs); + } + + friend HOST_DEVICE_INLINE Wide operator-(Wide xs, const Wide& ys) { + Wide rs = {}; + uint32_t carry = sub_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); + if (carry == 0) + return rs; + const ff_wide_storage modulus = get_modulus_squared<1>(); + add_limbs(rs.limbs_storage, modulus, rs.limbs_storage); + return rs; + } + }; // return modulus template static constexpr HOST_DEVICE_INLINE ff_storage get_modulus() { @@ -232,6 +258,14 @@ template class Field { return CARRY_OUT ? 
carry : 0; } + static constexpr HOST_INLINE uint32_t sub_limbs_partial_host(uint32_t* x, uint32_t* y, uint32_t* r, uint32_t num_limbs) { + uint32_t carry = 0; + host_math::carry_chain<2 * TLC, false, true> chain; + for (unsigned i = 0; i < num_limbs; i++) + r[i] = chain.sub(x[i], y[i], carry); + return carry; + } + template static constexpr HOST_DEVICE_INLINE uint32_t add_limbs(const T &xs, const T &ys, T &rs) { #ifdef __CUDA_ARCH__ return add_sub_limbs_device(xs, ys, rs); @@ -256,7 +290,17 @@ template class Field { } } + static DEVICE_INLINE void mul_n_msb(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t start_i = 0) { + #pragma unroll + for (size_t i = start_i; i < n; i += 2) { + acc[i] = ptx::mul_lo(a[i], bi); + acc[i + 1] = ptx::mul_hi(a[i], bi); + } + } + static DEVICE_INLINE void cmad_n(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC) { + // multiply scalar by vector + // acc = acc + bi*A[::2] acc[0] = ptx::mad_lo_cc(a[0], bi, acc[0]); acc[1] = ptx::madc_hi_cc(a[0], bi, acc[1]); #pragma unroll @@ -266,7 +310,21 @@ template class Field { } } + static DEVICE_INLINE void cmad_n_msb(uint32_t *acc, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t a_start_idx=0) { + // multiply scalar by vector + // acc = acc + bi*A[::2] + acc[a_start_idx] = ptx::mad_lo_cc(a[a_start_idx], bi, acc[a_start_idx]); + acc[a_start_idx + 1] = ptx::madc_hi_cc(a[a_start_idx], bi, acc[a_start_idx + 1]); + #pragma unroll + for (size_t i = a_start_idx + 2; i < n; i += 2) { + acc[i] = ptx::madc_lo_cc(a[i], bi, acc[i]); + acc[i + 1] = ptx::madc_hi_cc(a[i], bi, acc[i + 1]); + } + } + static DEVICE_INLINE void mad_row(uint32_t *odd, uint32_t *even, const uint32_t *a, uint32_t bi, size_t n = TLC) { + // odd = odd + bi*A + // even = even + bi*A cmad_n(odd, a + 1, bi, n - 2); odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0); odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0); @@ -274,6 +332,16 @@ template class Field { odd[n - 1] = ptx::addc(odd[n - 1], 0); } + static DEVICE_INLINE void mad_row_msb(uint32_t *odd, uint32_t *even, const uint32_t *a, uint32_t bi, size_t n = TLC, size_t a_start_idx = 0) { + // odd = odd + bi*A + // even = even + bi*A + cmad_n_msb(odd, a + 1, bi, n - 2, a_start_idx - 1); + odd[n - 2] = ptx::madc_lo_cc(a[n - 1], bi, 0); + odd[n - 1] = ptx::madc_hi(a[n - 1], bi, 0); + cmad_n_msb(even, a, bi, n, a_start_idx); + odd[n - 1] = ptx::addc(odd[n - 1], 0); + } + static DEVICE_INLINE void multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { const uint32_t *a = as.limbs; const uint32_t *b = bs.limbs; @@ -295,13 +363,280 @@ template class Field { even[i + 1] = ptx::addc(even[i + 1], 0); } + static DEVICE_INLINE void mult_no_carry(uint32_t a, uint32_t b, uint32_t *r) { + r[0] = ptx::mul_lo(a, b); + r[1] = ptx::mul_hi(a, b); + } + + static DEVICE_INLINE void ingo_multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + const uint32_t *a = as.limbs; + const uint32_t *b = bs.limbs; + uint32_t *r = rs.limbs; + uint32_t i, j; + uint32_t *even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC]; + for (uint32_t i = 0; i < 2 * TLC; i++) + { + even[i] = 0; + odd[i] = 0; + } + // first row special case, no carry in no carry out. split to non parts, even and odd. 
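// Layout note for the accumulation below: the partial product a[j]*b[i] occupies result
// limbs (i+j) and (i+j+1). Products starting at an even result limb are accumulated into
// even[] at that index; products starting at an odd result limb go into odd[], shifted
// down by one (odd[k] collects contributions to result limb k+1). Keeping the two carry
// chains in separate arrays lets each row run as uninterrupted mad_lo_cc/madc_hi_cc chains,
// and the arrays are merged with a single carry propagation at the end of the function.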
+ for (i = 0; i < TLC - 1; i+=2 ) + { + mult_no_carry(b[0], a[i], &even[i]); + mult_no_carry(b[0], a[i + 1], &odd[i]); + } + + // doing two rows at one loop + for (i = 1; i < TLC - 1; i+=2) + { + // odd bi's + // multiply accumulate even part of new row with odd part prev row (needs a carry) + // // j = 0, no carry in, only carry out + odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]); + odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]); + // for loop carry in carry out + for (j = 2; j < TLC; j+=2) // 2, 4, 6 + { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row (doesnt need a carry) + // j = 1, no carry in, only carry out + even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]); + even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]); + // for loop carry in carry out + for (j = 3; j < TLC; j+=2) + { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // even bi's + // multiply accumulate even part of new row with even part of prev row // needs a carry + // j = 0, no carry in, only carry out + even[i + 1] = ptx::mad_lo_cc(a[0], b[i + 1], even[i + 1]); + even[i + 2] = ptx::madc_hi_cc(a[0], b[i + 1], even[i + 2]); + // for loop, carry in, carry out. + for (j = 2; j < TLC; j+=2) + { + even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]); + even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]); + } + even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry + + // multiply accumulate odd part of new row with odd part of prev row + // j = 1, no carry in, only carry out + odd[i + 1] = ptx::mad_lo_cc(a[1], b[i + 1], odd[i + 1]); + odd[i + 2] = ptx::madc_hi_cc(a[1], b[i + 1], odd[i + 2]); + // for loop, carry in, carry out. 
+ for (j = 3; j < TLC; j+=2) + { + odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]); + odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]); + } + + } + + odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]); + odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]); + // for loop carry in carry out + for (j = 2; j < TLC; j+=2) + { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row + // j = 1, no carry in, only carry out + even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]); + even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]); + // for loop carry in carry out + for (j = 3; j < TLC; j+=2) + { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // add even and odd parts + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < 2 * TLC - 2; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static DEVICE_INLINE void ingo_msb_multiply_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + const uint32_t *a = as.limbs; + const uint32_t *b = bs.limbs; + uint32_t *r = rs.limbs; + uint32_t i, j; + uint32_t *even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC]; + for (uint32_t i = 0; i < 2 * TLC; i++) + { + even[i] = 0; + odd[i] = 0; + } + // only last element from first row. + mult_no_carry(b[0], a[TLC - 1], &odd[TLC - 2]); + + // doing two rows at one loop + #pragma unroll + for (i = 1; i < TLC - 1; i+=2) + { + const uint32_t first_active_j = TLC - 1 - i; + const uint32_t first_active_j_odd = first_active_j + (1 - (first_active_j % 2)); + const uint32_t first_active_j_even = first_active_j + first_active_j % 2 ; + // odd bi's + // multiply accumulate even part of new row with odd part prev row (needs a carry) + // j = 0, no carry in, only carry out + odd[first_active_j_even + i - 1] = ptx::mad_lo_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i - 1]); + odd[first_active_j_even + i] = ptx::madc_hi_cc(a[first_active_j_even], b[i], odd[first_active_j_even + i]); + // for loop carry in carry out + #pragma unroll + for (j = first_active_j_even + 2; j < TLC; j+=2) + { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row (doesnt need a carry) + // j = 1, no carry in, only carry out + even[i + first_active_j_odd] = ptx::mad_lo_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd]); + even[i + first_active_j_odd + 1] = ptx::madc_hi_cc(a[first_active_j_odd], b[i], even[i + first_active_j_odd + 1]); + // for loop carry in carry out + #pragma unroll + for (j = first_active_j_odd + 2; j < TLC; j+=2) + { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // even bi's + uint32_t const first_active_j1 = TLC - 1 - (i + 1) ; + uint32_t const first_active_j_odd1 = first_active_j1 + (1 - (first_active_j1 % 2)); + uint32_t const first_active_j_even1 = first_active_j1 + first_active_j1 % 2; + // multiply accumulate even part of new row with even part of prev row // needs a carry + // j = 0, no carry in, only carry out + 
even[first_active_j_even1 + i + 1] = ptx::mad_lo_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 1]); + even[first_active_j_even1 + i + 2] = ptx::madc_hi_cc(a[first_active_j_even1], b[i + 1], even[first_active_j_even1 + i + 2]); + // for loop, carry in, carry out. + #pragma unroll + for (j = first_active_j_even1 + 2; j < TLC; j+=2) + { + even[i + j + 1] = ptx::madc_lo_cc(a[j], b[i + 1], even[i + j + 1]); + even[i + j + 2] = ptx::madc_hi_cc(a[j], b[i + 1], even[i + j + 2]); + } + even[i + j + 1] = ptx::addc(even[i + j + 1], 0); // handling last carry + + // multiply accumulate odd part of new row with odd part of prev row + // j = 1, no carry in, only carry out + odd[first_active_j_odd1 + i] = ptx::mad_lo_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i]); + odd[first_active_j_odd1+ i + 1] = ptx::madc_hi_cc(a[first_active_j_odd1], b[i + 1], odd[first_active_j_odd1 + i + 1]); + // for loop, carry in, carry out. + #pragma unroll + for (j = first_active_j_odd1 + 2; j < TLC; j+=2) + { + odd[i + j] = ptx::madc_lo_cc(a[j], b[i + 1], odd[i + j]); + odd[i + j + 1] = ptx::madc_hi_cc(a[j], b[i + 1], odd[i + j + 1]); + } + + } + + // last round, i = TLC - 1 + odd[i - 1] = ptx::mad_lo_cc(a[0], b[i], odd[i - 1]); + odd[i] = ptx::madc_hi_cc(a[0], b[i], odd[i]); + // for loop carry in carry out + #pragma unroll + for (j = 2; j < TLC; j+=2) + { + odd[i + j - 1] = ptx::madc_lo_cc(a[j], b[i], odd[i + j - 1]); + odd[i + j] = ptx::madc_hi_cc(a[j], b[i], odd[i + j]); + } + odd[i + j - 1] = ptx::addc(odd[i + j - 1], 0); // handling last carry + + // multiply accumulate odd part of new row with even part prev row + // j = 1, no carry in, only carry out + even[i + 1] = ptx::mad_lo_cc(a[1], b[i], even[i + 1]); + even[i + 2] = ptx::madc_hi_cc(a[1], b[i], even[i + 2]); + // for loop carry in carry out + #pragma unroll + for (j = 3; j < TLC; j+=2) + { + even[i + j] = ptx::madc_lo_cc(a[j], b[i], even[i + j]); + even[i + j + 1] = ptx::madc_hi_cc(a[j], b[i], even[i + j + 1]); + } + + // add even and odd parts + even[1] = ptx::add_cc(even[1], odd[0]); + #pragma unroll + for (i = 1; i < 2 * TLC - 2; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static DEVICE_INLINE void multiply_lsb_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + // r = a * b is correcrt for the first TLC + 1 digits. (not computing from TLC + 1 to 2*TLC - 2). 
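// Where this is consumed: the Barrett-style reduce() further down uses this routine for the
// l_hi * modulus term; the remainder it derives is then only read through sub_limbs_partial
// over TLC + 1 limbs and Wide::get_lower over TLC limbs, which is why only the first
// TLC + 1 digits need to be computed exactly here.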
+ const uint32_t *a = as.limbs; + const uint32_t *b = bs.limbs; + uint32_t *even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC - 2]; + mul_n(even, a, b[0]); + mul_n(odd, a + 1, b[0]); + mad_row(&even[2], &odd[0], a, b[1]); + size_t i; + #pragma unroll + for (i = 2; i < TLC - 1; i += 2) { + mad_row(&odd[i], &even[i], a, b[i], TLC - i + 2); + mad_row(&even[i + 2], &odd[i], a, b[i + 1], TLC - i + 2); + } + + // merge |even| and |odd| + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < TLC + 1; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + + static DEVICE_INLINE void multiply_msb_raw_device(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + const uint32_t *a = as.limbs; + const uint32_t *b = bs.limbs; + uint32_t *even = rs.limbs; + __align__(8) uint32_t odd[2 * TLC - 2]; + for (int i=0; i<2*TLC - 1; i++) + { + even[i] = 0; + odd[i] = 0; + } + uint32_t min_indexes_sum = TLC - 1; + // only diagonal + mul_n_msb(even, a, b[0], TLC, min_indexes_sum); + mul_n_msb(odd, a + 1, b[0], TLC, min_indexes_sum - 1); + mad_row_msb(&even[2], &odd[0], a, b[1], TLC, min_indexes_sum - 1); + size_t i; + #pragma unroll + for (i = 2; i < TLC - 1; i += 2) { + mad_row(&odd[i], &even[i], a, b[i]); + mad_row(&even[i + 2], &odd[i], a, b[i + 1]); + } + // merge |even| and |odd| + even[1] = ptx::add_cc(even[1], odd[0]); + for (i = 1; i < 2 * TLC - 2; i++) + even[i + 1] = ptx::addc_cc(even[i + 1], odd[i]); + even[i + 1] = ptx::addc(even[i + 1], 0); + } + static HOST_INLINE void multiply_raw_host(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { const uint32_t *a = as.limbs; const uint32_t *b = bs.limbs; uint32_t *r = rs.limbs; for (unsigned i = 0; i < TLC; i++) { uint32_t carry = 0; - for (unsigned j = 0; j < TLC; j++) + for (unsigned j = 0; j < TLC; j++) r[j + i] = host_math::madc_cc(a[j], b[i], r[j + i], carry); r[TLC + i] = carry; } @@ -315,6 +650,22 @@ template class Field { #endif } + static HOST_DEVICE_INLINE void multiply_raw_lsb(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + #ifdef __CUDA_ARCH__ + return multiply_lsb_raw_device(as, bs, rs); + #else + return multiply_raw_host(as, bs, rs); + #endif + } + + static HOST_DEVICE_INLINE void multiply_raw_msb(const ff_storage &as, const ff_storage &bs, ff_wide_storage &rs) { + #ifdef __CUDA_ARCH__ + return multiply_raw_device(as, bs, rs); + #else + return multiply_raw_host(as, bs, rs); + #endif + } + public: ff_storage limbs_storage; @@ -345,7 +696,7 @@ template class Field { return value; } - template static constexpr HOST_DEVICE_INLINE Field reduce(const Field &xs) { + template static constexpr HOST_DEVICE_INLINE Field sub_modulus(const Field &xs) { if (REDUCTION_SIZE == 0) return xs; const ff_storage modulus = get_modulus(); @@ -368,7 +719,7 @@ template class Field { friend HOST_DEVICE_INLINE Field operator+(Field xs, const Field& ys) { Field rs = {}; add_limbs(xs.limbs_storage, ys.limbs_storage, rs.limbs_storage); - return reduce<1>(rs); + return sub_modulus<1>(rs); } friend HOST_DEVICE_INLINE Field operator-(Field xs, const Field& ys) { @@ -388,20 +739,49 @@ template class Field { return rs; } - friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys) { - Wide xy = mul_wide(xs, ys); - Field xy_hi = xy.get_higher_with_slack(); + static constexpr DEVICE_INLINE uint32_t sub_limbs_partial_device(uint32_t *x, uint32_t *y, uint32_t *r, uint32_t num_limbs) { + r[0] = ptx::sub_cc(x[0], y[0]); + #pragma unroll + for (unsigned i = 1; i 
< num_limbs; i++) + r[i] = ptx::subc_cc(x[i], y[i]); + return ptx::subc(0, 0); + } + + static constexpr HOST_DEVICE_INLINE uint32_t sub_limbs_partial(uint32_t *x, uint32_t *y, uint32_t *r, uint32_t num_limbs) { + #ifdef __CUDA_ARCH__ + return sub_limbs_partial_device(x, y, r, num_limbs); + #else + return sub_limbs_partial_host(x, y, r, num_limbs); + #endif + } + + template + static constexpr HOST_DEVICE_INLINE Field reduce(const Wide& xs) { + Field xs_hi = Wide::get_higher_with_slack(xs); // xy << slack_bits Wide l = {}; - multiply_raw(xy_hi.limbs_storage, get_m(), l.limbs_storage); - Field l_hi = l.get_higher_with_slack(); + multiply_raw_msb(xs_hi.limbs_storage, get_m(), l.limbs_storage); // MSB mult + Field l_hi = Wide::get_higher_with_slack(l); Wide lp = {}; - multiply_raw(l_hi.limbs_storage, get_modulus(), lp.limbs_storage); - Wide r_wide = xy - lp; + multiply_raw_lsb(l_hi.limbs_storage, get_modulus(), lp.limbs_storage); // LSB mult + Wide r_wide = xs - lp; Wide r_wide_reduced = {}; - uint32_t reduced = sub_limbs(r_wide.limbs_storage, modulus_wide(), r_wide_reduced.limbs_storage); - r_wide = reduced ? r_wide : r_wide_reduced; - Field r = r_wide.get_lower(); - return reduce<1>(r); + for (unsigned i = 0; i < TLC + 1; i++) + { + uint32_t carry = sub_limbs_partial(r_wide.limbs_storage.limbs, modulus_wide().limbs, r_wide_reduced.limbs_storage.limbs, TLC + 1); + if (carry == 0) // continue to reduce + r_wide = r_wide_reduced; + else // done + break; + } + + // number of wrap around is bounded by TLC + 1 times. + Field r = Wide::get_lower(r_wide); + return r; + } + + friend HOST_DEVICE_INLINE Field operator*(const Field& xs, const Field& ys) { + Wide xy = mul_wide(xs, ys); // full mult + return reduce(xy); } friend HOST_DEVICE_INLINE bool operator==(const Field& xs, const Field& ys) { @@ -425,8 +805,19 @@ template class Field { return !(xs == ys); } - template static constexpr HOST_DEVICE_INLINE T mul_const(const T &xs) { - return mul_unsigned(xs); + template + static HOST_DEVICE_INLINE Field mul_const(const Field& xs) { + Field mul = multiplier; + static bool is_u32 = true; + #ifdef __CUDA_ARCH__ + #pragma unroll + #endif + for (unsigned i = 1; i < TLC; i++) + is_u32 &= (mul.limbs_storage.limbs[i] == 0); + + if (is_u32) + return mul_unsigned(xs); + return mul * xs; } template @@ -485,7 +876,7 @@ template class Field { #endif } r[TLC - 1] = x[TLC - 1] >> 1; - return reduce(rs); + return sub_modulus(rs); } static constexpr HOST_DEVICE_INLINE bool lt(const Field &xs, const Field &ys) { diff --git a/icicle/primitives/projective.cuh b/icicle/primitives/projective.cuh index d73d711f7..5ba274818 100644 --- a/icicle/primitives/projective.cuh +++ b/icicle/primitives/projective.cuh @@ -33,47 +33,47 @@ class Projective { } friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Projective& p2) { - const FF X1 = p1.x; // < 2 - const FF Y1 = p1.y; // < 2 - const FF Z1 = p1.z; // < 2 - const FF X2 = p2.x; // < 2 - const FF Y2 = p2.y; // < 2 - const FF Z2 = p2.z; // < 2 - const FF t00 = X1 * X2; // t00 ← X1 · X2 < 2 - const FF t01 = Y1 * Y2; // t01 ← Y1 · Y2 < 2 - const FF t02 = Z1 * Z2; // t02 ← Z1 · Z2 < 2 - const FF t03 = X1 + Y1; // t03 ← X1 + Y1 < 4 - const FF t04 = X2 + Y2; // t04 ← X2 + Y2 < 4 - const FF t05 = t03 * t04; // t03 ← t03 · t04 < 3 - const FF t06 = t00 + t01; // t06 ← t00 + t01 < 4 - const FF t07 = t05 - t06; // t05 ← t05 − t06 < 2 - const FF t08 = Y1 + Z1; // t08 ← Y1 + Z1 < 4 - const FF t09 = Y2 + Z2; // t09 ← Y2 + Z2 < 4 - const FF t10 = t08 * t09; // t10 ← t08 · t09 < 3 - 
const FF t11 = t01 + t02; // t11 ← t01 + t02 < 4 - const FF t12 = t10 - t11; // t12 ← t10 − t11 < 2 - const FF t13 = X1 + Z1; // t13 ← X1 + Z1 < 4 - const FF t14 = X2 + Z2; // t14 ← X2 + Z2 < 4 - const FF t15 = t13 * t14; // t15 ← t13 · t14 < 3 - const FF t16 = t00 + t02; // t16 ← t00 + t02 < 4 - const FF t17 = t15 - t16; // t17 ← t15 − t16 < 2 - const FF t18 = t00 + t00; // t18 ← t00 + t00 < 2 - const FF t19 = t18 + t00; // t19 ← t18 + t00 < 2 + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF Z2 = p2.z; // < 2 + const FF t00 = X1 * X2; // t00 ← X1 · X2 < 2 + const FF t01 = Y1 * Y2; // t01 ← Y1 · Y2 < 2 + const FF t02 = Z1 * Z2; // t02 ← Z1 · Z2 < 2 + const FF t03 = X1 + Y1; // t03 ← X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 ← X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 ← t03 · t04 < 3 + const FF t06 = t00 + t01; // t06 ← t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 ← t05 − t06 < 2 + const FF t08 = Y1 + Z1; // t08 ← Y1 + Z1 < 4 + const FF t09 = Y2 + Z2; // t09 ← Y2 + Z2 < 4 + const FF t10 = t08 * t09; // t10 ← t08 · t09 < 3 + const FF t11 = t01 + t02; // t11 ← t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 ← t10 − t11 < 2 + const FF t13 = X1 + Z1; // t13 ← X1 + Z1 < 4 + const FF t14 = X2 + Z2; // t14 ← X2 + Z2 < 4 + const FF t15 = t13 * t14; // t15 ← t13 · t14 < 3 + const FF t16 = t00 + t02; // t16 ← t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 ← t15 − t16 < 2 + const FF t18 = t00 + t00; // t18 ← t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 ← t18 + t00 < 2 const FF t20 = FF::template mul_unsigned<3>( - FF::template mul_const(t02)); // t20 ← b3 · t02 < 2 - const FF t21 = t01 + t20; // t21 ← t01 + t20 < 2 - const FF t22 = t01 - t20; // t22 ← t01 − t20 < 2 + FF::template mul_const(t02)); // t20 ← b3 · t02 < 2 + const FF t21 = t01 + t20; // t21 ← t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 ← t01 − t20 < 2 const FF t23 = FF::template mul_unsigned<3>( - FF::template mul_const(t17)); // t23 ← b3 · t17 < 2 - const FF t24 = t12 * t23; // t24 ← t12 · t23 < 2 - const FF t25 = t07 * t22; // t25 ← t07 · t22 < 2 - const FF X3 = t25 - t24; // X3 ← t25 − t24 < 2 - const FF t27 = t23 * t19; // t27 ← t23 · t19 < 2 - const FF t28 = t22 * t21; // t28 ← t22 · t21 < 2 - const FF Y3 = t28 + t27; // Y3 ← t28 + t27 < 2 - const FF t30 = t19 * t07; // t30 ← t19 · t07 < 2 - const FF t31 = t21 * t12; // t31 ← t21 · t12 < 2 - const FF Z3 = t31 + t30; // Z3 ← t31 + t30 < 2 + FF::template mul_const(t17)); // t23 ← b3 · t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 ← t12 · t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 ← t07 · t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 ← t25 − t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 ← t23 · t19 < 2 + const auto t28 = FF::mul_wide(t22, t21); // t28 ← t22 · t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 ← t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 ← t19 · t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 ← t21 · t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 ← t31 + t30 < 2 return {X3, Y3, Z3}; } @@ -82,13 +82,47 @@ class Projective { } friend HOST_DEVICE_INLINE Projective operator+(Projective p1, const Affine& p2) { - // TODO: change the implementation to a more efficient mixed adder later on - return p1 + from_affine(p2); - } - - friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point) { - os << "Point { x: " << point.x << "; y: " << 
point.y << "; z: " << point.z << " }"; - return os; + const FF X1 = p1.x; // < 2 + const FF Y1 = p1.y; // < 2 + const FF Z1 = p1.z; // < 2 + const FF X2 = p2.x; // < 2 + const FF Y2 = p2.y; // < 2 + const FF t00 = X1 * X2; // t00 ← X1 · X2 < 2 + const FF t01 = Y1 * Y2; // t01 ← Y1 · Y2 < 2 + const FF t02 = Z1; // t02 ← Z1 < 2 + const FF t03 = X1 + Y1; // t03 ← X1 + Y1 < 4 + const FF t04 = X2 + Y2; // t04 ← X2 + Y2 < 4 + const FF t05 = t03 * t04; // t03 ← t03 · t04 < 3 + const FF t06 = t00 + t01; // t06 ← t00 + t01 < 4 + const FF t07 = t05 - t06; // t05 ← t05 − t06 < 2 + const FF t08 = Y1 + Z1; // t08 ← Y1 + Z1 < 4 + const FF t09 = Y2 + FF::one(); // t09 ← Y2 + 1 < 4 + const FF t10 = t08 * t09; // t10 ← t08 · t09 < 3 + const FF t11 = t01 + t02; // t11 ← t01 + t02 < 4 + const FF t12 = t10 - t11; // t12 ← t10 − t11 < 2 + const FF t13 = X1 + Z1; // t13 ← X1 + Z1 < 4 + const FF t14 = X2 + FF::one(); // t14 ← X2 + 1 < 4 + const FF t15 = t13 * t14; // t15 ← t13 · t14 < 3 + const FF t16 = t00 + t02; // t16 ← t00 + t02 < 4 + const FF t17 = t15 - t16; // t17 ← t15 − t16 < 2 + const FF t18 = t00 + t00; // t18 ← t00 + t00 < 2 + const FF t19 = t18 + t00; // t19 ← t18 + t00 < 2 + const FF t20 = FF::template mul_unsigned<3>( + FF::template mul_const(t02)); // t20 ← b3 · t02 < 2 + const FF t21 = t01 + t20; // t21 ← t01 + t20 < 2 + const FF t22 = t01 - t20; // t22 ← t01 − t20 < 2 + const FF t23 = FF::template mul_unsigned<3>( + FF::template mul_const(t17)); // t23 ← b3 · t17 < 2 + const auto t24 = FF::mul_wide(t12, t23); // t24 ← t12 · t23 < 2 + const auto t25 = FF::mul_wide(t07, t22); // t25 ← t07 · t22 < 2 + const FF X3 = FF::reduce(t25 - t24); // X3 ← t25 − t24 < 2 + const auto t27 = FF::mul_wide(t23, t19); // t27 ← t23 · t19 < 2 + const auto t28 = FF::mul_wide(t22, t21); // t28 ← t22 · t21 < 2 + const FF Y3 = FF::reduce(t28 + t27); // Y3 ← t28 + t27 < 2 + const auto t30 = FF::mul_wide(t19, t07); // t30 ← t19 · t07 < 2 + const auto t31 = FF::mul_wide(t21, t12); // t31 ← t21 · t12 < 2 + const FF Z3 = FF::reduce(t31 + t30); // Z3 ← t31 + t30 < 2 + return {X3, Y3, Z3}; } friend HOST_DEVICE_INLINE Projective operator-(Projective p1, const Affine& p2) { @@ -115,6 +149,11 @@ class Projective { return (p1.x * p2.z == p2.x * p1.z) && (p1.y * p2.z == p2.y * p1.z); } + friend HOST_INLINE std::ostream& operator<<(std::ostream& os, const Projective& point) { + os << "Point { x: " << point.x << "; y: " << point.y << "; z: " << point.z << " }"; + return os; + } + static HOST_DEVICE_INLINE bool is_zero(const Projective &point) { return point.x == FF::zero() && point.y != FF::zero() && point.z == FF::zero(); } diff --git a/icicle/primitives/test.cu b/icicle/primitives/test.cu index 8870cbe3d..adc6572d5 100644 --- a/icicle/primitives/test.cu +++ b/icicle/primitives/test.cu @@ -1,8 +1,9 @@ #include #include - #include "test_kernels.cuh" - +#include +#include +namespace mp = boost::multiprecision; template int device_populate_random(T* d_elements, unsigned n) { @@ -20,49 +21,63 @@ int device_set(T* d_elements, T el, unsigned n) { return cudaMemcpy(d_elements, h_elements, sizeof(T) * n, cudaMemcpyHostToDevice); } +mp::int1024_t convert_to_boost_mp(uint32_t *a, uint32_t length) +{ + mp::int1024_t res = 0; + for (uint32_t i = 0; i < length; i++) + { + res += (mp::int1024_t)(a[i]) << 32 * i; + } + return res; +} + class PrimitivesTest : public ::testing::Test { protected: - static const unsigned n = 1 << 5; - - proj *points1{}; - proj *points2{}; - g2_proj *g2_points1{}; - g2_proj *g2_points2{}; - scalar_field *scalars1{}; - 
scalar_field *scalars2{}; - proj *zero_points{}; - g2_proj *g2_zero_points{}; - scalar_field *zero_scalars{}; - scalar_field *one_scalars{}; - affine *aff_points{}; - g2_affine *g2_aff_points{}; - proj *res_points1{}; - proj *res_points2{}; - g2_proj *g2_res_points1{}; - g2_proj *g2_res_points2{}; - scalar_field *res_scalars1{}; - scalar_field *res_scalars2{}; + static const unsigned n = 1 << 4; + + projective_t *points1{}; + projective_t *points2{}; + g2_projective_t *g2_points1{}; + g2_projective_t *g2_points2{}; + scalar_field_t *scalars1{}; + scalar_field_t *scalars2{}; + projective_t *zero_points{}; + g2_projective_t *g2_zero_points{}; + scalar_field_t *zero_scalars{}; + scalar_field_t *one_scalars{}; + affine_t *aff_points{}; + g2_affine_t *g2_aff_points{}; + projective_t *res_points1{}; + projective_t *res_points2{}; + g2_projective_t *g2_res_points1{}; + g2_projective_t *g2_res_points2{}; + scalar_field_t *res_scalars1{}; + scalar_field_t *res_scalars2{}; + scalar_field_t::Wide *res_scalars_wide{}; + scalar_field_t::Wide *res_scalars_wide_full{}; PrimitivesTest() { assert(!cudaDeviceReset()); - assert(!cudaMallocManaged(&points1, n * sizeof(proj))); - assert(!cudaMallocManaged(&points2, n * sizeof(proj))); - assert(!cudaMallocManaged(&g2_points1, n * sizeof(g2_proj))); - assert(!cudaMallocManaged(&g2_points2, n * sizeof(g2_proj))); - assert(!cudaMallocManaged(&scalars1, n * sizeof(scalar_field))); - assert(!cudaMallocManaged(&scalars2, n * sizeof(scalar_field))); - assert(!cudaMallocManaged(&zero_points, n * sizeof(proj))); - assert(!cudaMallocManaged(&g2_zero_points, n * sizeof(g2_proj))); - assert(!cudaMallocManaged(&zero_scalars, n * sizeof(scalar_field))); - assert(!cudaMallocManaged(&one_scalars, n * sizeof(scalar_field))); - assert(!cudaMallocManaged(&aff_points, n * sizeof(affine))); - assert(!cudaMallocManaged(&g2_aff_points, n * sizeof(g2_affine))); - assert(!cudaMallocManaged(&res_points1, n * sizeof(proj))); - assert(!cudaMallocManaged(&res_points2, n * sizeof(proj))); - assert(!cudaMallocManaged(&g2_res_points1, n * sizeof(g2_proj))); - assert(!cudaMallocManaged(&g2_res_points2, n * sizeof(g2_proj))); - assert(!cudaMallocManaged(&res_scalars1, n * sizeof(scalar_field))); - assert(!cudaMallocManaged(&res_scalars2, n * sizeof(scalar_field))); + assert(!cudaMallocManaged(&points1, n * sizeof(projective_t))); + assert(!cudaMallocManaged(&points2, n * sizeof(projective_t))); + assert(!cudaMallocManaged(&g2_points1, n * sizeof(g2_projective_t))); + assert(!cudaMallocManaged(&g2_points2, n * sizeof(g2_projective_t))); + assert(!cudaMallocManaged(&scalars1, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&scalars2, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&zero_points, n * sizeof(projective_t))); + assert(!cudaMallocManaged(&g2_zero_points, n * sizeof(g2_projective_t))); + assert(!cudaMallocManaged(&zero_scalars, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&one_scalars, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&aff_points, n * sizeof(affine_t))); + assert(!cudaMallocManaged(&g2_aff_points, n * sizeof(g2_affine_t))); + assert(!cudaMallocManaged(&res_points1, n * sizeof(projective_t))); + assert(!cudaMallocManaged(&res_points2, n * sizeof(projective_t))); + assert(!cudaMallocManaged(&g2_res_points1, n * sizeof(g2_projective_t))); + assert(!cudaMallocManaged(&g2_res_points2, n * sizeof(g2_projective_t))); + assert(!cudaMallocManaged(&res_scalars1, n * sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&res_scalars2, n * 
sizeof(scalar_field_t))); + assert(!cudaMallocManaged(&res_scalars_wide, n * sizeof(scalar_field_t::Wide))); + assert(!cudaMallocManaged(&res_scalars_wide_full, n * sizeof(scalar_field_t::Wide))); } ~PrimitivesTest() override { @@ -84,28 +99,34 @@ protected: cudaFree(g2_res_points2); cudaFree(res_scalars1); cudaFree(res_scalars2); + + cudaFree(res_scalars_wide); + cudaFree(res_scalars_wide_full); + cudaDeviceReset(); } void SetUp() override { - ASSERT_EQ(device_populate_random(points1, n), cudaSuccess); - ASSERT_EQ(device_populate_random(points2, n), cudaSuccess); - ASSERT_EQ(device_populate_random(g2_points1, n), cudaSuccess); - ASSERT_EQ(device_populate_random(g2_points2, n), cudaSuccess); - ASSERT_EQ(device_populate_random(scalars1, n), cudaSuccess); - ASSERT_EQ(device_populate_random(scalars2, n), cudaSuccess); - ASSERT_EQ(device_set(zero_points, proj::zero(), n), cudaSuccess); - ASSERT_EQ(device_set(g2_zero_points, g2_proj::zero(), n), cudaSuccess); - ASSERT_EQ(device_set(zero_scalars, scalar_field::zero(), n), cudaSuccess); - ASSERT_EQ(device_set(one_scalars, scalar_field::one(), n), cudaSuccess); - ASSERT_EQ(cudaMemset(aff_points, 0, n * sizeof(affine)), cudaSuccess); - ASSERT_EQ(cudaMemset(g2_aff_points, 0, n * sizeof(g2_affine)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_points1, 0, n * sizeof(proj)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_points2, 0, n * sizeof(proj)), cudaSuccess); - ASSERT_EQ(cudaMemset(g2_res_points1, 0, n * sizeof(g2_proj)), cudaSuccess); - ASSERT_EQ(cudaMemset(g2_res_points2, 0, n * sizeof(g2_proj)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_scalars1, 0, n * sizeof(scalar_field)), cudaSuccess); - ASSERT_EQ(cudaMemset(res_scalars2, 0, n * sizeof(scalar_field)), cudaSuccess); + ASSERT_EQ(device_populate_random(points1, n), cudaSuccess); + ASSERT_EQ(device_populate_random(points2, n), cudaSuccess); + ASSERT_EQ(device_populate_random(g2_points1, n), cudaSuccess); + ASSERT_EQ(device_populate_random(g2_points2, n), cudaSuccess); + ASSERT_EQ(device_populate_random(scalars1, n), cudaSuccess); + ASSERT_EQ(device_populate_random(scalars2, n), cudaSuccess); + ASSERT_EQ(device_set(zero_points, projective_t::zero(), n), cudaSuccess); + ASSERT_EQ(device_set(g2_zero_points, g2_projective_t::zero(), n), cudaSuccess); + ASSERT_EQ(device_set(zero_scalars, scalar_field_t::zero(), n), cudaSuccess); + ASSERT_EQ(device_set(one_scalars, scalar_field_t::one(), n), cudaSuccess); + ASSERT_EQ(cudaMemset(aff_points, 0, n * sizeof(affine_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(g2_aff_points, 0, n * sizeof(g2_affine_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_points1, 0, n * sizeof(projective_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_points2, 0, n * sizeof(projective_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(g2_res_points1, 0, n * sizeof(g2_projective_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(g2_res_points2, 0, n * sizeof(g2_projective_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_scalars1, 0, n * sizeof(scalar_field_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_scalars2, 0, n * sizeof(scalar_field_t)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_scalars_wide, 0, n * sizeof(scalar_field_t::Wide)), cudaSuccess); + ASSERT_EQ(cudaMemset(res_scalars_wide_full, 0, n * sizeof(scalar_field_t::Wide)), cudaSuccess); } }; @@ -183,7 +204,7 @@ TEST_F(PrimitivesTest, FieldMultiplicationSqrEq) { TEST_F(PrimitivesTest, ECRandomPointsAreOnCurve) { for (unsigned i = 0; i < n; i++) - ASSERT_PRED1(proj::is_on_curve, points1[i]); + ASSERT_PRED1(projective_t::is_on_curve, points1[i]); } 
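// A condensed, host-only sketch of the reference-check pattern the multiprecision tests
// below build on, assuming the convert_to_boost_mp helper and scalar_field_t type already
// defined in this file; the function and parameter names are illustrative, not part of the
// test suite.
bool wide_product_matches(uint32_t *a_limbs, uint32_t *b_limbs, uint32_t *gpu_wide_limbs) {
  const uint32_t tlc = scalar_field_t::TLC;
  // Rebuild both operands and the GPU's 2*TLC-limb wide result as big integers, then
  // compare against the exact product computed on the host by boost::multiprecision.
  mp::int1024_t a = convert_to_boost_mp(a_limbs, tlc);
  mp::int1024_t b = convert_to_boost_mp(b_limbs, tlc);
  mp::int1024_t gpu = convert_to_boost_mp(gpu_wide_limbs, 2 * tlc);
  return gpu == a * b;
}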
TEST_F(PrimitivesTest, ECPointAdditionSubtractionCancel) { @@ -260,7 +281,7 @@ TEST_F(PrimitivesTest, ECScalarMultiplicationIsDistributiveOverAddition) { TEST_F(PrimitivesTest, ECProjectiveToAffine) { ASSERT_EQ(point_vec_to_affine(points1, aff_points, n), cudaSuccess); for (unsigned i = 0; i < n; i++) - ASSERT_EQ(points1[i], proj::from_affine(aff_points[i])); + ASSERT_EQ(points1[i], projective_t::from_affine(aff_points[i])); } TEST_F(PrimitivesTest, ECMixedPointAddition) { @@ -279,9 +300,192 @@ TEST_F(PrimitivesTest, ECMixedAdditionOfNegatedPointEqSubtraction) { ASSERT_EQ(res_points1[i], points1[i] + res_points2[i]); } +TEST_F(PrimitivesTest, MP_LSB_MULT) { + // LSB multiply, check correctness of first TLC + 1 digits result. + ASSERT_EQ(mp_lsb_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess); + std::cout << "first GPU lsb mult output = 0x"; + for (int i=0; i<2*scalar_field_t::TLC; i++) + { + std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i]; + } + std::cout << std::endl; + + + ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess); + std::cout << "first GPU full mult output = 0x"; + for (int i=0; i<2*scalar_field_t::TLC; i++) + { + std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i]; + } + std::cout << std::endl; + for (int j = 0; j < n; j++) + { + for (int i=0; i=0 ; i--) + { + std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " "; + } + std::cout << std::endl; + + + ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess); + std::cout << "first GPU full mult output = 0x"; + for (int i=2*scalar_field_t::TLC - 1; i >=0 ; i--) + { + std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " "; + } + + std::cout << std::endl; + + for (int i=0; i < 2*scalar_field_t::TLC - 1; i++) + { + if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i]) + std::cout << "matched word idx = " << i << std::endl; + } + +} + +TEST_F(PrimitivesTest, INGO_MP_MULT) { + // MSB multiply, take n msb bits of multiplication, assert that the error is up to 1. 
+ ASSERT_EQ(ingo_mp_mult(scalars1, scalars2, res_scalars_wide), cudaSuccess); + std::cout << "INGO = 0x"; + for (int i=0; i < 2*scalar_field_t::TLC ; i++) + { + std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " "; + } + std::cout << std::endl; + + + ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess); + std::cout << "ZKSYNC = 0x"; + for (int i=0; i < 2*scalar_field_t::TLC ; i++) + { + std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " "; + } + + std::cout << std::endl; + + for (int i=0; i < 2*scalar_field_t::TLC - 1; i++) + { + if (res_scalars_wide_full[0].limbs_storage.limbs[i] == res_scalars_wide[0].limbs_storage.limbs[i]) + std::cout << "matched word idx = " << i << std::endl; + } + for (int j=0; j= 0 ; i--) + { + std::cout << std::hex << res_scalars_wide[0].limbs_storage.limbs[i] << " "; + } + std::cout << std::endl; + + ASSERT_EQ(mp_mult(scalars1, scalars2, res_scalars_wide_full), cudaSuccess); + std::cout << "ZKSYNC = 0x"; + for (int i=2*scalar_field_t::TLC - 1; i >= 0 ; i--) + { + std::cout << std::hex << res_scalars_wide_full[0].limbs_storage.limbs[i] << " "; + } + + std::cout << std::endl; + + + // for (int i=scalar_field::TLC; i < 2*scalar_field::TLC - 1; i++) + // { + // ASSERT_EQ(in_bound, true); + // } + // for (int j=0; j> (num_limbs * 32); + res_gpu = convert_to_boost_mp(&(res_scalars_wide[j]).limbs_storage.limbs[num_limbs], num_limbs); + std::cout << "res mp = " << res_mp << std::endl; + std::cout << "res gpu = " << res_gpu << std::endl; + std::cout << "error = " << res_mp - res_gpu << std::endl; + bool upper_bound = res_gpu <= res_mp; + bool lower_bound = res_gpu > (res_mp - num_limbs); + bool in_bound = upper_bound && lower_bound; + + + ASSERT_EQ(in_bound, true); + } +} + +TEST_F(PrimitivesTest, INGO_MP_MOD_MULT) { + std::cout << " taking num limbs " << std::endl; + uint32_t num_limbs = scalar_field_t::TLC; + std::cout << " calling gpu... = " << std::endl; + ASSERT_EQ(ingo_mp_mod_mult(scalars1, scalars2, res_scalars1, n), cudaSuccess); + std::cout << " gpu call done " << std::endl; + // mp testing + mp::int1024_t scalar_1_mp = 0; + mp::int1024_t scalar_2_mp = 0; + mp::int1024_t res_mp = 0; + mp::int1024_t res_gpu = 0; + mp::int1024_t p = convert_to_boost_mp(scalar_field_t::get_modulus().limbs, num_limbs); + std::cout << " p = " << p << std::endl; + + + for (int j=0; j scalar_field; -typedef Field base_field; -typedef Affine affine; -static constexpr base_field b = base_field{ weierstrass_b }; -typedef Projective proj; -typedef ExtensionField base_extension_field; -typedef Affine g2_affine; -static constexpr base_extension_field b2 = base_extension_field{ base_field {b_re}, base_field {b_im}}; -typedef Projective g2_proj; +#endif +using namespace BN254; template __global__ void add_elements_kernel(const T1 *x, const T2 *y, T1 *result, const unsigned count) { @@ -72,27 +68,27 @@ template int vec_mul(const F *x, const G *y, G *result, const return error ? 
error : cudaDeviceSynchronize(); } -__global__ void inv_field_elements_kernel(const scalar_field *x, scalar_field *result, const unsigned count) { +__global__ void inv_field_elements_kernel(const scalar_field_t *x, scalar_field_t *result, const unsigned count) { const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; if (gid >= count) return; - result[gid] = scalar_field::inverse(x[gid]); + result[gid] = scalar_field_t::inverse(x[gid]); } -int field_vec_inv(const scalar_field *x, scalar_field *result, const unsigned count) { +int field_vec_inv(const scalar_field_t *x, scalar_field_t *result, const unsigned count) { inv_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count); int error = cudaGetLastError(); return error ? error : cudaDeviceSynchronize(); } -__global__ void sqr_field_elements_kernel(const scalar_field *x, scalar_field *result, const unsigned count) { +__global__ void sqr_field_elements_kernel(const scalar_field_t *x, scalar_field_t *result, const unsigned count) { const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; if (gid >= count) return; - result[gid] = scalar_field::sqr(x[gid]); + result[gid] = scalar_field_t::sqr(x[gid]); } -int field_vec_sqr(const scalar_field *x, scalar_field *result, const unsigned count) { +int field_vec_sqr(const scalar_field_t *x, scalar_field_t *result, const unsigned count) { sqr_field_elements_kernel<<<(count - 1) / 32 + 1, 32>>>(x, result, count); int error = cudaGetLastError(); return error ? error : cudaDeviceSynchronize(); @@ -111,3 +107,87 @@ template int point_vec_to_affine(const P *x, A *result, const int error = cudaGetLastError(); return error ? error : cudaDeviceSynchronize(); } + + +__global__ void mp_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + scalar_field_t::multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); +} + + +int mp_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result) +{ + mp_mult_kernel<<<1, 32>>>(x, y, result); + int error = cudaGetLastError(); + return error ? error : cudaDeviceSynchronize(); +} + + + +__global__ void mp_lsb_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + scalar_field_t::multiply_lsb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); +} + + +int mp_lsb_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result) +{ + mp_lsb_mult_kernel<<<1, 32>>>(x, y, result); + int error = cudaGetLastError(); + return error ? error : cudaDeviceSynchronize(); +} + +__global__ void mp_msb_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + scalar_field_t::multiply_msb_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); +} + + +int mp_msb_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result) +{ + mp_msb_mult_kernel<<<1, 1>>>(x, y, result); + int error = cudaGetLastError(); + return error ? 
error : cudaDeviceSynchronize(); +} + + +__global__ void ingo_mp_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + scalar_field_t::ingo_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); +} + + +int ingo_mp_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result) +{ + ingo_mp_mult_kernel<<<1, 32>>>(x, y, result); + int error = cudaGetLastError(); + return error ? error : cudaDeviceSynchronize(); +} + + +__global__ void ingo_mp_msb_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t::Wide *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + scalar_field_t::ingo_msb_multiply_raw_device(x[gid].limbs_storage, y[gid].limbs_storage, result[gid].limbs_storage); +} + + +int ingo_mp_msb_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t::Wide *result, const unsigned n) +{ + ingo_mp_msb_mult_kernel<<<1, n>>>(x, y, result); + int error = cudaGetLastError(); + return error ? error : cudaDeviceSynchronize(); +} + + +__global__ void ingo_mp_mod_mult_kernel(const scalar_field_t *x, const scalar_field_t *y, scalar_field_t *result) { + const unsigned gid = blockIdx.x * blockDim.x + threadIdx.x; + result[gid] = x[gid] * y[gid]; +} + + +int ingo_mp_mod_mult(const scalar_field_t *x, scalar_field_t *y, scalar_field_t *result, const unsigned n) +{ + ingo_mp_mod_mult_kernel<<<1, n>>>(x, y, result); + int error = cudaGetLastError(); + return error ? error : cudaDeviceSynchronize(); +} \ No newline at end of file diff --git a/icicle/utils/mont.cuh b/icicle/utils/mont.cuh new file mode 100644 index 000000000..a41071020 --- /dev/null +++ b/icicle/utils/mont.cuh @@ -0,0 +1,25 @@ +#pragma once + +#include "../appUtils/vector_manipulation/ve_mod_mult.cuh" + +template <typename E> +int convert_montgomery(E *d_inout, size_t n_elements, bool is_into, cudaStream_t stream) +{ + // Set the grid and block dimensions + int num_threads = MAX_THREADS_PER_BLOCK; + int num_blocks = (n_elements + num_threads - 1) / num_threads; + E mont = is_into ? E::montgomery_r() : E::montgomery_r_inv(); + template_normalize_kernel<<<num_blocks, num_threads, 0, stream>>>(d_inout, n_elements, mont); + + return 0; // TODO: void with proper error handling +} + +template <typename E> +int to_montgomery(E* d_inout, unsigned n, cudaStream_t stream) { + return convert_montgomery(d_inout, n, true, stream); +} + +template <typename E> +int from_montgomery(E* d_inout, unsigned n, cudaStream_t stream){ + return convert_montgomery(d_inout, n, false, stream); +} \ No newline at end of file
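A minimal host-side usage sketch of the to_montgomery/from_montgomery helpers above, assuming a device array of BN254 scalars (scalar_t with montgomery_r()/montgomery_r_inv(), as used elsewhere in this patch), the default stream, and an illustrative buffer size; error handling is omitted:

    // Round-trip a device buffer through Montgomery form.
    scalar_t *d_scalars;
    size_t n = 1 << 12;                                // illustrative size
    cudaMalloc(&d_scalars, n * sizeof(scalar_t));
    // ... fill d_scalars (e.g. copied from host or produced by a kernel) ...
    to_montgomery(d_scalars, n, 0);    // each element multiplied by R (montgomery_r)
    from_montgomery(d_scalars, n, 0);  // each element multiplied by R^-1 (montgomery_r_inv)
    cudaFree(d_scalars);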
diff --git a/src/test_bn254.rs b/src/test_bn254.rs index ef86e3156..7cce80e7e 100644 --- a/src/test_bn254.rs +++ b/src/test_bn254.rs @@ -73,6 +73,16 @@ extern "C" { device_id: usize ) -> c_int; + fn ntt_inplace_coset_batch_cuda_bn254( + d_inout: DevicePointer<ScalarField_BN254>, + d_twiddles: DevicePointer<ScalarField_BN254>, + n: usize, + batch_size: usize, + inverse: bool, + is_coset: bool, + d_coset: DevicePointer<ScalarField_BN254>, + device_id: usize) -> c_int; + fn interpolate_scalars_cuda_bn254( d_out: DevicePointer<ScalarField_BN254>, d_evaluations: DevicePointer<ScalarField_BN254>, @@ -651,6 +661,29 @@ pub fn evaluate_scalars_on_coset_batch_bn254( return res; } +//extern "C" int ntt_inplace_coset_batch_cuda_bn254(BN254::scalar_t* d_inout, BN254::scalar_t* d_twiddles, +// unsigned n, unsigned batch_size, bool inverse, bool is_coset, BN254::scalar_t* coset, size_t device_id = 0, cudaStream_t stream = 0) +pub fn ntt_inplace_coset_batch_bn254( + d_inout: &mut DeviceBuffer<ScalarField_BN254>, + d_twiddles: &mut DeviceBuffer<ScalarField_BN254>, + batch_size: usize, + inverse: bool, + d_coset: &mut DeviceBuffer<ScalarField_BN254>, +) -> i32 { + unsafe { + ntt_inplace_coset_batch_cuda_bn254( + d_inout.as_device_ptr(), + d_twiddles.as_device_ptr(), + d_twiddles.len(), + batch_size, + inverse, + d_coset.len() > 0, + d_coset.as_device_ptr(), + 0 + ) + } +} + pub fn evaluate_points_on_coset_bn254( d_coefficients: &mut DeviceBuffer<Point_BN254>, d_domain: &mut DeviceBuffer<ScalarField_BN254>, @@ -786,6 +819,22 @@ pub fn generate_random_points_bn254( .collect() } +pub fn generate_random_points100_bn254( + count: usize, + mut rng: Box<dyn RngCore>, +) -> Vec<PointAffineNoInfinity_BN254> { + let mut res = Vec::new(); + for i in 0..count{ + if (i<100) { + res.push(Point_BN254::from_ark(G1Projective_BN254::rand(&mut rng)).to_xy_strip_z()); + } + else { + res.push(res[i-100]); + } + } + return res; +} + pub fn generate_random_points_proj_bn254(count: usize, mut rng: Box<dyn RngCore>) -> Vec<Point_BN254> { (0..count) .map(|_| Point_BN254::from_ark(G1Projective_BN254::rand(&mut rng))) @@ -898,12 +947,13 @@ pub(crate) mod tests_bn254 { #[test] fn test_msm() { - let test_sizes = [6, 9]; + let test_sizes = [24]; for pow2 in test_sizes { let count = 1 << pow2; let seed = None; // set Some to provide seed - let points = generate_random_points_bn254(count, get_rng_bn254(seed)); + // let points = generate_random_points_bn254(count, get_rng_bn254(seed)); + let points = generate_random_points100_bn254(count, get_rng_bn254(seed)); let scalars = generate_random_scalars_bn254(count, get_rng_bn254(seed)); let msm_result = msm_bn254(&points, &scalars, 0); @@ -1413,13 +1463,19 @@ pub(crate) mod tests_bn254 { let (_, _, mut d_large_domain) = set_up_scalars_bn254(0, log_test_size + 1, false); let mut d_coset_powers = build_domain_bn254(test_size, log_test_size + 1, false); + println!("d_coset_powers len {}", d_coset_powers.len()); + let mut d_evals_large = evaluate_scalars_batch_bn254(&mut d_coeffs, &mut d_large_domain, batch_size); let mut h_evals_large: Vec<ScalarField_BN254> = (0..2 * test_size * batch_size).map(|_| ScalarField_BN254::zero()).collect(); d_evals_large.copy_to(&mut h_evals_large[..]).unwrap(); let mut d_evals = evaluate_scalars_batch_bn254(&mut d_coeffs, &mut d_domain, batch_size); let mut h_evals: Vec<ScalarField_BN254> =
(0..test_size * batch_size).map(|_| ScalarField_BN254::zero()).collect(); d_evals.copy_to(&mut h_evals[..]).unwrap(); - let mut d_evals_coset = evaluate_scalars_on_coset_batch_bn254(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers); + + // let mut d_evals_coset = evaluate_scalars_on_coset_batch_bn254(&mut d_coeffs, &mut d_domain, batch_size, &mut d_coset_powers); + ntt_inplace_coset_batch_bn254(&mut d_coeffs, &mut d_domain, batch_size, false, &mut d_coset_powers); + let d_evals_coset = d_coeffs; + + let mut h_evals_coset: Vec<ScalarField_BN254> = (0..test_size * batch_size).map(|_| ScalarField_BN254::zero()).collect(); d_evals_coset.copy_to(&mut h_evals_coset[..]).unwrap(); @@ -1499,7 +1555,7 @@ pub(crate) mod tests_bn254 { assert_eq!(intoo, expected); } - #[test] + //#[test] #[allow(non_snake_case)] fn test_vec_point_mul() { let dummy_one = Point_BN254 {